Statistics
| Branch: | Revision:

iof-bird-daemon / proto / bgp / packets.c @ 62e64905

History | View | Annotate | Download (49 KB)

1
/*
2
 *        BIRD -- BGP Packet Processing
3
 *
4
 *        (c) 2000 Martin Mares <mj@ucw.cz>
5
 *        (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6
 *        (c) 2008--2016 CZ.NIC z.s.p.o.
7
 *
8
 *        Can be freely distributed and used under the terms of the GNU GPL.
9
 */
10

    
11
#undef LOCAL_DEBUG
12

    
13
#include <stdlib.h>
14

    
15
#include "nest/bird.h"
16
#include "nest/iface.h"
17
#include "nest/protocol.h"
18
#include "nest/route.h"
19
#include "nest/attrs.h"
20
#include "nest/mrtdump.h"
21
#include "conf/conf.h"
22
#include "lib/unaligned.h"
23
#include "lib/socket.h"
24

    
25
#include "nest/cli.h"
26

    
27
#include "bgp.h"
28

    
29

    
30
#define BGP_RR_REQUEST                0
31
#define BGP_RR_BEGIN                1
32
#define BGP_RR_END                2
33

    
34

    
35
static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
36
static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
37

    
38
/* Table for state -> RFC 6608 FSM error subcodes */
39
static byte fsm_err_subcode[BS_MAX] = {
40
  [BS_OPENSENT] = 1,
41
  [BS_OPENCONFIRM] = 2,
42
  [BS_ESTABLISHED] = 3
43
};
44

    
45

    
46
static struct bgp_channel *
47
bgp_get_channel(struct bgp_proto *p, u32 afi)
48
{
49
  uint i;
50

    
51
  for (i = 0; i < p->channel_count; i++)
52
    if (p->afi_map[i] == afi)
53
      return p->channel_map[i];
54

    
55
  return NULL;
56
}
57

    
58
static inline void
59
put_af3(byte *buf, u32 id)
60
{
61
  put_u16(buf, id >> 16);
62
  buf[2] = id & 0xff;
63
}
64

    
65
static inline void
66
put_af4(byte *buf, u32 id)
67
{
68
  put_u16(buf, id >> 16);
69
  buf[2] = 0;
70
  buf[3] = id & 0xff;
71
}
72

    
73
static inline u32
74
get_af3(byte *buf)
75
{
76
  return (get_u16(buf) << 16) | buf[2];
77
}
78

    
79
static inline u32
80
get_af4(byte *buf)
81
{
82
  return (get_u16(buf) << 16) | buf[3];
83
}
84

    
85
/*
86
 * MRT Dump format is not semantically specified.
87
 * We will use these values in appropriate fields:
88
 *
89
 * Local AS, Remote AS - configured AS numbers for given BGP instance.
90
 * Local IP, Remote IP - IP addresses of the TCP connection (0 if no connection)
91
 *
92
 * We dump two kinds of MRT messages: STATE_CHANGE (for BGP state
93
 * changes) and MESSAGE (for received BGP messages).
94
 *
95
 * STATE_CHANGE uses always AS4 variant, but MESSAGE uses AS4 variant
96
 * only when AS4 session is established and even in that case MESSAGE
97
 * does not use AS4 variant for initial OPEN message. This strange
98
 * behavior is here for compatibility with Quagga and Bgpdump.
99
 */
100

    
101
/*
 * Write the common BGP4MP header fields to @buf and return the position
 * just behind them.
 *
 * Layout written here: remote AS and local (public) AS, either as two 32-bit
 * values (@as4 nonzero) or as two 16-bit values where numbers above 0xFFFF
 * are replaced by AS_TRANS; interface index of the neighbor (0 if unknown);
 * address family; socket destination address and socket source address
 * (zero addresses when the connection has no socket). The address family is
 * chosen by the configured remote IP address.
 */
static byte *
mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4)
{
  struct bgp_proto *p = conn->bgp;
  uint v4 = ipa_is_ip4(p->cf->remote_ip);

  /* AS numbers */
  if (!as4)
  {
    put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS);
    put_u16(buf+2, (p->public_as <= 0xFFFF) ? p->public_as : AS_TRANS);
    buf += 4;
  }
  else
  {
    put_u32(buf+0, p->remote_as);
    put_u32(buf+4, p->public_as);
    buf += 8;
  }

  /* Interface index and address family */
  put_u16(buf+0, (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0);
  put_u16(buf+2, v4 ? BGP_AFI_IPV4 : BGP_AFI_IPV6);
  buf += 4;

  /* Addresses of the TCP connection, destination first */
  if (!v4)
  {
    ip6_addr dst = conn->sk ? ipa_to_ip6(conn->sk->daddr) : IP6_NONE;
    ip6_addr src = conn->sk ? ipa_to_ip6(conn->sk->saddr) : IP6_NONE;

    buf = put_ip6(buf, dst);
    buf = put_ip6(buf, src);
  }
  else
  {
    ip4_addr dst = conn->sk ? ipa_to_ip4(conn->sk->daddr) : IP4_NONE;
    ip4_addr src = conn->sk ? ipa_to_ip4(conn->sk->saddr) : IP4_NONE;

    buf = put_ip4(buf, dst);
    buf = put_ip4(buf, src);
  }

  return buf;
}
137

    
138
static void
139
mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, uint len)
140
{
141
  byte *buf = alloca(128+len);        /* 128 is enough for MRT headers */
142
  byte *bp = buf + MRTDUMP_HDR_LENGTH;
143
  int as4 = conn->bgp->as4_session;
144

    
145
  bp = mrt_put_bgp4_hdr(bp, conn, as4);
146
  memcpy(bp, pkt, len);
147
  bp += len;
148
  mrt_dump_message(&conn->bgp->p, BGP4MP, as4 ? BGP4MP_MESSAGE_AS4 : BGP4MP_MESSAGE,
149
                   buf, bp-buf);
150
}
151

    
152
static inline u16
153
convert_state(uint state)
154
{
155
  /* Convert state from our BS_* values to values used in MRTDump */
156
  return (state == BS_CLOSE) ? 1 : state + 1;
157
}
158

    
159
void
160
mrt_dump_bgp_state_change(struct bgp_conn *conn, uint old, uint new)
161
{
162
  byte buf[128];
163
  byte *bp = buf + MRTDUMP_HDR_LENGTH;
164

    
165
  bp = mrt_put_bgp4_hdr(bp, conn, 1);
166
  put_u16(bp+0, convert_state(old));
167
  put_u16(bp+2, convert_state(new));
168
  bp += 4;
169
  mrt_dump_message(&conn->bgp->p, BGP4MP, BGP4MP_STATE_CHANGE_AS4, buf, bp-buf);
170
}
171

    
172
/*
 * Write the body of a NOTIFICATION message to @buf: error code, error
 * subcode and conn->notify_size bytes of conn->notify_data. Returns the
 * position just behind the written data.
 */
static byte *
bgp_create_notification(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);
  buf[0] = conn->notify_code;
  buf[1] = conn->notify_subcode;

  /*
   * Errors are commonly raised without any data (callers in this file pass
   * NULL with zero length to bgp_error()), so notify_data is presumably NULL
   * in that case. Calling memcpy() with a NULL source is undefined even for
   * zero length, therefore skip the copy when there is nothing to copy.
   */
  if (conn->notify_size)
    memcpy(buf+2, conn->notify_data, conn->notify_size);

  return buf + 2 + conn->notify_size;
}
183

    
184

    
185
/* Capability negotiation as per RFC 5492 */
186

    
187
/*
 * Iterate @ac over the first af_count entries of @caps->af_data.
 * Arguments are parenthesized so that arbitrary expressions can be passed.
 */
#define WALK_AF_CAPS(caps,ac) \
  for ((ac) = (caps)->af_data; (ac) < &(caps)->af_data[(caps)->af_count]; (ac)++)
189

    
190
const struct bgp_af_caps *
191
bgp_find_af_caps(struct bgp_caps *caps, u32 afi)
192
{
193
  struct bgp_af_caps *ac;
194

    
195
  WALK_AF_CAPS(caps, ac)
196
    if (ac->afi == afi)
197
      return ac;
198

    
199
  return NULL;
200
}
201

    
202
static struct bgp_af_caps *
203
bgp_get_af_caps(struct bgp_caps *caps, u32 afi)
204
{
205
  struct bgp_af_caps *ac;
206

    
207
  WALK_AF_CAPS(caps, ac)
208
    if (ac->afi == afi)
209
      return ac;
210

    
211
  ac = &caps->af_data[caps->af_count++];
212
  memset(ac, 0, sizeof(struct bgp_af_caps));
213
  ac->afi = afi;
214

    
215
  return ac;
216
}
217

    
218
static int
219
bgp_af_caps_cmp(const void *X, const void *Y)
220
{
221
  const struct bgp_af_caps *x = X, *y = Y;
222
  return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0;
223
}
224

    
225

    
226
/*
 * Build the local capability set of @conn and serialize it to @buf.
 *
 * A zeroed struct bgp_caps with one per-AF slot for every channel of the
 * protocol is allocated from the protocol pool, filled from the
 * configuration and stored in conn->local_caps. Then the capabilities are
 * written to @buf as a sequence of (code, length, data) records. Returns the
 * position in @buf just behind the last written byte.
 */
static byte *
bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  struct bgp_channel *c;
  struct bgp_af_caps *ac;
  uint any_add_path = 0;
  byte *data;

  /* Prepare bgp_caps structure */

  int n = list_length(&p->p.channels);
  struct bgp_caps *caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps));
  conn->local_caps = caps;

  caps->as4_support = p->cf->enable_as4;
  if (caps->as4_support)
    caps->as4_number = p->public_as;

  caps->ext_messages = p->cf->enable_extended_messages;

  /* Both refresh variants are controlled by the same option */
  caps->route_refresh = p->cf->enable_refresh;
  caps->enhanced_refresh = p->cf->enable_refresh;

  if (p->cf->gr_mode)
  {
    caps->gr_aware = 1;
    caps->gr_time = p->cf->gr_time;
    caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
  }

  /* Allocate and fill per-AF fields */
  WALK_LIST(c, p->p.channels)
  {
    ac = &caps->af_data[caps->af_count];
    caps->af_count++;

    ac->afi = c->afi;
    ac->ready = 1;

    ac->add_path = c->cf->add_path;
    any_add_path |= ac->add_path;

    if (!c->cf->gr_able)
      continue;

    ac->gr_able = 1;

    if (p->p.gr_recovery)
      ac->gr_af_flags |= BGP_GRF_FORWARDING;
  }

  /* Sort capability fields by AFI/SAFI */
  qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp);


  /* Create capability list in buffer */

  /* Capability 1: Multiprotocol extensions, one record per AF */
  WALK_AF_CAPS(caps, ac)
  {
    if (!ac->ready)
      continue;

    buf[0] = 1;
    buf[1] = 4;			/* Capability data length */
    put_af4(buf + 2, ac->afi);
    buf += 6;
  }

  /* Capability 2: Support for route refresh */
  if (caps->route_refresh)
  {
    buf[0] = 2;
    buf[1] = 0;			/* Capability data length */
    buf += 2;
  }

  /* Capability 6: Support for extended messages */
  if (caps->ext_messages)
  {
    buf[0] = 6;
    buf[1] = 0;			/* Capability data length */
    buf += 2;
  }

  /* Capability 64: Support for graceful restart */
  if (caps->gr_aware)
  {
    buf[0] = 64;
    buf[1] = 0;			/* Capability data length, will be fixed later */
    buf += 2;
    data = buf;

    /* Restart time, with the restart flags merged into the first byte */
    put_u16(buf, caps->gr_time);
    buf[0] |= caps->gr_flags;
    buf += 2;

    WALK_AF_CAPS(caps, ac)
    {
      if (!ac->gr_able)
	continue;

      put_af3(buf, ac->afi);
      buf[3] = ac->gr_af_flags;
      buf += 4;
    }

    /* Fix the length byte just before the data */
    data[-1] = buf - data;
  }

  /* Capability 65: Support for 4-octet AS number */
  if (caps->as4_support)
  {
    buf[0] = 65;
    buf[1] = 4;			/* Capability data length */
    put_u32(buf + 2, p->public_as);
    buf += 6;
  }

  /* Capability 69: Support for ADD-PATH */
  if (any_add_path)
  {
    buf[0] = 69;
    buf[1] = 0;			/* Capability data length, will be fixed later */
    buf += 2;
    data = buf;

    WALK_AF_CAPS(caps, ac)
    {
      if (!ac->add_path)
	continue;

      put_af3(buf, ac->afi);
      buf[3] = ac->add_path;
      buf += 4;
    }

    /* Fix the length byte just before the data */
    data[-1] = buf - data;
  }

  /* Capability 70: Support for enhanced route refresh */
  if (caps->enhanced_refresh)
  {
    buf[0] = 70;
    buf[1] = 0;			/* Capability data length */
    buf += 2;
  }

  /* FIXME: Should not XXXX 255 */

  return buf;
}
359

    
360
static void
361
bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len)
362
{
363
  struct bgp_proto *p = conn->bgp;
364
  struct bgp_af_caps *ac;
365
  int i, cl;
366
  u32 af;
367

    
368
  while (len > 0)
369
  {
370
    if (len < 2 || len < (2 + pos[1]))
371
      goto err;
372

    
373
    /* Capability length */
374
    cl = pos[1];
375

    
376
    /* Capability type */
377
    switch (pos[0])
378
    {
379
    case  1: /* Multiprotocol capability, RFC 4760 */
380
      if (cl != 4)
381
        goto err;
382

    
383
      af = get_af4(pos+2);
384
      ac = bgp_get_af_caps(caps, af);
385
      ac->ready = 1;
386
      break;
387

    
388
    case  2: /* Route refresh capability, RFC 2918 */
389
      if (cl != 0)
390
        goto err;
391

    
392
      caps->route_refresh = 1;
393
      break;
394

    
395
    case  6: /* Extended message length capability, RFC draft */
396
      if (cl != 0)
397
        goto err;
398

    
399
      caps->ext_messages = 1;
400
      break;
401

    
402
    case 64: /* Graceful restart capability, RFC 4724 */
403
      if (cl % 4 != 2)
404
        goto err;
405

    
406
      /* Only the last instance is valid */
407
      WALK_AF_CAPS(caps, ac)
408
      {
409
        ac->gr_able = 0;
410
        ac->gr_af_flags = 0;
411
      }
412

    
413
      caps->gr_aware = 1;
414
      caps->gr_flags = pos[2] & 0xf0;
415
      caps->gr_time = get_u16(pos + 2) & 0x0fff;
416

    
417
      for (i = 2; i < cl; i += 4)
418
      {
419
        af = get_af3(pos+2+i);
420
        ac = bgp_get_af_caps(caps, af);
421
        ac->gr_able = 1;
422
        ac->gr_af_flags = pos[2+i+3];
423
      }
424
      break;
425

    
426
    case 65: /* AS4 capability, RFC 4893 */
427
      if (cl != 4)
428
        goto err;
429

    
430
      caps->as4_support = 1;
431
      caps->as4_number = get_u32(pos + 2);
432
      break;
433

    
434
    case 69: /* ADD-PATH capability, RFC 7911 */
435
      if (cl % 4)
436
        goto err;
437

    
438
      for (i = 0; i < cl; i += 4)
439
      {
440
        byte val = pos[2+i+3];
441
        if (!val || (val > BGP_ADD_PATH_FULL))
442
        {
443
          log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring",
444
              p->p.name, val);
445
          break;
446
        }
447
      }
448

    
449
      for (i = 0; i < cl; i += 4)
450
      {
451
        af = get_af3(pos+2+i);
452
        ac = bgp_get_af_caps(caps, af);
453
        ac->add_path = pos[2+i+3];
454
      }
455
      break;
456

    
457
    case 70: /* Enhanced route refresh capability, RFC 7313 */
458
      if (cl != 0)
459
        goto err;
460

    
461
      caps->enhanced_refresh = 1;
462
      break;
463

    
464
      /* We can safely ignore all other capabilities */
465
    }
466

    
467
    ADVANCE(pos, len, 2 + cl);
468
  }
469
  return;
470

    
471
err:
472
  bgp_error(conn, 2, 0, NULL, 0);
473
  return;
474
}
475

    
476
static int
477
bgp_read_options(struct bgp_conn *conn, byte *pos, int len)
478
{
479
  struct bgp_proto *p = conn->bgp;
480
  struct bgp_caps *caps;
481
  int ol;
482

    
483
  /* Max number of announced AFIs is limited by max option length (255) */
484
  caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps));
485
  memset(caps, 0, sizeof(struct bgp_caps));
486

    
487
  while (len > 0)
488
  {
489
    if ((len < 2) || (len < (2 + pos[1])))
490
    { bgp_error(conn, 2, 0, NULL, 0); return -1; }
491

    
492
    ol = pos[1];
493
    if (pos[0] == 2)
494
    {
495
      /* BGP capabilities, RFC 5492 */
496
      if (p->cf->capabilities)
497
        bgp_read_capabilities(conn, caps, pos + 2, ol);
498
    }
499
    else
500
    {
501
      /* Unknown option */
502
      bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */
503
      return -1;
504
    }
505

    
506
    ADVANCE(pos, len, 2 + ol);
507
  }
508

    
509
  uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps);
510
  conn->remote_caps = mb_allocz(p->p.pool, n);
511
  memcpy(conn->remote_caps, caps, n);
512

    
513
  return 0;
514
}
515

    
516
/*
 * Write the body of an OPEN message to @buf and return the position just
 * behind it.
 *
 * Fixed part: version, 16-bit AS number (AS_TRANS when the public AS does
 * not pass the range test below), hold time and local router ID. When
 * capabilities are enabled, one optional parameter of type 2 carrying the
 * capability list follows; otherwise the optional parameter length is zero.
 * In both cases conn->local_caps is set up.
 */
static byte *
bgp_create_open(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
	    BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id);

  buf[0] = BGP_VERSION;
  /*
   * NOTE(review): the test is strict (< 0xFFFF), unlike the <= 0xFFFF used
   * in mrt_put_bgp4_hdr(), so AS 65535 is sent as AS_TRANS - confirm this is
   * intended.
   */
  put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS);
  put_u16(buf+3, p->cf->hold_time);
  put_u32(buf+5, p->local_id);

  if (!p->cf->capabilities)
  {
    /* Prepare empty local_caps */
    conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));

    buf[9] = 0;			/* No optional parameters */
    return buf + 10;
  }

  /* Prepare local_caps and write capabilities to buffer */
  byte *caps_start = buf + 12;
  byte *caps_end = bgp_write_capabilities(conn, caps_start);
  uint caps_len = caps_end - caps_start;

  buf[9] = caps_len + 2;	/* Optional parameters length */
  buf[10] = 2;			/* Option 2: Capability list */
  buf[11] = caps_len;		/* Option data length */

  return caps_end;
}
552

    
553
static void
554
bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
555
{
556
  struct bgp_proto *p = conn->bgp;
557
  struct bgp_conn *other;
558
  u32 asn, hold, id;
559

    
560
  /* Check state */
561
  if (conn->state != BS_OPENSENT)
562
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
563

    
564
  /* Check message contents */
565
  if (len < 29 || len != 29 + (uint) pkt[28])
566
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
567

    
568
  if (pkt[19] != BGP_VERSION)
569
  { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; }
570

    
571
  asn = get_u16(pkt+20);
572
  hold = get_u16(pkt+22);
573
  id = get_u32(pkt+24);
574
  BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id);
575

    
576
  if (bgp_read_options(conn, pkt+29, pkt[28]) < 0)
577
    return;
578

    
579
  if (hold > 0 && hold < 3)
580
  { bgp_error(conn, 2, 6, pkt+22, 2); return; }
581

    
582
  /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
583
  if (!id || (p->is_internal && id == p->local_id))
584
  { bgp_error(conn, 2, 3, pkt+24, -4); return; }
585

    
586
  struct bgp_caps *caps = conn->remote_caps;
587

    
588
  if (caps->as4_support)
589
  {
590
    u32 as4 = caps->as4_number;
591

    
592
    if ((as4 != asn) && (asn != AS_TRANS))
593
      log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
594

    
595
    if (as4 != p->remote_as)
596
    { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; }
597
  }
598
  else
599
  {
600
    if (asn != p->remote_as)
601
    { bgp_error(conn, 2, 2, pkt+20, 2); return; }
602
  }
603

    
604
  /* Check the other connection */
605
  other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
606
  switch (other->state)
607
  {
608
  case BS_CONNECT:
609
  case BS_ACTIVE:
610
    /* Stop outgoing connection attempts */
611
    bgp_conn_enter_idle_state(other);
612
    break;
613

    
614
  case BS_IDLE:
615
  case BS_OPENSENT:
616
  case BS_CLOSE:
617
    break;
618

    
619
  case BS_OPENCONFIRM:
620
    /*
621
     * Description of collision detection rules in RFC 4271 is confusing and
622
     * contradictory, but it is essentially:
623
     *
624
     * 1. Router with higher ID is dominant
625
     * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
626
     * 3. When both connections are in OpenConfirm state, one initiated by
627
     *    the dominant router is kept.
628
     *
629
     * The first line in the expression below evaluates whether the neighbor
630
     * is dominant, the second line whether the new connection was initiated
631
     * by the neighbor. If both are true (or both are false), we keep the new
632
     * connection, otherwise we keep the old one.
633
     */
634
    if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as)))
635
        == (conn == &p->incoming_conn))
636
    {
637
      /* Should close the other connection */
638
      BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
639
      bgp_error(other, 6, 7, NULL, 0);
640
      break;
641
    }
642
    /* Fall thru */
643
  case BS_ESTABLISHED:
644
    /* Should close this connection */
645
    BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
646
    bgp_error(conn, 6, 7, NULL, 0);
647
    return;
648

    
649
  default:
650
    bug("bgp_rx_open: Unknown state");
651
  }
652

    
653
  /* Update our local variables */
654
  conn->hold_time = MIN(hold, p->cf->hold_time);
655
  conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
656
  conn->as4_session = conn->local_caps->as4_support && caps->as4_support;
657
  conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages;
658
  p->remote_id = id;
659

    
660
  DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
661
      conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session);
662

    
663
  bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
664
  bgp_start_timer(conn->hold_timer, conn->hold_time);
665
  bgp_conn_enter_openconfirm_state(conn);
666
}
667

    
668

    
669
/*
670
 *        Next hop handling
671
 */
672

    
673
/*
 * Helpers for next hop processing. Both macros expect a variable 's' in
 * scope that has 'proto' and (for WITHDRAW) 'err_withdraw' members.
 *
 * REPORT() logs a message prefixed by the protocol name. WITHDRAW() does the
 * same, sets s->err_withdraw and returns from the calling function, so it
 * can be used only in functions returning void.
 */
#define REPORT(msg, args...) \
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })

#define WITHDRAW(msg, args...) \
  ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })

/* Messages used with the macros above */
#define BAD_NEXT_HOP	"Invalid NEXT_HOP attribute"
#define NO_NEXT_HOP	"Missing NEXT_HOP attribute"
681

    
682

    
683
static void
684
bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
685
{
686
  struct bgp_proto *p = s->proto;
687
  struct bgp_channel *c = s->channel;
688

    
689
  if (c->cf->gw_mode == GW_DIRECT)
690
  {
691
    neighbor *nbr = NULL;
692

    
693
    /* GW_DIRECT -> single_hop -> p->neigh != NULL */
694
    if (ipa_nonzero(gw))
695
      nbr = neigh_find2(&p->p, &gw, NULL, 0);
696
    else if (ipa_nonzero(ll))
697
      nbr = neigh_find2(&p->p, &ll, p->neigh->iface, 0);
698

    
699
    if (!nbr || (nbr->scope == SCOPE_HOST))
700
      WITHDRAW(BAD_NEXT_HOP);
701

    
702
    a->dest = RTD_UNICAST;
703
    a->nh = (struct nexthop){ .gw = nbr->addr, .iface = nbr->iface };
704
    a->hostentry = NULL;
705
    a->igp_metric = 0;
706
  }
707
  else /* GW_RECURSIVE */
708
  {
709
    if (ipa_zero(gw))
710
      WITHDRAW(BAD_NEXT_HOP);
711

    
712
    rta_set_recursive_next_hop(c->c.table, a, c->igp_table, gw, ll);
713
  }
714
}
715

    
716
static inline int
717
bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
718
{
719
  struct bgp_proto *p = s->proto;
720
  ip_addr *nh = (void *) a->u.ptr->data;
721

    
722
  if (s->channel->cf->next_hop_self)
723
    return 0;
724

    
725
  if (s->channel->cf->next_hop_keep)
726
    return 1;
727

    
728
  /* Keep it when explicitly set in export filter */
729
  if (a->type & EAF_FRESH)
730
    return 1;
731

    
732
  /* Keep it when exported to internal peers */
733
  if (p->is_interior && ipa_nonzero(*nh))
734
    return 1;
735

    
736
  /* Keep it when forwarded between single-hop BGPs on the same iface */
737
  struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL;
738
  return p->neigh && (p->neigh->iface == ifa);
739
}
740

    
741
static inline int
742
bgp_use_gateway(struct bgp_export_state *s)
743
{
744
  struct bgp_proto *p = s->proto;
745
  rta *ra = s->route->attrs;
746

    
747
  if (s->channel->cf->next_hop_self)
748
    return 0;
749

    
750
  /* We need one valid global gateway */
751
  if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw))
752
    return 0;
753

    
754
  /* Use it when exported to internal peers */
755
  if (p->is_interior)
756
    return 1;
757

    
758
  /* Use it when forwarded to single-hop BGP peer on on the same iface */
759
  return p->neigh && (p->neigh->iface == ra->nh.iface);
760
}
761

    
762
static void
763
bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
764
{
765
  if (!a || !bgp_use_next_hop(s, a))
766
  {
767
    if (bgp_use_gateway(s))
768
    {
769
      ip_addr nh[1] = { s->route->attrs->nh.gw };
770
      bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
771
    }
772
    else
773
    {
774
      ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
775
      bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
776
    }
777
  }
778

    
779
  /* Check if next hop is valid */
780
  a = bgp_find_attr(*to, BA_NEXT_HOP);
781
  if (!a)
782
    WITHDRAW(NO_NEXT_HOP);
783

    
784
  ip_addr *nh = (void *) a->u.ptr->data;
785
  ip_addr peer = s->proto->cf->remote_ip;
786
  uint len = a->u.ptr->length;
787

    
788
  if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1])))
789
    WITHDRAW(BAD_NEXT_HOP);
790

    
791
  if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
792
    WITHDRAW(BAD_NEXT_HOP);
793
}
794

    
795

    
796
/*
797
 *        UPDATE
798
 */
799

    
800
static void
801
bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
802
{
803
  if (path_id != s->last_id)
804
  {
805
    s->last_src = rt_get_source(&s->proto->p, path_id);
806
    s->last_id = path_id;
807

    
808
    rta_free(s->cached_rta);
809
    s->cached_rta = NULL;
810
  }
811

    
812
  if (!a0)
813
  {
814
    /* Route withdraw */
815
    rte_update2(&s->channel->c, n, NULL, s->last_src);
816
    return;
817
  }
818

    
819
  /* Prepare cached route attributes */
820
  if (s->cached_rta == NULL)
821
  {
822
    a0->src = s->last_src;
823

    
824
    /* Workaround for rta_lookup() breaking eattrs */
825
    ea_list *ea = a0->eattrs;
826
    s->cached_rta = rta_lookup(a0);
827
    a0->eattrs = ea;
828
  }
829

    
830
  rta *a = rta_clone(s->cached_rta);
831
  rte *e = rte_get_temp(a);
832

    
833
  e->pflags = 0;
834
  e->u.bgp.suppressed = 0;
835
  rte_update2(&s->channel->c, n, e, s->last_src);
836
}
837

    
838

    
839

    
840
static uint
841
bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
842
{
843
  byte *pos = buf;
844

    
845
  while (!EMPTY_LIST(buck->prefixes) && (size >= (5 + sizeof(ip4_addr))))
846
  {
847
    struct bgp_prefix *px = HEAD(buck->prefixes);
848
    struct net_addr_ip4 *net = (void *) px->net;
849

    
850
    /* Encode path ID */
851
    if (s->add_path)
852
    {
853
      put_u32(pos, px->path_id);
854
      ADVANCE(pos, size, 4);
855
    }
856

    
857
    ip4_addr a = ip4_hton(net->prefix);
858
    uint b = (net->pxlen + 7) / 8;
859

    
860
    /* Encode prefix length */
861
    *pos = net->pxlen;
862
    ADVANCE(pos, size, 1);
863

    
864
    /* Encode prefix body */
865
    memcpy(pos, &a, b);
866
    ADVANCE(pos, size, b);
867

    
868
    bgp_free_prefix(s->channel, px);
869
  }
870

    
871
  return pos - buf;
872
}
873

    
874
static void
875
bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
876
{
877
  while (len)
878
  {
879
    net_addr_ip4 net;
880
    u32 path_id = 0;
881

    
882
    /* Decode path ID */
883
    if (s->add_path)
884
    {
885
      if (len < 5)
886
        bgp_parse_error(s, 1);
887

    
888
      path_id = get_u32(pos);
889
      ADVANCE(pos, len, 4);
890
    }
891

    
892
    /* Decode prefix length */
893
    uint l = *pos;
894
    uint b = (l + 7) / 8;
895
    ADVANCE(pos, len, 1);
896

    
897
    if (l > IP4_MAX_PREFIX_LENGTH)
898
      bgp_parse_error(s, 10);
899

    
900
    if (len < b)
901
      bgp_parse_error(s, 1);
902

    
903
    /* Decode prefix body */
904
    ip4_addr addr = IP4_NONE;
905
    memcpy(&addr, pos, b);
906
    ADVANCE(pos, len, b);
907

    
908
    net = NET_ADDR_IP4(ip4_ntoh(addr), l);
909
    net_normalize_ip4(&net);
910

    
911
    // XXXX validate prefix
912

    
913
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
914
  }
915
}
916

    
917
static uint
918
bgp_encode_next_hop_ip4(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size UNUSED)
919
{
920
  /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
921

    
922
  ASSERT(a->u.ptr->length == sizeof(ip_addr));
923

    
924
  put_ip4(buf, ipa_to_ip4( *(ip_addr *) a->u.ptr->data ));
925

    
926
  return 4;
927
}
928

    
929
static void
930
bgp_decode_next_hop_ip4(struct bgp_parse_state *s, byte *data, uint len, rta *a)
931
{
932
  if (len != 4)
933
    bgp_parse_error(s, 9);
934

    
935
  ip_addr nh = ipa_from_ip4(get_ip4(data));
936

    
937
  // XXXX validate next hop
938

    
939
  bgp_set_attr_data(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, &nh, sizeof(nh));
940
  bgp_apply_next_hop(s, a, nh, IPA_NONE);
941
}
942

    
943

    
944
static uint
945
bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
946
{
947
  byte *pos = buf;
948

    
949
  while (!EMPTY_LIST(buck->prefixes) && (size >= (5 + sizeof(ip6_addr))))
950
  {
951
    struct bgp_prefix *px = HEAD(buck->prefixes);
952
    struct net_addr_ip6 *net = (void *) px->net;
953

    
954
    /* Encode path ID */
955
    if (s->add_path)
956
    {
957
      put_u32(pos, px->path_id);
958
      ADVANCE(pos, size, 4);
959
    }
960

    
961
    ip6_addr a = ip6_hton(net->prefix);
962
    uint b = (net->pxlen + 7) / 8;
963

    
964
    /* Encode prefix length */
965
    *pos = net->pxlen;
966
    ADVANCE(pos, size, 1);
967

    
968
    /* Encode prefix body */
969
    memcpy(pos, &a, b);
970
    ADVANCE(pos, size, b);
971

    
972
    bgp_free_prefix(s->channel, px);
973
  }
974

    
975
  return pos - buf;
976
}
977

    
978
static void
979
bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
980
{
981
  while (len)
982
  {
983
    net_addr_ip6 net;
984
    u32 path_id = 0;
985

    
986
    /* Decode path ID */
987
    if (s->add_path)
988
    {
989
      if (len < 5)
990
        bgp_parse_error(s, 1);
991

    
992
      path_id = get_u32(pos);
993
      ADVANCE(pos, len, 4);
994
    }
995

    
996
    /* Decode prefix length */
997
    uint l = *pos;
998
    uint b = (l + 7) / 8;
999
    ADVANCE(pos, len, 1);
1000

    
1001
    if (l > IP6_MAX_PREFIX_LENGTH)
1002
      bgp_parse_error(s, 10);
1003

    
1004
    if (len < b)
1005
      bgp_parse_error(s, 1);
1006

    
1007
    /* Decode prefix body */
1008
    ip6_addr addr = IP6_NONE;
1009
    memcpy(&addr, pos, b);
1010
    ADVANCE(pos, len, b);
1011

    
1012
    net = NET_ADDR_IP6(ip6_ntoh(addr), l);
1013
    net_normalize_ip6(&net);
1014

    
1015
    // XXXX validate prefix
1016

    
1017
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1018
  }
1019
}
1020

    
1021
static uint
1022
bgp_encode_next_hop_ip6(struct bgp_write_state *s UNUSED, eattr *a, byte *buf, uint size UNUSED)
1023
{
1024
  ip_addr *nh = (void *) a->u.ptr->data;
1025
  uint len = a->u.ptr->length;
1026

    
1027
  ASSERT((len == 16) || (len == 32));
1028

    
1029
  put_ip6(buf, ipa_to_ip6(nh[0]));
1030

    
1031
  if (len == 32)
1032
    put_ip6(buf+16, ipa_to_ip6(nh[1]));
1033

    
1034
  return len;
1035
}
1036

    
1037
static void
1038
bgp_decode_next_hop_ip6(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1039
{
1040
  struct adata *ad = lp_alloc_adata(s->pool, 32);
1041
  ip_addr *nh = (void *) ad->data;
1042

    
1043
  if ((len != 16) && (len != 32))
1044
    bgp_parse_error(s, 9);
1045

    
1046
  nh[0] = ipa_from_ip6(get_ip6(data));
1047
  nh[1] = (len == 32) ? ipa_from_ip6(get_ip6(data+16)) : IPA_NONE;
1048

    
1049
  if (ip6_is_link_local(nh[0]))
1050
  {
1051
    nh[1] = nh[0];
1052
    nh[0] = IPA_NONE;
1053
  }
1054

    
1055
  if (!ip6_is_link_local(nh[1]))
1056
    nh[1] = IPA_NONE;
1057

    
1058
  if (ipa_zero(nh[1]))
1059
    ad->length = 16;
1060

    
1061
  // XXXX validate next hop
1062

    
1063
  bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1064
  bgp_apply_next_hop(s, a, nh[0], nh[1]);
1065
}
1066

    
1067

    
1068
static const struct bgp_af_desc bgp_af_table[] = {
1069
  {
1070
    .afi = BGP_AF_IPV4,
1071
    .net = NET_IP4,
1072
    .name = "ipv4",
1073
    .encode_nlri = bgp_encode_nlri_ip4,
1074
    .decode_nlri = bgp_decode_nlri_ip4,
1075
    .encode_next_hop = bgp_encode_next_hop_ip4,
1076
    .decode_next_hop = bgp_decode_next_hop_ip4,
1077
    .update_next_hop = bgp_update_next_hop_ip,
1078
  },
1079
  {
1080
    .afi = BGP_AF_IPV4_MC,
1081
    .net = NET_IP4,
1082
    .name = "ipv4-mc",
1083
    .encode_nlri = bgp_encode_nlri_ip4,
1084
    .decode_nlri = bgp_decode_nlri_ip4,
1085
    .encode_next_hop = bgp_encode_next_hop_ip4,
1086
    .decode_next_hop = bgp_decode_next_hop_ip4,
1087
    .update_next_hop = bgp_update_next_hop_ip,
1088
  },
1089
  {
1090
    .afi = BGP_AF_IPV6,
1091
    .net = NET_IP6,
1092
    .name = "ipv6",
1093
    .encode_nlri = bgp_encode_nlri_ip6,
1094
    .decode_nlri = bgp_decode_nlri_ip6,
1095
    .encode_next_hop = bgp_encode_next_hop_ip6,
1096
    .decode_next_hop = bgp_decode_next_hop_ip6,
1097
    .update_next_hop = bgp_update_next_hop_ip,
1098
  },
1099
  {
1100
    .afi = BGP_AF_IPV6_MC,
1101
    .net = NET_IP6,
1102
    .name = "ipv6-mc",
1103
    .encode_nlri = bgp_encode_nlri_ip6,
1104
    .decode_nlri = bgp_decode_nlri_ip6,
1105
    .encode_next_hop = bgp_encode_next_hop_ip6,
1106
    .decode_next_hop = bgp_decode_next_hop_ip6,
1107
    .update_next_hop = bgp_update_next_hop_ip,
1108
  },
1109
};
1110

    
1111
const struct bgp_af_desc *
1112
bgp_get_af_desc(u32 afi)
1113
{
1114
  uint i;
1115
  for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++)
1116
    if (bgp_af_table[i].afi == afi)
1117
      return &bgp_af_table[i];
1118

    
1119
  return NULL;
1120
}
1121

    
1122
static inline uint
1123
bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
1124
{
1125
  return s->channel->desc->encode_nlri(s, buck, buf, end - buf);
1126
}
1127

    
1128
static inline uint
1129
bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf)
1130
{
1131
  return s->channel->desc->encode_next_hop(s, nh, buf, 255);
1132
}
1133

    
1134
void
1135
bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to)
1136
{
1137
  s->channel->desc->update_next_hop(s, a, to);
1138
}
1139

    
1140
/*
 * Limit for the encoded path attributes. The macro expands in terms of the
 * locals 'buf' and 'end' of the function using it: the distance between them
 * plus the BGP header length, with 1024 bytes held back (presumably to keep
 * room for the NLRI -- TODO confirm).
 */
#define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
1141

    
1142
/*
 * Build the body of a plain IPv4 UPDATE announcing prefixes of bucket @buck.
 * Returns the end of the written data, or NULL when the attributes could not
 * be encoded (the bucket is then handed to bgp_withdraw_bucket()).
 */
static byte *
bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   *        2 B        Withdrawn Routes Length (zero)
   *        ---        IPv4 Withdrawn Routes NLRI (unused)
   *        2 B        Total Path Attribute Length
   *        var        Path Attributes
   *        var        IPv4 Network Layer Reachability Information
   */

  byte *attrs = buf + 4;
  int attrs_len = bgp_encode_attrs(s, buck->eattrs, attrs, buf + MAX_ATTRS_LENGTH);

  /* Negative result: attribute list too long */
  if (attrs_len < 0)
  {
    bgp_withdraw_bucket(s->channel, buck);
    return NULL;
  }

  /* The two length fields preceding the attributes */
  put_u16(buf, 0);
  put_u16(buf + 2, attrs_len);

  /* NLRI follows right after the attributes */
  byte *nlri = attrs + attrs_len;
  int nlri_len = bgp_encode_nlri(s, buck, nlri, end);

  return nlri + nlri_len;
}
1170

    
1171
/*
 * Build the body of an UPDATE announcing prefixes of bucket @buck through the
 * MP_REACH_NLRI attribute. Returns the end of the written data, or NULL when
 * the attributes could not be encoded (the bucket is then handed to
 * bgp_withdraw_bucket()).
 */
static byte *
bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   *        2 B        IPv4 Withdrawn Routes Length (zero)
   *        ---        IPv4 Withdrawn Routes NLRI (unused)
   *        2 B        Total Path Attribute Length
   *        1 B        MP_REACH_NLRI hdr - Attribute Flags
   *        1 B        MP_REACH_NLRI hdr - Attribute Type Code
   *        2 B        MP_REACH_NLRI hdr - Length of Attribute Data
   *        2 B        MP_REACH_NLRI data - Address Family Identifier
   *        1 B        MP_REACH_NLRI data - Subsequent Address Family Identifier
   *        1 B        MP_REACH_NLRI data - Length of Next Hop Network Address
   *        var        MP_REACH_NLRI data - Network Address of Next Hop
   *        1 B        MP_REACH_NLRI data - Reserved (zero)
   *        var        MP_REACH_NLRI data - Network Layer Reachability Information
   *        var        Rest of Path Attributes
   *        ---        IPv4 Network Layer Reachability Information (unused)
   */

  byte *mp = buf + 4;           /* Start of the MP_REACH_NLRI attribute */
  byte *mp_data = mp + 4;       /* Start of the MP_REACH_NLRI attribute data */

  /* MP_REACH_NLRI attribute header; the length is filled in later */
  mp[0] = BAF_OPTIONAL | BAF_EXT_LEN;
  mp[1] = BA_MP_REACH_NLRI;
  put_u16(mp + 2, 0);
  put_af3(mp_data, s->channel->afi);
  byte *pos = mp_data + 3;

  /* The other attributes go to a temporary buffer, they are appended last */
  byte *tmp = alloca(MAX_ATTRS_LENGTH);
  int attrs_len = bgp_encode_attrs(s, buck->eattrs, tmp, tmp + MAX_ATTRS_LENGTH);
  if (attrs_len < 0)
  {
    /* Attribute list too long */
    bgp_withdraw_bucket(s->channel, buck);
    return NULL;
  }

  /* Next hop, preceded by its one-byte length */
  int nh_len = bgp_encode_next_hop(s, s->mp_next_hop, pos + 1);
  pos[0] = nh_len;
  pos += 1 + nh_len;

  /* Reserved field */
  *pos++ = 0;

  /* NLRI; keep enough room for the attributes copied behind it */
  int nlri_len = bgp_encode_nlri(s, buck, pos, end - attrs_len);
  pos += nlri_len;

  /* MP_REACH_NLRI attribute is complete, fix its data length */
  put_u16(mp + 2, pos - mp_data);

  /* Append the remaining attributes */
  memcpy(pos, tmp, attrs_len);
  pos += attrs_len;

  /* Initial UPDATE fields */
  put_u16(buf, 0);
  put_u16(buf + 2, pos - mp);

  return pos;
}
1235

    
1236
/* MAX_ATTRS_LENGTH depends on locals named 'buf' and 'end'; keep it from leaking further */
#undef MAX_ATTRS_LENGTH
1237

    
1238
/*
 * Build the body of a plain IPv4 UPDATE withdrawing prefixes of bucket @buck.
 * Returns the end of the written data.
 */
static byte *
bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   *        2 B        Withdrawn Routes Length
   *        var        IPv4 Withdrawn Routes NLRI
   *        2 B        Total Path Attribute Length (zero)
   *        ---        Path Attributes (unused)
   *        ---        IPv4 Network Layer Reachability Information (unused)
   */

  byte *nlri = buf + 2;
  uint nlri_len = bgp_encode_nlri(s, buck, nlri, end);
  byte *tail = nlri + nlri_len;

  put_u16(buf, nlri_len);       /* Withdrawn Routes Length */
  put_u16(tail, 0);             /* Total Path Attribute Length */

  return tail + 2;
}
1256

    
1257
/*
 * Build the body of an UPDATE withdrawing prefixes of bucket @buck through
 * the MP_UNREACH_NLRI attribute. Returns the end of the written data.
 */
static byte *
bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   *        2 B        Withdrawn Routes Length (zero)
   *        ---        IPv4 Withdrawn Routes NLRI (unused)
   *        2 B        Total Path Attribute Length
   *        1 B        MP_UNREACH_NLRI hdr - Attribute Flags
   *        1 B        MP_UNREACH_NLRI hdr - Attribute Type Code
   *        2 B        MP_UNREACH_NLRI hdr - Length of Attribute Data
   *        2 B        MP_UNREACH_NLRI data - Address Family Identifier
   *        1 B        MP_UNREACH_NLRI data - Subsequent Address Family Identifier
   *        var        MP_UNREACH_NLRI data - Network Layer Reachability Information
   *        ---        IPv4 Network Layer Reachability Information (unused)
   */

  /* NLRI goes first, the headers in front of it need its length */
  byte *nlri = buf + 11;
  uint nlri_len = bgp_encode_nlri(s, buck, nlri, end);

  /* Initial UPDATE fields; 7 B = attribute header (4 B) + AFI/SAFI (3 B) */
  put_u16(buf, 0);
  put_u16(buf + 2, 7 + nlri_len);

  /* MP_UNREACH_NLRI attribute header and AFI/SAFI */
  byte *mp = buf + 4;
  mp[0] = BAF_OPTIONAL | BAF_EXT_LEN;
  mp[1] = BA_MP_UNREACH_NLRI;
  put_u16(mp + 2, 3 + nlri_len);
  put_af3(mp + 4, s->channel->afi);

  return nlri + nlri_len;
}
1286

    
1287
/*
 * Create the body of the next UPDATE message for channel @c in @buf.
 *
 * A non-empty withdraw bucket is served first, then the first bucket in the
 * bucket queue. Empty buckets found at the head of the queue are freed and
 * the selection is retried; it is retried also when a reachability builder
 * returns NULL (it could not encode the attributes of the bucket).
 *
 * Returns the end of the written data, or NULL when there are no more
 * prefixes to send.
 */
static byte *
bgp_create_update(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  struct bgp_bucket *buck;
  byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH);
  byte *res = NULL;

again: ;

  /*
   * Initialize write state. This is done for every attempt, so that fields
   * filled in while processing a bucket that was given up (such as the
   * mp_next_hop read by bgp_create_mp_reach()) cannot leak into the message
   * built from the next bucket.
   */
  struct bgp_write_state s = {
    .proto = p,
    .channel = c,
    .pool = bgp_linpool,
    .as4_session = p->as4_session,
    .add_path = c->add_path_tx,
  };

  /* Try unreachable bucket */
  if ((buck = c->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
  {
    res = (c->afi == BGP_AF_IPV4) ?
      bgp_create_ip_unreach(&s, buck, buf, end):
      bgp_create_mp_unreach(&s, buck, buf, end);

    goto done;
  }

  /* Try reachable buckets */
  if (!EMPTY_LIST(c->bucket_queue))
  {
    buck = HEAD(c->bucket_queue);

    /* Cleanup empty buckets */
    if (EMPTY_LIST(buck->prefixes))
    {
      bgp_free_bucket(c, buck);
      goto again;
    }

    res = (c->afi == BGP_AF_IPV4) ?
      bgp_create_ip_reach(&s, buck, buf, end):
      bgp_create_mp_reach(&s, buck, buf, end);

    /* Bucket completely sent -> free it, otherwise keep the rest for later */
    if (EMPTY_LIST(buck->prefixes))
      bgp_free_bucket(c, buck);
    else
      bgp_defer_bucket(c, buck);

    /* Nothing was produced from this bucket, try the next one */
    if (!res)
      goto again;

    goto done;
  }

  /* No more prefixes to send */
  return NULL;

done:
  BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
  lp_flush(s.pool);

  return res;
}
1352

    
1353
static byte *
1354
bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf)
1355
{
1356
  /* Empty update packet */
1357
  put_u32(buf, 0);
1358

    
1359
  return buf+4;
1360
}
1361

    
1362
/*
 * Create the End-of-RIB marker for a non-IPv4 channel: an UPDATE body whose
 * only path attribute is an MP_UNREACH_NLRI carrying just the AFI/SAFI of
 * channel @c. Returns the end of the written data.
 */
static byte *
bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf)
{
  byte *attr = buf + 4;

  /* Initial UPDATE fields */
  put_u16(buf, 0);
  put_u16(buf + 2, 6);          /* Attributes occupy bytes 4--9 */

  /* Empty MP_UNREACH_NLRI attribute with a one-byte length field */
  attr[0] = BAF_OPTIONAL;
  attr[1] = BA_MP_UNREACH_NLRI;
  attr[2] = 3;                  /* Attribute data occupy bytes 7--9 */
  put_af3(attr + 3, c->afi);

  return attr + 6;
}
1376

    
1377
/*
 * Create the End-of-RIB marker for channel @c, choosing the plain IPv4 form
 * or the MP_UNREACH_NLRI form according to the AFI of the channel.
 */
static byte *
bgp_create_end_mark(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");

  if (c->afi == BGP_AF_IPV4)
    return bgp_create_ip_end_mark(c, buf);

  return bgp_create_mp_end_mark(c, buf);
}
1388

    
1389
static inline void
1390
bgp_rx_end_mark(struct bgp_proto *p, u32 afi)
1391
{
1392
  struct bgp_channel *c = bgp_get_channel(p, afi);
1393

    
1394
  BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
1395

    
1396
  /* XXXX handle unknown AF in MP_*_NLRI */
1397
  if (!c)
1398
    return;
1399

    
1400
  if (c->load_state == BFS_LOADING)
1401
    c->load_state = BFS_NONE;
1402

    
1403
  if (p->p.gr_recovery)
1404
    channel_graceful_restart_unlock(&c->c);
1405

    
1406
  if (c->gr_active)
1407
    bgp_graceful_restart_done(c);
1408
}
1409

    
1410
static inline void
1411
bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len)
1412
{
1413
  struct bgp_channel *c = bgp_get_channel(s->proto, afi);
1414
  rta *a = NULL;
1415

    
1416
  /* XXXX handle unknown AF in MP_*_NLRI */
1417
  if (!c)
1418
    return;
1419

    
1420
  s->channel = c;
1421
  s->add_path = c->add_path_rx;
1422

    
1423
  s->last_id = 0;
1424
  s->last_src = s->proto->p.main_source;
1425

    
1426
  /*
1427
   * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
1428
   * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
1429
   * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
1430
   * decode_next_hop hooks) by restoring a->eattrs afterwards.
1431
   */
1432

    
1433
  if (ea)
1434
  {
1435
    a = allocz(sizeof(struct rta));
1436

    
1437
    a->source = RTS_BGP;
1438
    a->scope = SCOPE_UNIVERSE;
1439
    a->from = s->proto->cf->remote_ip;
1440
    a->eattrs = ea;
1441

    
1442
    c->desc->decode_next_hop(s, nh, nh_len, a);
1443

    
1444
    /* Handle withdraw during next hop decoding */
1445
    if (s->err_withdraw)
1446
      a = NULL;
1447
  }
1448

    
1449
  c->desc->decode_nlri(s, nlri, len, a);
1450

    
1451
  rta_free(s->cached_rta);
1452
  s->cached_rta = NULL;
1453
}
1454

    
1455
static void
1456
bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
1457
{
1458
  struct bgp_proto *p = conn->bgp;
1459
  ea_list *ea = NULL;
1460

    
1461
  BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
1462

    
1463
  /* Workaround for some BGP implementations that skip initial KEEPALIVE */
1464
  if (conn->state == BS_OPENCONFIRM)
1465
    bgp_conn_enter_established_state(conn);
1466

    
1467
  if (conn->state != BS_ESTABLISHED)
1468
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
1469

    
1470
  bgp_start_timer(conn->hold_timer, conn->hold_time);
1471

    
1472
  /* Initialize parse state */
1473
  struct bgp_parse_state s = {
1474
    .proto = p,
1475
    .pool = bgp_linpool,
1476
    .as4_session = p->as4_session,
1477
  };
1478

    
1479
  /* Parse error handler */
1480
  if (setjmp(s.err_jmpbuf))
1481
  {
1482
    bgp_error(conn, 3, s.err_subcode, NULL, 0);
1483
    goto done;
1484
  }
1485

    
1486
  /* Check minimal length */
1487
  if (len < 23)
1488
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
1489

    
1490
  /* Skip fixed header */
1491
  uint pos = 19;
1492

    
1493
  /*
1494
   *        UPDATE message format
1495
   *
1496
   *        2 B        IPv4 Withdrawn Routes Length
1497
   *        var        IPv4 Withdrawn Routes NLRI
1498
   *        2 B        Total Path Attribute Length
1499
   *        var        Path Attributes
1500
   *        var        IPv4 Reachable Routes NLRI
1501
   */
1502

    
1503
  s.ip_unreach_len = get_u16(pkt + pos);
1504
  s.ip_unreach_nlri = pkt + pos + 2;
1505
  pos += 2 + s.ip_unreach_len;
1506

    
1507
  if (pos + 2 > len)
1508
    bgp_parse_error(&s, 1);
1509

    
1510
  s.attr_len = get_u16(pkt + pos);
1511
  s.attrs = pkt + pos + 2;
1512
  pos += 2 + s.attr_len;
1513

    
1514
  if (pos > len)
1515
    bgp_parse_error(&s, 1);
1516

    
1517
  s.ip_reach_len = len - pos;
1518
  s.ip_reach_nlri = pkt + pos;
1519

    
1520

    
1521
  if (s.attr_len)
1522
    ea = bgp_decode_attrs(&s, s.attrs, s.attr_len);
1523

    
1524
  /* Check for End-of-RIB marker */
1525
  if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len)
1526
  { bgp_rx_end_mark(p, BGP_AF_IPV4); goto done; }
1527

    
1528
  /* Check for MP End-of-RIB marker */
1529
  if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len &&
1530
      !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af) /* XXXX  See RFC 7606 5.2 */
1531
  { bgp_rx_end_mark(p, s.mp_unreach_af); goto done; }
1532

    
1533
  if (s.ip_unreach_len)
1534
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0);
1535

    
1536
  if (s.mp_unreach_len)
1537
    bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0);
1538

    
1539
  if (s.ip_reach_len)
1540
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len,
1541
                    ea, s.ip_next_hop_data, s.ip_next_hop_len);
1542

    
1543
  if (s.mp_reach_len)
1544
    bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len,
1545
                    ea, s.mp_next_hop_data, s.mp_next_hop_len);
1546

    
1547
done:
1548
  rta_free(s.cached_rta);
1549
  lp_flush(s.pool);
1550
  return;
1551
}
1552

    
1553

    
1554
/*
1555
 *        ROUTE-REFRESH
1556
 */
1557

    
1558
/*
 * Create the body of a ROUTE-REFRESH request for channel @c: AFI and SAFI of
 * the channel with the byte between them set to BGP_RR_REQUEST. Returns the
 * end of the written data.
 */
static inline byte *
bgp_create_route_refresh(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");

  /* Original route refresh request, RFC 2918 */
  put_af4(buf, c->afi);
  buf[2] = BGP_RR_REQUEST;

  return end;
}
1571

    
1572
/*
 * Create the body of a ROUTE-REFRESH message of the BoRR subtype for channel
 * @c: AFI and SAFI of the channel with the byte between them set to
 * BGP_RR_BEGIN. Returns the end of the written data.
 */
static inline byte *
bgp_create_begin_refresh(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");

  /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
  put_af4(buf, c->afi);
  buf[2] = BGP_RR_BEGIN;

  return end;
}
1585

    
1586
/*
 * Create the body of a ROUTE-REFRESH message of the EoRR subtype for channel
 * @c: AFI and SAFI of the channel with the byte between them set to
 * BGP_RR_END. Returns the end of the written data.
 */
static inline byte *
bgp_create_end_refresh(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RR");

  /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
  put_af4(buf, c->afi);
  buf[2] = BGP_RR_END;

  return end;
}
1599

    
1600
static void
1601
bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
1602
{
1603
  struct bgp_proto *p = conn->bgp;
1604

    
1605
  if (conn->state != BS_ESTABLISHED)
1606
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
1607

    
1608
  if (!conn->local_caps->route_refresh)
1609
  { bgp_error(conn, 1, 3, pkt+18, 1); return; }
1610

    
1611
  if (len < (BGP_HEADER_LENGTH + 4))
1612
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
1613

    
1614
  if (len > (BGP_HEADER_LENGTH + 4))
1615
  { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
1616

    
1617
  struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19));
1618
  if (!c)
1619
  {
1620
    log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
1621
        p->p.name, pkt[21], get_u16(pkt+19), pkt[22]);
1622
    return;
1623
  }
1624

    
1625
  /* RFC 7313 redefined reserved field as RR message subtype */
1626
  uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST;
1627

    
1628
  switch (subtype)
1629
  {
1630
  case BGP_RR_REQUEST:
1631
    BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
1632
    channel_request_feeding(&c->c);
1633
    break;
1634

    
1635
  case BGP_RR_BEGIN:
1636
    BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
1637
    bgp_refresh_begin(c);
1638
    break;
1639

    
1640
  case BGP_RR_END:
1641
    BGP_TRACE(D_PACKETS, "Got END-OF-RR");
1642
    bgp_refresh_end(c);
1643
    break;
1644

    
1645
  default:
1646
    log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
1647
        p->p.name, subtype);
1648
    break;
1649
  }
1650
}
1651

    
1652
static inline struct bgp_channel *
1653
bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn)
1654
{
1655
  uint i = conn->last_channel;
1656

    
1657
  /* Try the last channel, but at most several times */
1658
  if ((conn->channels_to_send & (1 << i)) &&
1659
      (conn->last_channel_count < 16))
1660
    goto found;
1661

    
1662
  /* Find channel with non-zero channels_to_send */
1663
  do
1664
  {
1665
    i++;
1666
    if (i >= p->channel_count)
1667
      i = 0;
1668
  }
1669
  while (! (conn->channels_to_send & (1 << i)));
1670

    
1671
  /* Use that channel */
1672
  conn->last_channel = i;
1673
  conn->last_channel_count = 0;
1674

    
1675
found:
1676
  conn->last_channel_count++;
1677
  return p->channel_map[i];
1678
}
1679

    
1680
static inline int
1681
bgp_send(struct bgp_conn *conn, uint type, uint len)
1682
{
1683
  sock *sk = conn->sk;
1684
  byte *buf = sk->tbuf;
1685

    
1686
  memset(buf, 0xff, 16);                /* Marker */
1687
  put_u16(buf+16, len);
1688
  buf[18] = type;
1689

    
1690
  return sk_send(sk, len);
1691
}
1692

    
1693
/**
1694
 * bgp_fire_tx - transmit packets
1695
 * @conn: connection
1696
 *
1697
 * Whenever the transmit buffers of the underlying TCP connection
1698
 * are free and we have any packets queued for sending, the socket functions
1699
 * call bgp_fire_tx() which takes care of selecting the highest priority packet
1700
 * queued (Notification > Keepalive > Open > Update), assembling its header
1701
 * and body and sending it to the connection.
1702
 */
1703
static int
1704
bgp_fire_tx(struct bgp_conn *conn)
1705
{
1706
  struct bgp_proto *p = conn->bgp;
1707
  struct bgp_channel *c;
1708
  byte *buf, *pkt, *end;
1709
  uint s;
1710

    
1711
  if (!conn->sk)
1712
    return 0;
1713

    
1714
  buf = conn->sk->tbuf;
1715
  pkt = buf + BGP_HEADER_LENGTH;
1716
  s = conn->packets_to_send;
1717

    
1718
  if (s & (1 << PKT_SCHEDULE_CLOSE))
1719
  {
1720
    /* We can finally close connection and enter idle state */
1721
    bgp_conn_enter_idle_state(conn);
1722
    return 0;
1723
  }
1724
  if (s & (1 << PKT_NOTIFICATION))
1725
  {
1726
    conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE;
1727
    end = bgp_create_notification(conn, pkt);
1728
    return bgp_send(conn, PKT_NOTIFICATION, end - buf);
1729
  }
1730
  else if (s & (1 << PKT_KEEPALIVE))
1731
  {
1732
    conn->packets_to_send &= ~(1 << PKT_KEEPALIVE);
1733
    BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
1734
    bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
1735
    return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH);
1736
  }
1737
  else if (s & (1 << PKT_OPEN))
1738
  {
1739
    conn->packets_to_send &= ~(1 << PKT_OPEN);
1740
    end = bgp_create_open(conn, pkt);
1741
    return bgp_send(conn, PKT_OPEN, end - buf);
1742
  }
1743
  else while (conn->channels_to_send)
1744
  {
1745
    c = bgp_get_channel_to_send(p, conn);
1746
    s = c->packets_to_send;
1747

    
1748
    if (s & (1 << PKT_ROUTE_REFRESH))
1749
    {
1750
      c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH);
1751
      end = bgp_create_route_refresh(c, pkt);
1752
      return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
1753
    }
1754
    else if (s & (1 << PKT_BEGIN_REFRESH))
1755
    {
1756
      /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
1757
      c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH);
1758
      end = bgp_create_begin_refresh(c, pkt);
1759
      return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
1760
    }
1761
    else if (s & (1 << PKT_UPDATE))
1762
    {
1763
      end = bgp_create_update(c, pkt);
1764
      if (end)
1765
        return bgp_send(conn, PKT_UPDATE, end - buf);
1766

    
1767
      /* No update to send, perhaps we need to send End-of-RIB or EoRR */
1768
      c->packets_to_send = 0;
1769
      conn->channels_to_send &= ~(1 << c->index);
1770

    
1771
      if (c->feed_state == BFS_LOADED)
1772
      {
1773
        c->feed_state = BFS_NONE;
1774
        end = bgp_create_end_mark(c, pkt);
1775
        return bgp_send(conn, PKT_UPDATE, end - buf);
1776
      }
1777

    
1778
      else if (c->feed_state == BFS_REFRESHED)
1779
      {
1780
        c->feed_state = BFS_NONE;
1781
        end = bgp_create_end_refresh(c, pkt);
1782
        return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
1783
      }
1784
    }
1785
    else if (s)
1786
      bug("Channel packets_to_send: %x", s);
1787

    
1788
    c->packets_to_send = 0;
1789
    conn->channels_to_send &= ~(1 << c->index);
1790
  }
1791

    
1792
  return 0;
1793
}
1794

    
1795
/**
1796
 * bgp_schedule_packet - schedule a packet for transmission
1797
 * @conn: connection
1798
 * @c: channel
1799
 * @type: packet type
1800
 *
1801
 * Schedule a packet of type @type to be sent as soon as possible.
1802
 */
1803
void
1804
bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
1805
{
1806
  ASSERT(conn->sk);
1807

    
1808
  DBG("BGP: Scheduling packet type %d\n", type);
1809

    
1810
  if (c)
1811
  {
1812
    if (! conn->channels_to_send)
1813
    {
1814
      conn->last_channel = c->index;
1815
      conn->last_channel_count = 0;
1816
    }
1817

    
1818
    c->packets_to_send |= 1 << type;
1819
    conn->channels_to_send |= 1 << c->index;
1820
  }
1821
  else
1822
    conn->packets_to_send |= 1 << type;
1823

    
1824
  if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev))
1825
    ev_schedule(conn->tx_ev);
1826
}
1827

    
1828
/*
 * Start transmission on the connection passed as an untyped pointer: call
 * bgp_fire_tx() repeatedly until it returns a non-positive value.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;
  int sent;

  DBG("BGP: kicking TX\n");

  do
    sent = bgp_fire_tx(conn);
  while (sent > 0);
}
1837

    
1838
void
1839
bgp_tx(sock *sk)
1840
{
1841
  struct bgp_conn *conn = sk->data;
1842

    
1843
  DBG("BGP: TX hook\n");
1844
  while (bgp_fire_tx(conn) > 0)
1845
    ;
1846
}
1847

    
1848

    
1849
static struct {
1850
  byte major, minor;
1851
  byte *msg;
1852
} bgp_msg_table[] = {
1853
  { 1, 0, "Invalid message header" },
1854
  { 1, 1, "Connection not synchronized" },
1855
  { 1, 2, "Bad message length" },
1856
  { 1, 3, "Bad message type" },
1857
  { 2, 0, "Invalid OPEN message" },
1858
  { 2, 1, "Unsupported version number" },
1859
  { 2, 2, "Bad peer AS" },
1860
  { 2, 3, "Bad BGP identifier" },
1861
  { 2, 4, "Unsupported optional parameter" },
1862
  { 2, 5, "Authentication failure" },
1863
  { 2, 6, "Unacceptable hold time" },
1864
  { 2, 7, "Required capability missing" }, /* [RFC5492] */
1865
  { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
1866
  { 3, 0, "Invalid UPDATE message" },
1867
  { 3, 1, "Malformed attribute list" },
1868
  { 3, 2, "Unrecognized well-known attribute" },
1869
  { 3, 3, "Missing mandatory attribute" },
1870
  { 3, 4, "Invalid attribute flags" },
1871
  { 3, 5, "Invalid attribute length" },
1872
  { 3, 6, "Invalid ORIGIN attribute" },
1873
  { 3, 7, "AS routing loop" },                /* Deprecated */
1874
  { 3, 8, "Invalid NEXT_HOP attribute" },
1875
  { 3, 9, "Optional attribute error" },
1876
  { 3, 10, "Invalid network field" },
1877
  { 3, 11, "Malformed AS_PATH" },
1878
  { 4, 0, "Hold timer expired" },
1879
  { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
1880
  { 5, 1, "Unexpected message in OpenSent state" },
1881
  { 5, 2, "Unexpected message in OpenConfirm state" },
1882
  { 5, 3, "Unexpected message in Established state" },
1883
  { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
1884
  { 6, 1, "Maximum number of prefixes reached" },
1885
  { 6, 2, "Administrative shutdown" },
1886
  { 6, 3, "Peer de-configured" },
1887
  { 6, 4, "Administrative reset" },
1888
  { 6, 5, "Connection rejected" },
1889
  { 6, 6, "Other configuration change" },
1890
  { 6, 7, "Connection collision resolution" },
1891
  { 6, 8, "Out of Resources" },
1892
  { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
1893
  { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
1894
};
1895

    
1896
/**
1897
 * bgp_error_dsc - return BGP error description
1898
 * @code: BGP error code
1899
 * @subcode: BGP error subcode
1900
 *
1901
 * bgp_error_dsc() returns error description for BGP errors
1902
 * which might be static string or given temporary buffer.
1903
 */
1904
const char *
1905
bgp_error_dsc(uint code, uint subcode)
1906
{
1907
  static char buff[32];
1908
  uint i;
1909

    
1910
  for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
1911
    if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
1912
      return bgp_msg_table[i].msg;
1913

    
1914
  bsprintf(buff, "Unknown error %u.%u", code, subcode);
1915
  return buff;
1916
}
1917

    
1918
void
1919
bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len)
1920
{
1921
  const byte *name;
1922
  byte *t, argbuf[36];
1923
  uint i;
1924

    
1925
  /* Don't report Cease messages generated by myself */
1926
  if (code == 6 && class == BE_BGP_TX)
1927
    return;
1928

    
1929
  name = bgp_error_dsc(code, subcode);
1930
  t = argbuf;
1931
  if (len)
1932
    {
1933
      *t++ = ':';
1934
      *t++ = ' ';
1935

    
1936
      if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
1937
        {
1938
          /* Bad peer AS - we would like to print the AS */
1939
          t += bsprintf(t, "%u", (len == 2) ? get_u16(data) : get_u32(data));
1940
          goto done;
1941
        }
1942
      if (len > 16)
1943
        len = 16;
1944
      for (i=0; i<len; i++)
1945
        t += bsprintf(t, "%02x", data[i]);
1946
    }
1947
 done:
1948
  *t = 0;
1949
  log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, name, argbuf);
1950
}
1951

    
1952
static void
1953
bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
1954
{
1955
  struct bgp_proto *p = conn->bgp;
1956

    
1957
  if (len < 21)
1958
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
1959

    
1960
  uint code = pkt[19];
1961
  uint subcode = pkt[20];
1962
  int err = (code != 6);
1963

    
1964
  bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
1965
  bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
1966

    
1967
  bgp_conn_enter_close_state(conn);
1968
  bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
1969

    
1970
  if (err)
1971
  {
1972
    bgp_update_startup_delay(p);
1973
    bgp_stop(p, 0);
1974
  }
1975
}
1976

    
1977
static void
1978
bgp_rx_keepalive(struct bgp_conn *conn)
1979
{
1980
  struct bgp_proto *p = conn->bgp;
1981

    
1982
  BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
1983
  bgp_start_timer(conn->hold_timer, conn->hold_time);
1984

    
1985
  if (conn->state == BS_OPENCONFIRM)
1986
  { bgp_conn_enter_established_state(conn); return; }
1987

    
1988
  if (conn->state != BS_ESTABLISHED)
1989
    bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
1990
}
1991

    
1992

    
1993
/**
1994
 * bgp_rx_packet - handle a received packet
1995
 * @conn: BGP connection
1996
 * @pkt: start of the packet
1997
 * @len: packet size
1998
 *
1999
 * bgp_rx_packet() takes a newly received packet and calls the corresponding
2000
 * packet handler according to the packet type.
2001
 */
2002
static void
2003
bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len)
2004
{
2005
  byte type = pkt[18];
2006

    
2007
  DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
2008

    
2009
  if (conn->bgp->p.mrtdump & MD_MESSAGES)
2010
    mrt_dump_bgp_packet(conn, pkt, len);
2011

    
2012
  switch (type)
2013
  {
2014
  case PKT_OPEN:                return bgp_rx_open(conn, pkt, len);
2015
  case PKT_UPDATE:                return bgp_rx_update(conn, pkt, len);
2016
  case PKT_NOTIFICATION:        return bgp_rx_notification(conn, pkt, len);
2017
  case PKT_KEEPALIVE:                return bgp_rx_keepalive(conn);
2018
  case PKT_ROUTE_REFRESH:        return bgp_rx_route_refresh(conn, pkt, len);
2019
  default:                        bgp_error(conn, 1, 3, pkt+18, 1);
2020
  }
2021
}
2022

    
2023
/**
2024
 * bgp_rx - handle received data
2025
 * @sk: socket
2026
 * @size: amount of data received
2027
 *
2028
 * bgp_rx() is called by the socket layer whenever new data arrive from
2029
 * the underlying TCP connection. It assembles the data fragments to packets,
2030
 * checks their headers and framing and passes complete packets to
2031
 * bgp_rx_packet().
2032
 */
2033
int
2034
bgp_rx(sock *sk, uint size)
2035
{
2036
  struct bgp_conn *conn = sk->data;
2037
  byte *pkt_start = sk->rbuf;
2038
  byte *end = pkt_start + size;
2039
  uint i, len;
2040

    
2041
  DBG("BGP: RX hook: Got %d bytes\n", size);
2042
  while (end >= pkt_start + BGP_HEADER_LENGTH)
2043
    {
2044
      if ((conn->state == BS_CLOSE) || (conn->sk != sk))
2045
        return 0;
2046
      for(i=0; i<16; i++)
2047
        if (pkt_start[i] != 0xff)
2048
          {
2049
            bgp_error(conn, 1, 1, NULL, 0);
2050
            break;
2051
          }
2052
      len = get_u16(pkt_start+16);
2053
      if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn)))
2054
        {
2055
          bgp_error(conn, 1, 2, pkt_start+16, 2);
2056
          break;
2057
        }
2058
      if (end < pkt_start + len)
2059
        break;
2060
      bgp_rx_packet(conn, pkt_start, len);
2061
      pkt_start += len;
2062
    }
2063
  if (pkt_start != sk->rbuf)
2064
    {
2065
      memmove(sk->rbuf, pkt_start, end - pkt_start);
2066
      sk->rpos = sk->rbuf + (end - pkt_start);
2067
    }
2068
  return 0;
2069
}