Statistics
| Branch: | Revision:

iof-bird-daemon / proto / bgp / packets.c @ 49c7ef3b

History | View | Annotate | Download (68.1 KB)

1
/*
2
 *        BIRD -- BGP Packet Processing
3
 *
4
 *        (c) 2000 Martin Mares <mj@ucw.cz>
5
 *        (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6
 *        (c) 2008--2016 CZ.NIC z.s.p.o.
7
 *
8
 *        Can be freely distributed and used under the terms of the GNU GPL.
9
 */
10

    
11
#undef LOCAL_DEBUG
12

    
13
#include <stdlib.h>
14

    
15
#include "nest/bird.h"
16
#include "nest/iface.h"
17
#include "nest/protocol.h"
18
#include "nest/route.h"
19
#include "nest/attrs.h"
20
#include "nest/mrtdump.h"
21
#include "conf/conf.h"
22
#include "lib/unaligned.h"
23
#include "lib/flowspec.h"
24
#include "lib/socket.h"
25

    
26
#include "nest/cli.h"
27

    
28
#include "bgp.h"
29

    
30

    
31
#define BGP_RR_REQUEST                0
32
#define BGP_RR_BEGIN                1
33
#define BGP_RR_END                2
34

    
35
#define BGP_NLRI_MAX                (4 + 1 + 32)
36

    
37
#define BGP_MPLS_BOS                1        /* Bottom-of-stack bit */
38
#define BGP_MPLS_MAX                10        /* Max number of labels that 24*n <= 255 */
39
#define BGP_MPLS_NULL                3        /* Implicit NULL label */
40
#define BGP_MPLS_MAGIC                0x800000 /* Magic withdraw label value, RFC 3107 3 */
41

    
42

    
43
static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
44
static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
45

    
46
/* Table for state -> RFC 6608 FSM error subcodes */
47
static byte fsm_err_subcode[BS_MAX] = {
48
  [BS_OPENSENT] = 1,
49
  [BS_OPENCONFIRM] = 2,
50
  [BS_ESTABLISHED] = 3
51
};
52

    
53

    
54
static struct bgp_channel *
55
bgp_get_channel(struct bgp_proto *p, u32 afi)
56
{
57
  uint i;
58

    
59
  for (i = 0; i < p->channel_count; i++)
60
    if (p->afi_map[i] == afi)
61
      return p->channel_map[i];
62

    
63
  return NULL;
64
}
65

    
66
static inline void
67
put_af3(byte *buf, u32 id)
68
{
69
  put_u16(buf, id >> 16);
70
  buf[2] = id & 0xff;
71
}
72

    
73
static inline void
74
put_af4(byte *buf, u32 id)
75
{
76
  put_u16(buf, id >> 16);
77
  buf[2] = 0;
78
  buf[3] = id & 0xff;
79
}
80

    
81
static inline u32
82
get_af3(byte *buf)
83
{
84
  return (get_u16(buf) << 16) | buf[2];
85
}
86

    
87
static inline u32
88
get_af4(byte *buf)
89
{
90
  return (get_u16(buf) << 16) | buf[3];
91
}
92

    
93
/*
94
 * MRT Dump format is not semantically specified.
95
 * We will use these values in appropriate fields:
96
 *
97
 * Local AS, Remote AS - configured AS numbers for given BGP instance.
98
 * Local IP, Remote IP - IP addresses of the TCP connection (0 if no connection)
99
 *
100
 * We dump two kinds of MRT messages: STATE_CHANGE (for BGP state
101
 * changes) and MESSAGE (for received BGP messages).
102
 *
103
 * STATE_CHANGE uses always AS4 variant, but MESSAGE uses AS4 variant
104
 * only when AS4 session is established and even in that case MESSAGE
105
 * does not use AS4 variant for initial OPEN message. This strange
106
 * behavior is here for compatibility with Quagga and Bgpdump.
107
 */
108

    
109
/*
 * Write the common BGP4MP header fields into @buf.
 *
 * Layout, in order: remote AS and local (public) AS -- 4 bytes each when
 * @as4 is set, otherwise 2 bytes each with AS_TRANS substituted for values
 * above 0xFFFF -- then the interface index of the neighbor (0 if unknown),
 * the address family, and finally the remote and local addresses of the
 * connection socket (zero addresses when there is no socket).
 *
 * The address family is chosen by the configured remote IP.
 *
 * Returns the position just past the written header.
 */
static byte *
mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4)
{
  struct bgp_proto *p = conn->bgp;
  uint is_v4 = ipa_is_ip4(p->cf->remote_ip);
  byte *pos = buf;

  if (!as4)
  {
    /* 2-byte AS numbers, larger ones are replaced by AS_TRANS */
    put_u16(pos + 0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS);
    put_u16(pos + 2, (p->public_as <= 0xFFFF) ? p->public_as : AS_TRANS);
    pos += 4;
  }
  else
  {
    put_u32(pos + 0, p->remote_as);
    put_u32(pos + 4, p->public_as);
    pos += 8;
  }

  /* Interface index and address family */
  put_u16(pos + 0, (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0);
  put_u16(pos + 2, is_v4 ? BGP_AFI_IPV4 : BGP_AFI_IPV6);
  pos += 4;

  /* Remote (daddr) address first, local (saddr) address second */
  if (!is_v4)
  {
    pos = put_ip6(pos, conn->sk ? ipa_to_ip6(conn->sk->daddr) : IP6_NONE);
    pos = put_ip6(pos, conn->sk ? ipa_to_ip6(conn->sk->saddr) : IP6_NONE);
  }
  else
  {
    pos = put_ip4(pos, conn->sk ? ipa_to_ip4(conn->sk->daddr) : IP4_NONE);
    pos = put_ip4(pos, conn->sk ? ipa_to_ip4(conn->sk->saddr) : IP4_NONE);
  }

  return pos;
}
145

    
146
static void
147
mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, uint len)
148
{
149
  byte *buf = alloca(128+len);        /* 128 is enough for MRT headers */
150
  byte *bp = buf + MRTDUMP_HDR_LENGTH;
151
  int as4 = conn->bgp->as4_session;
152

    
153
  bp = mrt_put_bgp4_hdr(bp, conn, as4);
154
  memcpy(bp, pkt, len);
155
  bp += len;
156
  mrt_dump_message(&conn->bgp->p, BGP4MP, as4 ? BGP4MP_MESSAGE_AS4 : BGP4MP_MESSAGE,
157
                   buf, bp-buf);
158
}
159

    
160
static inline u16
161
convert_state(uint state)
162
{
163
  /* Convert state from our BS_* values to values used in MRTDump */
164
  return (state == BS_CLOSE) ? 1 : state + 1;
165
}
166

    
167
void
168
mrt_dump_bgp_state_change(struct bgp_conn *conn, uint old, uint new)
169
{
170
  byte buf[128];
171
  byte *bp = buf + MRTDUMP_HDR_LENGTH;
172

    
173
  bp = mrt_put_bgp4_hdr(bp, conn, 1);
174
  put_u16(bp+0, convert_state(old));
175
  put_u16(bp+2, convert_state(new));
176
  bp += 4;
177
  mrt_dump_message(&conn->bgp->p, BGP4MP, BGP4MP_STATE_CHANGE_AS4, buf, bp-buf);
178
}
179

    
180
/*
 * Fill @buf with a NOTIFICATION body: error code, error subcode and
 * conn->notify_size bytes of conn->notify_data.
 *
 * Returns the position just past the written data.
 */
static byte *
bgp_create_notification(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  byte *payload = buf + 2;

  BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);

  buf[0] = conn->notify_code;
  buf[1] = conn->notify_subcode;
  memcpy(payload, conn->notify_data, conn->notify_size);

  return payload + conn->notify_size;
}
191

    
192

    
193
/* Capability negotiation as per RFC 5492 */
194

    
195
const struct bgp_af_caps *
196
bgp_find_af_caps(struct bgp_caps *caps, u32 afi)
197
{
198
  struct bgp_af_caps *ac;
199

    
200
  WALK_AF_CAPS(caps, ac)
201
    if (ac->afi == afi)
202
      return ac;
203

    
204
  return NULL;
205
}
206

    
207
static struct bgp_af_caps *
208
bgp_get_af_caps(struct bgp_caps *caps, u32 afi)
209
{
210
  struct bgp_af_caps *ac;
211

    
212
  WALK_AF_CAPS(caps, ac)
213
    if (ac->afi == afi)
214
      return ac;
215

    
216
  ac = &caps->af_data[caps->af_count++];
217
  memset(ac, 0, sizeof(struct bgp_af_caps));
218
  ac->afi = afi;
219

    
220
  return ac;
221
}
222

    
223
static int
224
bgp_af_caps_cmp(const void *X, const void *Y)
225
{
226
  const struct bgp_af_caps *x = X, *y = Y;
227
  return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0;
228
}
229

    
230

    
231
/*
 * Build the local capability set for @conn and serialize it into @buf.
 *
 * A struct bgp_caps with one per-AF slot for every channel of the protocol
 * is allocated from the protocol pool, filled from the configuration and
 * stored in conn->local_caps. The capabilities are then written to @buf as
 * a sequence of (code, length, data) records.
 *
 * Returns the position in @buf just past the last written byte.
 */
static byte *
bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  struct bgp_channel *ch;
  struct bgp_caps *lc;
  struct bgp_af_caps *afc;
  uint have_ext_next_hop = 0;
  uint have_add_path = 0;
  byte *lenp;
  byte *body;

  /* Part one: prepare the bgp_caps structure */

  int nchannels = list_length(&p->p.channels);
  lc = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + nchannels * sizeof(struct bgp_af_caps));
  conn->local_caps = lc;

  lc->as4_support = p->cf->enable_as4;
  lc->ext_messages = p->cf->enable_extended_messages;

  /* Both refresh variants follow the same configuration switch */
  lc->route_refresh = p->cf->enable_refresh;
  lc->enhanced_refresh = p->cf->enable_refresh;

  if (lc->as4_support)
    lc->as4_number = p->public_as;

  if (p->cf->gr_mode)
  {
    lc->gr_aware = 1;
    lc->gr_time = p->cf->gr_time;
    lc->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
  }

  /* One per-AF record for every channel */
  WALK_LIST(ch, p->p.channels)
  {
    afc = &lc->af_data[lc->af_count];
    lc->af_count++;

    afc->afi = ch->afi;
    afc->ready = 1;

    /* Extended next hop is offered for IPv4 channels only */
    afc->ext_next_hop = bgp_channel_is_ipv4(ch) && ch->cf->ext_next_hop;
    have_ext_next_hop |= afc->ext_next_hop;

    afc->add_path = ch->cf->add_path;
    have_add_path |= afc->add_path;

    if (ch->cf->gr_able)
    {
      afc->gr_able = 1;

      if (p->p.gr_recovery)
        afc->gr_af_flags |= BGP_GRF_FORWARDING;
    }
  }

  /* Keep the per-AF records ordered by AFI/SAFI */
  qsort(lc->af_data, lc->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp);


  /* Part two: write the capability list to the buffer */

  /*
   * Note that max length is ~ 20+14*af_count. With max 12 channels that is
   * 188. Option limit is 253 and buffer size is 4096, so we cannot overflow
   * unless we add new capabilities or more AFs.
   */

  /* Capability 1: Multiprotocol extensions, one record per ready AF */
  WALK_AF_CAPS(lc, afc)
    if (afc->ready)
    {
      buf[0] = 1;
      buf[1] = 4;                /* Capability data length */
      put_af4(buf + 2, afc->afi);
      buf += 6;
    }

  /* Capability 2: Support for route refresh (no data) */
  if (lc->route_refresh)
  {
    buf[0] = 2;
    buf[1] = 0;
    buf += 2;
  }

  /* Capability 5: Support for extended next hop */
  if (have_ext_next_hop)
  {
    *buf++ = 5;
    lenp = buf++;                /* Length byte, filled in below */
    body = buf;

    WALK_AF_CAPS(lc, afc)
      if (afc->ext_next_hop)
      {
        put_af4(buf, afc->afi);
        put_u16(buf+4, BGP_AFI_IPV6);
        buf += 6;
      }

    *lenp = buf - body;
  }

  /* Capability 6: Support for extended messages (no data) */
  if (lc->ext_messages)
  {
    buf[0] = 6;
    buf[1] = 0;
    buf += 2;
  }

  /* Capability 64: Support for graceful restart */
  if (lc->gr_aware)
  {
    *buf++ = 64;
    lenp = buf++;                /* Length byte, filled in below */
    body = buf;

    /* Restart time with the flags merged into its first byte */
    put_u16(buf, lc->gr_time);
    buf[0] |= lc->gr_flags;
    buf += 2;

    WALK_AF_CAPS(lc, afc)
      if (afc->gr_able)
      {
        put_af3(buf, afc->afi);
        buf[3] = afc->gr_af_flags;
        buf += 4;
      }

    *lenp = buf - body;
  }

  /* Capability 65: Support for 4-octet AS number */
  if (lc->as4_support)
  {
    buf[0] = 65;
    buf[1] = 4;                        /* Capability data length */
    put_u32(buf + 2, p->public_as);
    buf += 6;
  }

  /* Capability 69: Support for ADD-PATH */
  if (have_add_path)
  {
    *buf++ = 69;
    lenp = buf++;                /* Length byte, filled in below */
    body = buf;

    WALK_AF_CAPS(lc, afc)
      if (afc->add_path)
      {
        put_af3(buf, afc->afi);
        buf[3] = afc->add_path;
        buf += 4;
      }

    *lenp = buf - body;
  }

  /* Capability 70: Support for enhanced route refresh (no data) */
  if (lc->enhanced_refresh)
  {
    buf[0] = 70;
    buf[1] = 0;
    buf += 2;
  }

  return buf;
}
389

    
390
static void
391
bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len)
392
{
393
  struct bgp_proto *p = conn->bgp;
394
  struct bgp_af_caps *ac;
395
  int i, cl;
396
  u32 af;
397

    
398
  while (len > 0)
399
  {
400
    if (len < 2 || len < (2 + pos[1]))
401
      goto err;
402

    
403
    /* Capability length */
404
    cl = pos[1];
405

    
406
    /* Capability type */
407
    switch (pos[0])
408
    {
409
    case  1: /* Multiprotocol capability, RFC 4760 */
410
      if (cl != 4)
411
        goto err;
412

    
413
      af = get_af4(pos+2);
414
      ac = bgp_get_af_caps(caps, af);
415
      ac->ready = 1;
416
      break;
417

    
418
    case  2: /* Route refresh capability, RFC 2918 */
419
      if (cl != 0)
420
        goto err;
421

    
422
      caps->route_refresh = 1;
423
      break;
424

    
425
    case  5: /* Extended next hop encoding capability, RFC 5549 */
426
      if (cl % 6)
427
        goto err;
428

    
429
      for (i = 0; i < cl; i += 6)
430
      {
431
        /* Specified only for IPv4 prefixes with IPv6 next hops */
432
        if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) ||
433
            (get_u16(pos+2+i+4) != BGP_AFI_IPV6))
434
          continue;
435

    
436
        af = get_af4(pos+2+i);
437
        ac = bgp_get_af_caps(caps, af);
438
        ac->ext_next_hop = 1;
439
      }
440
      break;
441

    
442
    case  6: /* Extended message length capability, RFC draft */
443
      if (cl != 0)
444
        goto err;
445

    
446
      caps->ext_messages = 1;
447
      break;
448

    
449
    case 64: /* Graceful restart capability, RFC 4724 */
450
      if (cl % 4 != 2)
451
        goto err;
452

    
453
      /* Only the last instance is valid */
454
      WALK_AF_CAPS(caps, ac)
455
      {
456
        ac->gr_able = 0;
457
        ac->gr_af_flags = 0;
458
      }
459

    
460
      caps->gr_aware = 1;
461
      caps->gr_flags = pos[2] & 0xf0;
462
      caps->gr_time = get_u16(pos + 2) & 0x0fff;
463

    
464
      for (i = 2; i < cl; i += 4)
465
      {
466
        af = get_af3(pos+2+i);
467
        ac = bgp_get_af_caps(caps, af);
468
        ac->gr_able = 1;
469
        ac->gr_af_flags = pos[2+i+3];
470
      }
471
      break;
472

    
473
    case 65: /* AS4 capability, RFC 6793 */
474
      if (cl != 4)
475
        goto err;
476

    
477
      caps->as4_support = 1;
478
      caps->as4_number = get_u32(pos + 2);
479
      break;
480

    
481
    case 69: /* ADD-PATH capability, RFC 7911 */
482
      if (cl % 4)
483
        goto err;
484

    
485
      for (i = 0; i < cl; i += 4)
486
      {
487
        byte val = pos[2+i+3];
488
        if (!val || (val > BGP_ADD_PATH_FULL))
489
        {
490
          log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring",
491
              p->p.name, val);
492
          break;
493
        }
494
      }
495

    
496
      for (i = 0; i < cl; i += 4)
497
      {
498
        af = get_af3(pos+2+i);
499
        ac = bgp_get_af_caps(caps, af);
500
        ac->add_path = pos[2+i+3];
501
      }
502
      break;
503

    
504
    case 70: /* Enhanced route refresh capability, RFC 7313 */
505
      if (cl != 0)
506
        goto err;
507

    
508
      caps->enhanced_refresh = 1;
509
      break;
510

    
511
      /* We can safely ignore all other capabilities */
512
    }
513

    
514
    ADVANCE(pos, len, 2 + cl);
515
  }
516
  return;
517

    
518
err:
519
  bgp_error(conn, 2, 0, NULL, 0);
520
  return;
521
}
522

    
523
static int
524
bgp_read_options(struct bgp_conn *conn, byte *pos, int len)
525
{
526
  struct bgp_proto *p = conn->bgp;
527
  struct bgp_caps *caps;
528
  int ol;
529

    
530
  /* Max number of announced AFIs is limited by max option length (255) */
531
  caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps));
532
  memset(caps, 0, sizeof(struct bgp_caps));
533

    
534
  while (len > 0)
535
  {
536
    if ((len < 2) || (len < (2 + pos[1])))
537
    { bgp_error(conn, 2, 0, NULL, 0); return -1; }
538

    
539
    ol = pos[1];
540
    if (pos[0] == 2)
541
    {
542
      /* BGP capabilities, RFC 5492 */
543
      if (p->cf->capabilities)
544
        bgp_read_capabilities(conn, caps, pos + 2, ol);
545
    }
546
    else
547
    {
548
      /* Unknown option */
549
      bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */
550
      return -1;
551
    }
552

    
553
    ADVANCE(pos, len, 2 + ol);
554
  }
555

    
556
  uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps);
557
  conn->remote_caps = mb_allocz(p->p.pool, n);
558
  memcpy(conn->remote_caps, caps, n);
559

    
560
  return 0;
561
}
562

    
563
/*
 * Fill @buf with the body of an OPEN message.
 *
 * Writes the version, the 2-byte AS field, the hold time and the local
 * router ID. When capabilities are enabled in the configuration, a single
 * Capabilities option (type 2) produced by bgp_write_capabilities() is
 * appended; otherwise an empty conn->local_caps is allocated and the
 * optional parameters length is set to zero.
 *
 * Returns the position just past the written data.
 */
static byte *
bgp_create_open(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
            BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id);

  buf[0] = BGP_VERSION;
  /*
   * NOTE(review): the comparison here is strict (< 0xFFFF), whereas
   * mrt_put_bgp4_hdr() uses <= 0xFFFF -- confirm which one is intended.
   */
  put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS);
  put_u16(buf+3, p->cf->hold_time);
  put_u32(buf+5, p->local_id);

  if (!p->cf->capabilities)
  {
    /* Prepare empty local_caps */
    conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));

    buf[9] = 0;                        /* No optional parameters */
    return buf + 10;
  }

  /* Prepare local_caps and write capabilities to buffer */
  byte *caps_start = buf + 12;
  byte *caps_end = bgp_write_capabilities(conn, caps_start);
  uint caps_len = caps_end - caps_start;

  buf[9] = caps_len + 2;        /* Optional parameters length */
  buf[10] = 2;                        /* Option 2: Capability list */
  buf[11] = caps_len;                /* Option data length */

  return caps_end;
}
599

    
600
static void
601
bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
602
{
603
  struct bgp_proto *p = conn->bgp;
604
  struct bgp_conn *other;
605
  u32 asn, hold, id;
606

    
607
  /* Check state */
608
  if (conn->state != BS_OPENSENT)
609
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
610

    
611
  /* Check message contents */
612
  if (len < 29 || len != 29 + (uint) pkt[28])
613
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
614

    
615
  if (pkt[19] != BGP_VERSION)
616
  { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; }
617

    
618
  asn = get_u16(pkt+20);
619
  hold = get_u16(pkt+22);
620
  id = get_u32(pkt+24);
621
  BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id);
622

    
623
  if (bgp_read_options(conn, pkt+29, pkt[28]) < 0)
624
    return;
625

    
626
  if (hold > 0 && hold < 3)
627
  { bgp_error(conn, 2, 6, pkt+22, 2); return; }
628

    
629
  /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
630
  if (!id || (p->is_internal && id == p->local_id))
631
  { bgp_error(conn, 2, 3, pkt+24, -4); return; }
632

    
633
  struct bgp_caps *caps = conn->remote_caps;
634

    
635
  if (caps->as4_support)
636
  {
637
    u32 as4 = caps->as4_number;
638

    
639
    if ((as4 != asn) && (asn != AS_TRANS))
640
      log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
641

    
642
    if (as4 != p->remote_as)
643
    { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; }
644
  }
645
  else
646
  {
647
    if (asn != p->remote_as)
648
    { bgp_error(conn, 2, 2, pkt+20, 2); return; }
649
  }
650

    
651
  /* Check the other connection */
652
  other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
653
  switch (other->state)
654
  {
655
  case BS_CONNECT:
656
  case BS_ACTIVE:
657
    /* Stop outgoing connection attempts */
658
    bgp_conn_enter_idle_state(other);
659
    break;
660

    
661
  case BS_IDLE:
662
  case BS_OPENSENT:
663
  case BS_CLOSE:
664
    break;
665

    
666
  case BS_OPENCONFIRM:
667
    /*
668
     * Description of collision detection rules in RFC 4271 is confusing and
669
     * contradictory, but it is essentially:
670
     *
671
     * 1. Router with higher ID is dominant
672
     * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
673
     * 3. When both connections are in OpenConfirm state, one initiated by
674
     *    the dominant router is kept.
675
     *
676
     * The first line in the expression below evaluates whether the neighbor
677
     * is dominant, the second line whether the new connection was initiated
678
     * by the neighbor. If both are true (or both are false), we keep the new
679
     * connection, otherwise we keep the old one.
680
     */
681
    if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as)))
682
        == (conn == &p->incoming_conn))
683
    {
684
      /* Should close the other connection */
685
      BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
686
      bgp_error(other, 6, 7, NULL, 0);
687
      break;
688
    }
689
    /* Fall thru */
690
  case BS_ESTABLISHED:
691
    /* Should close this connection */
692
    BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
693
    bgp_error(conn, 6, 7, NULL, 0);
694
    return;
695

    
696
  default:
697
    bug("bgp_rx_open: Unknown state");
698
  }
699

    
700
  /* Update our local variables */
701
  conn->hold_time = MIN(hold, p->cf->hold_time);
702
  conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
703
  conn->as4_session = conn->local_caps->as4_support && caps->as4_support;
704
  conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages;
705
  p->remote_id = id;
706

    
707
  DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
708
      conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session);
709

    
710
  bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
711
  bgp_start_timer(conn->hold_timer, conn->hold_time);
712
  bgp_conn_enter_openconfirm_state(conn);
713
}
714

    
715

    
716
/*
717
 *        Next hop handling
718
 */
719

    
720
#define REPORT(msg, args...) \
721
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })
722

    
723
#define DISCARD(msg, args...) \
724
  ({ REPORT(msg, ## args); return; })
725

    
726
#define WITHDRAW(msg, args...) \
727
  ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })
728

    
729
#define BAD_AFI                "Unexpected AF <%u/%u> in UPDATE"
730
#define BAD_NEXT_HOP        "Invalid NEXT_HOP attribute"
731
#define NO_NEXT_HOP        "Missing NEXT_HOP attribute"
732
#define NO_LABEL_STACK        "Missing MPLS stack"
733

    
734

    
735
static void
736
bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
737
{
738
  struct bgp_proto *p = s->proto;
739
  struct bgp_channel *c = s->channel;
740

    
741
  if (c->cf->gw_mode == GW_DIRECT)
742
  {
743
    neighbor *nbr = NULL;
744

    
745
    /* GW_DIRECT -> single_hop -> p->neigh != NULL */
746
    if (ipa_nonzero(gw))
747
      nbr = neigh_find2(&p->p, &gw, NULL, 0);
748
    else if (ipa_nonzero(ll))
749
      nbr = neigh_find2(&p->p, &ll, p->neigh->iface, 0);
750

    
751
    if (!nbr || (nbr->scope == SCOPE_HOST))
752
      WITHDRAW(BAD_NEXT_HOP);
753

    
754
    a->dest = RTD_UNICAST;
755
    a->nh.gw = nbr->addr;
756
    a->nh.iface = nbr->iface;
757
  }
758
  else /* GW_RECURSIVE */
759
  {
760
    if (ipa_zero(gw))
761
      WITHDRAW(BAD_NEXT_HOP);
762

    
763
    rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6;
764
    s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table);
765

    
766
    if (!s->mpls)
767
      rta_apply_hostentry(a, s->hostentry, NULL);
768

    
769
    /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */
770
  }
771
}
772

    
773
static void
774
bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
775
{
776
  if (lnum > MPLS_MAX_LABEL_STACK)
777
  {
778
    REPORT("Too many MPLS labels ($u)", lnum);
779

    
780
    a->dest = RTD_UNREACHABLE;
781
    a->hostentry = NULL;
782
    a->nh = (struct nexthop) { };
783
    return;
784
  }
785

    
786
  /* Handle implicit NULL as empty MPLS stack */
787
  if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL))
788
    lnum = 0;
789

    
790
  if (s->channel->cf->gw_mode == GW_DIRECT)
791
  {
792
    a->nh.labels = lnum;
793
    memcpy(a->nh.label, labels, 4*lnum);
794
  }
795
  else /* GW_RECURSIVE */
796
  {
797
    mpls_label_stack ms;
798

    
799
    ms.len = lnum;
800
    memcpy(ms.stack, labels, 4*lnum);
801
    rta_apply_hostentry(a, s->hostentry, &ms);
802
  }
803
}
804

    
805

    
806
static inline int
807
bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
808
{
809
  struct bgp_proto *p = s->proto;
810
  ip_addr *nh = (void *) a->u.ptr->data;
811

    
812
  if (s->channel->cf->next_hop_self)
813
    return 0;
814

    
815
  if (s->channel->cf->next_hop_keep)
816
    return 1;
817

    
818
  /* Keep it when explicitly set in export filter */
819
  if (a->type & EAF_FRESH)
820
    return 1;
821

    
822
  /* Keep it when exported to internal peers */
823
  if (p->is_interior && ipa_nonzero(*nh))
824
    return 1;
825

    
826
  /* Keep it when forwarded between single-hop BGPs on the same iface */
827
  struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL;
828
  return p->neigh && (p->neigh->iface == ifa);
829
}
830

    
831
static inline int
832
bgp_use_gateway(struct bgp_export_state *s)
833
{
834
  struct bgp_proto *p = s->proto;
835
  rta *ra = s->route->attrs;
836

    
837
  if (s->channel->cf->next_hop_self)
838
    return 0;
839

    
840
  /* We need one valid global gateway */
841
  if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw))
842
    return 0;
843

    
844
  /* Use it when exported to internal peers */
845
  if (p->is_interior)
846
    return 1;
847

    
848
  /* Use it when forwarded to single-hop BGP peer on on the same iface */
849
  return p->neigh && (p->neigh->iface == ra->nh.iface);
850
}
851

    
852
static void
853
bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
854
{
855
  if (!a || !bgp_use_next_hop(s, a))
856
  {
857
    if (bgp_use_gateway(s))
858
    {
859
      rta *ra = s->route->attrs;
860
      ip_addr nh[1] = { ra->nh.gw };
861
      bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
862

    
863
      if (s->mpls)
864
      {
865
        u32 implicit_null = BGP_MPLS_NULL;
866
        u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null;
867
        uint lnum = ra->nh.labels ? ra->nh.labels : 1;
868
        bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4);
869
      }
870
    }
871
    else
872
    {
873
      ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
874
      bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
875

    
876
      /* TODO: Use local MPLS assigned label */
877
      if (s->mpls)
878
        bgp_unset_attr(to, s->pool, BA_MPLS_LABEL_STACK);
879
    }
880
  }
881

    
882
  /* Check if next hop is valid */
883
  a = bgp_find_attr(*to, BA_NEXT_HOP);
884
  if (!a)
885
    WITHDRAW(NO_NEXT_HOP);
886

    
887
  ip_addr *nh = (void *) a->u.ptr->data;
888
  ip_addr peer = s->proto->cf->remote_ip;
889
  uint len = a->u.ptr->length;
890

    
891
  /* Forbid zero next hop */
892
  if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1])))
893
    WITHDRAW(BAD_NEXT_HOP);
894

    
895
  /* Forbid next hop equal to neighbor IP */
896
  if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
897
    WITHDRAW(BAD_NEXT_HOP);
898

    
899
  /* Forbid next hop with non-matching AF */
900
  if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) &&
901
      !s->channel->ext_next_hop)
902
    WITHDRAW(BAD_NEXT_HOP);
903

    
904
  /* Just check if MPLS stack */
905
  if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK))
906
    WITHDRAW(NO_LABEL_STACK);
907
}
908

    
909
static uint
910
bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
911
{
912
  /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
913
  ip_addr *nh = (void *) a->u.ptr->data;
914
  uint len = a->u.ptr->length;
915

    
916
  ASSERT((len == 16) || (len == 32));
917

    
918
  /*
919
   * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
920
   * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference
921
   * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped
922
   * IPv6 address with IPv6 NLRI.
923
   */
924

    
925
  if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
926
  {
927
    put_ip4(buf, ipa_to_ip4(nh[0]));
928
    return 4;
929
  }
930

    
931
  put_ip6(buf, ipa_to_ip6(nh[0]));
932

    
933
  if (len == 32)
934
    put_ip6(buf+16, ipa_to_ip6(nh[1]));
935

    
936
  return len;
937
}
938

    
939
static void
940
bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a)
941
{
942
  struct bgp_channel *c = s->channel;
943
  struct adata *ad = lp_alloc_adata(s->pool, 32);
944
  ip_addr *nh = (void *) ad->data;
945

    
946
  if (len == 4)
947
  {
948
    nh[0] = ipa_from_ip4(get_ip4(data));
949
    nh[1] = IPA_NONE;
950
  }
951
  else if (len == 16)
952
  {
953
    nh[0] = ipa_from_ip6(get_ip6(data));
954
    nh[1] = IPA_NONE;
955

    
956
    if (ipa_is_link_local(nh[0]))
957
    { nh[1] = nh[0]; nh[0] = IPA_NONE; }
958
  }
959
  else if (len == 32)
960
  {
961
    nh[0] = ipa_from_ip6(get_ip6(data));
962
    nh[1] = ipa_from_ip6(get_ip6(data+16));
963

    
964
    if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
965
      nh[1] = IPA_NONE;
966
  }
967
  else
968
    bgp_parse_error(s, 9);
969

    
970
  if (ipa_zero(nh[1]))
971
    ad->length = 16;
972

    
973
  if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
974
    WITHDRAW(BAD_NEXT_HOP);
975

    
976
  // XXXX validate next hop
977

    
978
  bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
979
  bgp_apply_next_hop(s, a, nh[0], nh[1]);
980
}
981

    
982
static uint
983
bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
984
{
985
  ip_addr *nh = (void *) a->u.ptr->data;
986
  uint len = a->u.ptr->length;
987

    
988
  ASSERT((len == 16) || (len == 32));
989

    
990
  /*
991
   * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
992
   * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference
993
   * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped
994
   * IPv6 address with VPNv6 NLRI.
995
   */
996

    
997
  if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
998
  {
999
    put_u64(buf, 0); /* VPN RD is 0 */
1000
    put_ip4(buf+8, ipa_to_ip4(nh[0]));
1001
    return 12;
1002
  }
1003

    
1004
  put_u64(buf, 0); /* VPN RD is 0 */
1005
  put_ip6(buf+8, ipa_to_ip6(nh[0]));
1006

    
1007
  if (len == 16)
1008
    return 24;
1009

    
1010
  put_u64(buf+24, 0); /* VPN RD is 0 */
1011
  put_ip6(buf+32, ipa_to_ip6(nh[1]));
1012

    
1013
  return 48;
1014
}
1015

    
1016
static void
1017
bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1018
{
1019
  struct bgp_channel *c = s->channel;
1020
  struct adata *ad = lp_alloc_adata(s->pool, 32);
1021
  ip_addr *nh = (void *) ad->data;
1022

    
1023
  if (len == 12)
1024
  {
1025
    nh[0] = ipa_from_ip4(get_ip4(data+8));
1026
    nh[1] = IPA_NONE;
1027
  }
1028
  else if (len == 24)
1029
  {
1030
    nh[0] = ipa_from_ip6(get_ip6(data+8));
1031
    nh[1] = IPA_NONE;
1032

    
1033
    if (ipa_is_link_local(nh[0]))
1034
    { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1035
  }
1036
  else if (len == 48)
1037
  {
1038
    nh[0] = ipa_from_ip6(get_ip6(data+8));
1039
    nh[1] = ipa_from_ip6(get_ip6(data+32));
1040

    
1041
    if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1042
      nh[1] = IPA_NONE;
1043
  }
1044
  else
1045
    bgp_parse_error(s, 9);
1046

    
1047
  if (ipa_zero(nh[1]))
1048
    ad->length = 16;
1049

    
1050
  /* XXXX which error */
1051
  if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0)))
1052
    bgp_parse_error(s, 9);
1053

    
1054
  if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1055
    WITHDRAW(BAD_NEXT_HOP);
1056

    
1057
  // XXXX validate next hop
1058

    
1059
  bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1060
  bgp_apply_next_hop(s, a, nh[0], nh[1]);
1061
}
1062

    
1063

    
1064

    
1065
static uint
1066
bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED)
1067
{
1068
  return 0;
1069
}
1070

    
1071
static void
1072
bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED)
1073
{
1074
  /*
1075
   * Although we expect no next hop and RFC 7606 7.11 states that attribute
1076
   * MP_REACH_NLRI with unexpected next hop length is considered malformed,
1077
   * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt.
1078
   */
1079

    
1080
  return;
1081
}
1082

    
1083
static void
1084
bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to)
1085
{
1086
  /* NEXT_HOP shall not pass */
1087
  if (a)
1088
    bgp_unset_attr(to, s->pool, BA_NEXT_HOP);
1089
}
1090

    
1091

    
1092
/*
1093
 *        UPDATE
1094
 */
1095

    
1096
static void
1097
bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
1098
{
1099
  if (path_id != s->last_id)
1100
  {
1101
    s->last_src = rt_get_source(&s->proto->p, path_id);
1102
    s->last_id = path_id;
1103

    
1104
    rta_free(s->cached_rta);
1105
    s->cached_rta = NULL;
1106
  }
1107

    
1108
  if (!a0)
1109
  {
1110
    /* Route withdraw */
1111
    rte_update2(&s->channel->c, n, NULL, s->last_src);
1112
    return;
1113
  }
1114

    
1115
  /* Prepare cached route attributes */
1116
  if (s->cached_rta == NULL)
1117
  {
1118
    a0->src = s->last_src;
1119

    
1120
    /* Workaround for rta_lookup() breaking eattrs */
1121
    ea_list *ea = a0->eattrs;
1122
    s->cached_rta = rta_lookup(a0);
1123
    a0->eattrs = ea;
1124
  }
1125

    
1126
  rta *a = rta_clone(s->cached_rta);
1127
  rte *e = rte_get_temp(a);
1128

    
1129
  e->pflags = 0;
1130
  e->u.bgp.suppressed = 0;
1131
  rte_update2(&s->channel->c, n, e, s->last_src);
1132
}
1133

    
1134
static void
1135
bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen)
1136
{
1137
  u32 dummy = 0;
1138
  u32 *labels = mpls ? (u32 *) mpls->data : &dummy;
1139
  uint lnum = mpls ? (mpls->length / 4) : 1;
1140

    
1141
  for (uint i = 0; i < lnum; i++)
1142
  {
1143
    put_u24(*pos, labels[i] << 4);
1144
    ADVANCE(*pos, *size, 3);
1145
  }
1146

    
1147
  /* Add bottom-of-stack flag */
1148
  (*pos)[-1] |= BGP_MPLS_BOS;
1149

    
1150
  *pxlen += 24 * lnum;
1151
}
1152

    
1153
static void
1154
bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a)
1155
{
1156
  u32 labels[BGP_MPLS_MAX], label;
1157
  uint lnum = 0;
1158

    
1159
  do {
1160
    if (*pxlen < 24)
1161
      bgp_parse_error(s, 1);
1162

    
1163
    label = get_u24(*pos);
1164
    labels[lnum++] = label >> 4;
1165
    ADVANCE(*pos, *len, 3);
1166
    *pxlen -= 24;
1167

    
1168
    /* Withdraw: Magic label stack value 0x800000 according to RFC 3107, section 3, last paragraph */
1169
    if (!a && !s->err_withdraw && (lnum == 1) && (label == BGP_MPLS_MAGIC))
1170
      break;
1171
  }
1172
  while (!(label & BGP_MPLS_BOS));
1173

    
1174
  if (!a)
1175
    return;
1176

    
1177
  /* Attach MPLS attribute unless we already have one */
1178
  if (!s->mpls_labels)
1179
  {
1180
    s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX);
1181
    bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels);
1182
  }
1183

    
1184
  /* Overwrite data in the attribute */
1185
  s->mpls_labels->length = 4*lnum;
1186
  memcpy(s->mpls_labels->data, labels, 4*lnum);
1187

    
1188
  /* Update next hop entry in rta */
1189
  bgp_apply_mpls_labels(s, a, labels, lnum);
1190

    
1191
  /* Attributes were changed, invalidate cached entry */
1192
  rta_free(s->cached_rta);
1193
  s->cached_rta = NULL;
1194

    
1195
  return;
1196
}
1197

    
1198
static uint
1199
bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1200
{
1201
  byte *pos = buf;
1202

    
1203
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1204
  {
1205
    struct bgp_prefix *px = HEAD(buck->prefixes);
1206
    struct net_addr_ip4 *net = (void *) px->net;
1207

    
1208
    /* Encode path ID */
1209
    if (s->add_path)
1210
    {
1211
      put_u32(pos, px->path_id);
1212
      ADVANCE(pos, size, 4);
1213
    }
1214

    
1215
    /* Encode prefix length */
1216
    *pos = net->pxlen;
1217
    ADVANCE(pos, size, 1);
1218

    
1219
    /* Encode MPLS labels */
1220
    if (s->mpls)
1221
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1222

    
1223
    /* Encode prefix body */
1224
    ip4_addr a = ip4_hton(net->prefix);
1225
    uint b = (net->pxlen + 7) / 8;
1226
    memcpy(pos, &a, b);
1227
    ADVANCE(pos, size, b);
1228

    
1229
    bgp_free_prefix(s->channel, px);
1230
  }
1231

    
1232
  return pos - buf;
1233
}
1234

    
1235
static void
1236
bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1237
{
1238
  while (len)
1239
  {
1240
    net_addr_ip4 net;
1241
    u32 path_id = 0;
1242

    
1243
    /* Decode path ID */
1244
    if (s->add_path)
1245
    {
1246
      if (len < 5)
1247
        bgp_parse_error(s, 1);
1248

    
1249
      path_id = get_u32(pos);
1250
      ADVANCE(pos, len, 4);
1251
    }
1252

    
1253
    /* Decode prefix length */
1254
    uint l = *pos;
1255
    ADVANCE(pos, len, 1);
1256

    
1257
    if (len < ((l + 7) / 8))
1258
      bgp_parse_error(s, 1);
1259

    
1260
    /* Decode MPLS labels */
1261
    if (s->mpls)
1262
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1263

    
1264
    if (l > IP4_MAX_PREFIX_LENGTH)
1265
      bgp_parse_error(s, 10);
1266

    
1267
    /* Decode prefix body */
1268
    ip4_addr addr = IP4_NONE;
1269
    uint b = (l + 7) / 8;
1270
    memcpy(&addr, pos, b);
1271
    ADVANCE(pos, len, b);
1272

    
1273
    net = NET_ADDR_IP4(ip4_ntoh(addr), l);
1274
    net_normalize_ip4(&net);
1275

    
1276
    // XXXX validate prefix
1277

    
1278
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1279
  }
1280
}
1281

    
1282

    
1283
static uint
1284
bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1285
{
1286
  byte *pos = buf;
1287

    
1288
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1289
  {
1290
    struct bgp_prefix *px = HEAD(buck->prefixes);
1291
    struct net_addr_ip6 *net = (void *) px->net;
1292

    
1293
    /* Encode path ID */
1294
    if (s->add_path)
1295
    {
1296
      put_u32(pos, px->path_id);
1297
      ADVANCE(pos, size, 4);
1298
    }
1299

    
1300
    /* Encode prefix length */
1301
    *pos = net->pxlen;
1302
    ADVANCE(pos, size, 1);
1303

    
1304
    /* Encode MPLS labels */
1305
    if (s->mpls)
1306
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1307

    
1308
    /* Encode prefix body */
1309
    ip6_addr a = ip6_hton(net->prefix);
1310
    uint b = (net->pxlen + 7) / 8;
1311
    memcpy(pos, &a, b);
1312
    ADVANCE(pos, size, b);
1313

    
1314
    bgp_free_prefix(s->channel, px);
1315
  }
1316

    
1317
  return pos - buf;
1318
}
1319

    
1320
static void
1321
bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1322
{
1323
  while (len)
1324
  {
1325
    net_addr_ip6 net;
1326
    u32 path_id = 0;
1327

    
1328
    /* Decode path ID */
1329
    if (s->add_path)
1330
    {
1331
      if (len < 5)
1332
        bgp_parse_error(s, 1);
1333

    
1334
      path_id = get_u32(pos);
1335
      ADVANCE(pos, len, 4);
1336
    }
1337

    
1338
    /* Decode prefix length */
1339
    uint l = *pos;
1340
    ADVANCE(pos, len, 1);
1341

    
1342
    if (len < ((l + 7) / 8))
1343
      bgp_parse_error(s, 1);
1344

    
1345
    /* Decode MPLS labels */
1346
    if (s->mpls)
1347
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1348

    
1349
    if (l > IP6_MAX_PREFIX_LENGTH)
1350
      bgp_parse_error(s, 10);
1351

    
1352
    /* Decode prefix body */
1353
    ip6_addr addr = IP6_NONE;
1354
    uint b = (l + 7) / 8;
1355
    memcpy(&addr, pos, b);
1356
    ADVANCE(pos, len, b);
1357

    
1358
    net = NET_ADDR_IP6(ip6_ntoh(addr), l);
1359
    net_normalize_ip6(&net);
1360

    
1361
    // XXXX validate prefix
1362

    
1363
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1364
  }
1365
}
1366

    
1367
static uint
1368
bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1369
{
1370
  byte *pos = buf;
1371

    
1372
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1373
  {
1374
    struct bgp_prefix *px = HEAD(buck->prefixes);
1375
    struct net_addr_vpn4 *net = (void *) px->net;
1376

    
1377
    /* Encode path ID */
1378
    if (s->add_path)
1379
    {
1380
      put_u32(pos, px->path_id);
1381
      ADVANCE(pos, size, 4);
1382
    }
1383

    
1384
    /* Encode prefix length */
1385
    *pos = 64 + net->pxlen;
1386
    ADVANCE(pos, size, 1);
1387

    
1388
    /* Encode MPLS labels */
1389
    if (s->mpls)
1390
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1391

    
1392
    /* Encode route distinguisher */
1393
    put_u64(pos, net->rd);
1394
    ADVANCE(pos, size, 8);
1395

    
1396
    /* Encode prefix body */
1397
    ip4_addr a = ip4_hton(net->prefix);
1398
    uint b = (net->pxlen + 7) / 8;
1399
    memcpy(pos, &a, b);
1400
    ADVANCE(pos, size, b);
1401

    
1402
    bgp_free_prefix(s->channel, px);
1403
  }
1404

    
1405
  return pos - buf;
1406
}
1407

    
1408
static void
1409
bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1410
{
1411
  while (len)
1412
  {
1413
    net_addr_vpn4 net;
1414
    u32 path_id = 0;
1415

    
1416
    /* Decode path ID */
1417
    if (s->add_path)
1418
    {
1419
      if (len < 5)
1420
        bgp_parse_error(s, 1);
1421

    
1422
      path_id = get_u32(pos);
1423
      ADVANCE(pos, len, 4);
1424
    }
1425

    
1426
    /* Decode prefix length */
1427
    uint l = *pos;
1428
    ADVANCE(pos, len, 1);
1429

    
1430
    if (len < ((l + 7) / 8))
1431
      bgp_parse_error(s, 1);
1432

    
1433
    /* Decode MPLS labels */
1434
    if (s->mpls)
1435
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1436

    
1437
    /* Decode route distinguisher */
1438
    if (l < 64)
1439
      bgp_parse_error(s, 1);
1440

    
1441
    u64 rd = get_u64(pos);
1442
    ADVANCE(pos, len, 8);
1443
    l -= 64;
1444

    
1445
    if (l > IP4_MAX_PREFIX_LENGTH)
1446
      bgp_parse_error(s, 10);
1447

    
1448
    /* Decode prefix body */
1449
    ip4_addr addr = IP4_NONE;
1450
    uint b = (l + 7) / 8;
1451
    memcpy(&addr, pos, b);
1452
    ADVANCE(pos, len, b);
1453

    
1454
    net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd);
1455
    net_normalize_vpn4(&net);
1456

    
1457
    // XXXX validate prefix
1458

    
1459
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1460
  }
1461
}
1462

    
1463

    
1464
static uint
1465
bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1466
{
1467
  byte *pos = buf;
1468

    
1469
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1470
  {
1471
    struct bgp_prefix *px = HEAD(buck->prefixes);
1472
    struct net_addr_vpn6 *net = (void *) px->net;
1473

    
1474
    /* Encode path ID */
1475
    if (s->add_path)
1476
    {
1477
      put_u32(pos, px->path_id);
1478
      ADVANCE(pos, size, 4);
1479
    }
1480

    
1481
    /* Encode prefix length */
1482
    *pos = 64 + net->pxlen;
1483
    ADVANCE(pos, size, 1);
1484

    
1485
    /* Encode MPLS labels */
1486
    if (s->mpls)
1487
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1488

    
1489
    /* Encode route distinguisher */
1490
    put_u64(pos, net->rd);
1491
    ADVANCE(pos, size, 8);
1492

    
1493
    /* Encode prefix body */
1494
    ip6_addr a = ip6_hton(net->prefix);
1495
    uint b = (net->pxlen + 7) / 8;
1496
    memcpy(pos, &a, b);
1497
    ADVANCE(pos, size, b);
1498

    
1499
    bgp_free_prefix(s->channel, px);
1500
  }
1501

    
1502
  return pos - buf;
1503
}
1504

    
1505
static void
1506
bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1507
{
1508
  while (len)
1509
  {
1510
    net_addr_vpn6 net;
1511
    u32 path_id = 0;
1512

    
1513
    /* Decode path ID */
1514
    if (s->add_path)
1515
    {
1516
      if (len < 5)
1517
        bgp_parse_error(s, 1);
1518

    
1519
      path_id = get_u32(pos);
1520
      ADVANCE(pos, len, 4);
1521
    }
1522

    
1523
    /* Decode prefix length */
1524
    uint l = *pos;
1525
    ADVANCE(pos, len, 1);
1526

    
1527
    if (len < ((l + 7) / 8))
1528
      bgp_parse_error(s, 1);
1529

    
1530
    /* Decode MPLS labels */
1531
    if (s->mpls)
1532
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1533

    
1534
    /* Decode route distinguisher */
1535
    if (l < 64)
1536
      bgp_parse_error(s, 1);
1537

    
1538
    u64 rd = get_u64(pos);
1539
    ADVANCE(pos, len, 8);
1540
    l -= 64;
1541

    
1542
    if (l > IP6_MAX_PREFIX_LENGTH)
1543
      bgp_parse_error(s, 10);
1544

    
1545
    /* Decode prefix body */
1546
    ip6_addr addr = IP6_NONE;
1547
    uint b = (l + 7) / 8;
1548
    memcpy(&addr, pos, b);
1549
    ADVANCE(pos, len, b);
1550

    
1551
    net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd);
1552
    net_normalize_vpn6(&net);
1553

    
1554
    // XXXX validate prefix
1555

    
1556
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1557
  }
1558
}
1559

    
1560

    
1561
static uint
1562
bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1563
{
1564
  byte *pos = buf;
1565

    
1566
  while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1567
  {
1568
    struct bgp_prefix *px = HEAD(buck->prefixes);
1569
    struct net_addr_flow4 *net = (void *) px->net;
1570
    uint flen = net->length - sizeof(net_addr_flow4);
1571

    
1572
    /* Encode path ID */
1573
    if (s->add_path)
1574
    {
1575
      put_u32(pos, px->path_id);
1576
      ADVANCE(pos, size, 4);
1577
    }
1578

    
1579
    if (flen > size)
1580
      break;
1581

    
1582
    /* Copy whole flow data including length */
1583
    memcpy(pos, net->data, flen);
1584
    ADVANCE(pos, size, flen);
1585

    
1586
    bgp_free_prefix(s->channel, px);
1587
  }
1588

    
1589
  return pos - buf;
1590
}
1591

    
1592
static void
1593
bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1594
{
1595
  while (len)
1596
  {
1597
    u32 path_id = 0;
1598

    
1599
    /* Decode path ID */
1600
    if (s->add_path)
1601
    {
1602
      if (len < 4)
1603
        bgp_parse_error(s, 1);
1604

    
1605
      path_id = get_u32(pos);
1606
      ADVANCE(pos, len, 4);
1607
    }
1608

    
1609
    if (len < 2)
1610
      bgp_parse_error(s, 1);
1611

    
1612
    /* Decode flow length */
1613
    uint hlen = flow_hdr_length(pos);
1614
    uint dlen = flow_read_length(pos);
1615
    uint flen = hlen + dlen;
1616
    byte *data = pos + hlen;
1617

    
1618
    if (len < flen)
1619
      bgp_parse_error(s, 1);
1620

    
1621
    /* Validate flow data */
1622
    enum flow_validated_state r = flow4_validate(data, dlen);
1623
    if (r != FLOW_ST_VALID)
1624
    {
1625
      log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1626
      bgp_parse_error(s, 1);
1627
    }
1628

    
1629
    if (data[0] != FLOW_TYPE_DST_PREFIX)
1630
    {
1631
      log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1632
      bgp_parse_error(s, 1);
1633
    }
1634

    
1635
    /* Decode dst prefix */
1636
    ip4_addr px = IP4_NONE;
1637
    uint pxlen = data[1];
1638

    
1639
    // FIXME: Use some generic function
1640
    memcpy(&px, data, BYTES(pxlen));
1641
    px = ip4_and(px, ip4_mkmask(pxlen));
1642

    
1643
    /* Prepare the flow */
1644
    net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen);
1645
    net_fill_flow4(n, px, pxlen, pos, flen);
1646
    ADVANCE(pos, len, flen);
1647

    
1648
    bgp_rte_update(s, n, path_id, a);
1649
  }
1650
}
1651

    
1652

    
1653
static uint
1654
bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1655
{
1656
  byte *pos = buf;
1657

    
1658
  while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1659
  {
1660
    struct bgp_prefix *px = HEAD(buck->prefixes);
1661
    struct net_addr_flow6 *net = (void *) px->net;
1662
    uint flen = net->length - sizeof(net_addr_flow6);
1663

    
1664
    /* Encode path ID */
1665
    if (s->add_path)
1666
    {
1667
      put_u32(pos, px->path_id);
1668
      ADVANCE(pos, size, 4);
1669
    }
1670

    
1671
    if (flen > size)
1672
      break;
1673

    
1674
    /* Copy whole flow data including length */
1675
    memcpy(pos, net->data, flen);
1676
    ADVANCE(pos, size, flen);
1677

    
1678
    bgp_free_prefix(s->channel, px);
1679
  }
1680

    
1681
  return pos - buf;
1682
}
1683

    
1684
static void
1685
bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1686
{
1687
  while (len)
1688
  {
1689
    u32 path_id = 0;
1690

    
1691
    /* Decode path ID */
1692
    if (s->add_path)
1693
    {
1694
      if (len < 4)
1695
        bgp_parse_error(s, 1);
1696

    
1697
      path_id = get_u32(pos);
1698
      ADVANCE(pos, len, 4);
1699
    }
1700

    
1701
    if (len < 2)
1702
      bgp_parse_error(s, 1);
1703

    
1704
    /* Decode flow length */
1705
    uint hlen = flow_hdr_length(pos);
1706
    uint dlen = flow_read_length(pos);
1707
    uint flen = hlen + dlen;
1708
    byte *data = pos + hlen;
1709

    
1710
    if (len < flen)
1711
      bgp_parse_error(s, 1);
1712

    
1713
    /* Validate flow data */
1714
    enum flow_validated_state r = flow6_validate(data, dlen);
1715
    if (r != FLOW_ST_VALID)
1716
    {
1717
      log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1718
      bgp_parse_error(s, 1);
1719
    }
1720

    
1721
    if (data[0] != FLOW_TYPE_DST_PREFIX)
1722
    {
1723
      log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1724
      bgp_parse_error(s, 1);
1725
    }
1726

    
1727
    /* Decode dst prefix */
1728
    ip6_addr px = IP6_NONE;
1729
    uint pxlen = data[1];
1730

    
1731
    // FIXME: Use some generic function
1732
    memcpy(&px, data, BYTES(pxlen));
1733
    px = ip6_and(px, ip6_mkmask(pxlen));
1734

    
1735
    /* Prepare the flow */
1736
    net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen);
1737
    net_fill_flow6(n, px, pxlen, pos, flen);
1738
    ADVANCE(pos, len, flen);
1739

    
1740
    bgp_rte_update(s, n, path_id, a);
1741
  }
1742
}
1743

    
1744

    
1745
static const struct bgp_af_desc bgp_af_table[] = {
1746
  {
1747
    .afi = BGP_AF_IPV4,
1748
    .net = NET_IP4,
1749
    .name = "ipv4",
1750
    .encode_nlri = bgp_encode_nlri_ip4,
1751
    .decode_nlri = bgp_decode_nlri_ip4,
1752
    .encode_next_hop = bgp_encode_next_hop_ip,
1753
    .decode_next_hop = bgp_decode_next_hop_ip,
1754
    .update_next_hop = bgp_update_next_hop_ip,
1755
  },
1756
  {
1757
    .afi = BGP_AF_IPV4_MC,
1758
    .net = NET_IP4,
1759
    .name = "ipv4-mc",
1760
    .encode_nlri = bgp_encode_nlri_ip4,
1761
    .decode_nlri = bgp_decode_nlri_ip4,
1762
    .encode_next_hop = bgp_encode_next_hop_ip,
1763
    .decode_next_hop = bgp_decode_next_hop_ip,
1764
    .update_next_hop = bgp_update_next_hop_ip,
1765
  },
1766
  {
1767
    .afi = BGP_AF_IPV4_MPLS,
1768
    .net = NET_IP4,
1769
    .mpls = 1,
1770
    .name = "ipv4-mpls",
1771
    .encode_nlri = bgp_encode_nlri_ip4,
1772
    .decode_nlri = bgp_decode_nlri_ip4,
1773
    .encode_next_hop = bgp_encode_next_hop_ip,
1774
    .decode_next_hop = bgp_decode_next_hop_ip,
1775
    .update_next_hop = bgp_update_next_hop_ip,
1776
  },
1777
  {
1778
    .afi = BGP_AF_IPV6,
1779
    .net = NET_IP6,
1780
    .name = "ipv6",
1781
    .encode_nlri = bgp_encode_nlri_ip6,
1782
    .decode_nlri = bgp_decode_nlri_ip6,
1783
    .encode_next_hop = bgp_encode_next_hop_ip,
1784
    .decode_next_hop = bgp_decode_next_hop_ip,
1785
    .update_next_hop = bgp_update_next_hop_ip,
1786
  },
1787
  {
1788
    .afi = BGP_AF_IPV6_MC,
1789
    .net = NET_IP6,
1790
    .name = "ipv6-mc",
1791
    .encode_nlri = bgp_encode_nlri_ip6,
1792
    .decode_nlri = bgp_decode_nlri_ip6,
1793
    .encode_next_hop = bgp_encode_next_hop_ip,
1794
    .decode_next_hop = bgp_decode_next_hop_ip,
1795
    .update_next_hop = bgp_update_next_hop_ip,
1796
  },
1797
  {
1798
    .afi = BGP_AF_IPV6_MPLS,
1799
    .net = NET_IP6,
1800
    .mpls = 1,
1801
    .name = "ipv6-mpls",
1802
    .encode_nlri = bgp_encode_nlri_ip6,
1803
    .decode_nlri = bgp_decode_nlri_ip6,
1804
    .encode_next_hop = bgp_encode_next_hop_ip,
1805
    .decode_next_hop = bgp_decode_next_hop_ip,
1806
    .update_next_hop = bgp_update_next_hop_ip,
1807
  },
1808
  {
1809
    .afi = BGP_AF_VPN4_MPLS,
1810
    .net = NET_VPN4,
1811
    .mpls = 1,
1812
    .name = "vpn4-mpls",
1813
    .encode_nlri = bgp_encode_nlri_vpn4,
1814
    .decode_nlri = bgp_decode_nlri_vpn4,
1815
    .encode_next_hop = bgp_encode_next_hop_vpn,
1816
    .decode_next_hop = bgp_decode_next_hop_vpn,
1817
    .update_next_hop = bgp_update_next_hop_ip,
1818
  },
1819
  {
1820
    .afi = BGP_AF_VPN6_MPLS,
1821
    .net = NET_VPN6,
1822
    .mpls = 1,
1823
    .name = "vpn6-mpls",
1824
    .encode_nlri = bgp_encode_nlri_vpn6,
1825
    .decode_nlri = bgp_decode_nlri_vpn6,
1826
    .encode_next_hop = bgp_encode_next_hop_vpn,
1827
    .decode_next_hop = bgp_decode_next_hop_vpn,
1828
    .update_next_hop = bgp_update_next_hop_ip,
1829
  },
1830
  {
1831
    .afi = BGP_AF_VPN4_MC,
1832
    .net = NET_VPN4,
1833
    .name = "vpn4-mc",
1834
    .encode_nlri = bgp_encode_nlri_vpn4,
1835
    .decode_nlri = bgp_decode_nlri_vpn4,
1836
    .encode_next_hop = bgp_encode_next_hop_vpn,
1837
    .decode_next_hop = bgp_decode_next_hop_vpn,
1838
    .update_next_hop = bgp_update_next_hop_ip,
1839
  },
1840
  {
1841
    .afi = BGP_AF_VPN6_MC,
1842
    .net = NET_VPN6,
1843
    .name = "vpn6-mc",
1844
    .encode_nlri = bgp_encode_nlri_vpn6,
1845
    .decode_nlri = bgp_decode_nlri_vpn6,
1846
    .encode_next_hop = bgp_encode_next_hop_vpn,
1847
    .decode_next_hop = bgp_decode_next_hop_vpn,
1848
    .update_next_hop = bgp_update_next_hop_ip,
1849
  },
1850
  {
1851
    .afi = BGP_AF_FLOW4,
1852
    .net = NET_FLOW4,
1853
    .no_igp = 1,
1854
    .name = "flow4",
1855
    .encode_nlri = bgp_encode_nlri_flow4,
1856
    .decode_nlri = bgp_decode_nlri_flow4,
1857
    .encode_next_hop = bgp_encode_next_hop_none,
1858
    .decode_next_hop = bgp_decode_next_hop_none,
1859
    .update_next_hop = bgp_update_next_hop_none,
1860
  },
1861
  {
1862
    .afi = BGP_AF_FLOW6,
1863
    .net = NET_FLOW6,
1864
    .no_igp = 1,
1865
    .name = "flow6",
1866
    .encode_nlri = bgp_encode_nlri_flow6,
1867
    .decode_nlri = bgp_decode_nlri_flow6,
1868
    .encode_next_hop = bgp_encode_next_hop_none,
1869
    .decode_next_hop = bgp_decode_next_hop_none,
1870
    .update_next_hop = bgp_update_next_hop_none,
1871
  },
1872
};
1873

    
1874
const struct bgp_af_desc *
1875
bgp_get_af_desc(u32 afi)
1876
{
1877
  uint i;
1878
  for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++)
1879
    if (bgp_af_table[i].afi == afi)
1880
      return &bgp_af_table[i];
1881

    
1882
  return NULL;
1883
}
1884

    
1885
static inline uint
1886
bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
1887
{
1888
  return s->channel->desc->encode_nlri(s, buck, buf, end - buf);
1889
}
1890

    
1891
static inline uint
1892
bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf)
1893
{
1894
  return s->channel->desc->encode_next_hop(s, nh, buf, 255);
1895
}
1896

    
1897
void
1898
bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to)
1899
{
1900
  s->channel->desc->update_next_hop(s, a, to);
1901
}
1902

    
1903
#define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
1904

    
1905
/*
 * Build the body of an UPDATE announcing routes of @buck using plain IPv4
 * NLRI. Returns the end of the written data, or NULL when the attributes do
 * not fit; in that case the bucket is withdrawn.
 */
static byte *
bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   * Layout:
   *        2 B        Withdrawn Routes Length (zero)
   *        ---        IPv4 Withdrawn Routes NLRI (unused)
   *        2 B        Total Path Attribute Length
   *        var        Path Attributes
   *        var        IPv4 Network Layer Reachability Information
   */

  int attrs_len = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH);

  /* Attribute list too long */
  if (attrs_len < 0)
  {
    bgp_withdraw_bucket(s->channel, buck);
    return NULL;
  }

  put_u16(buf+0, 0);
  put_u16(buf+2, attrs_len);

  /* NLRI follows right after the attributes */
  byte *nlri = buf + 4 + attrs_len;
  int nlri_len = bgp_encode_nlri(s, buck, nlri, end);

  return nlri + nlri_len;
}
1933

    
1934
/*
 * Build the body of an UPDATE announcing routes of @buck inside an
 * MP_REACH_NLRI attribute. Returns the end of the written data, or NULL when
 * the attributes do not fit; in that case the bucket is withdrawn.
 */
static byte *
bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   * Layout:
   *        2 B        IPv4 Withdrawn Routes Length (zero)
   *        ---        IPv4 Withdrawn Routes NLRI (unused)
   *        2 B        Total Path Attribute Length
   *        1 B        MP_REACH_NLRI hdr - Attribute Flags
   *        1 B        MP_REACH_NLRI hdr - Attribute Type Code
   *        2 B        MP_REACH_NLRI hdr - Length of Attribute Data
   *        2 B        MP_REACH_NLRI data - Address Family Identifier
   *        1 B        MP_REACH_NLRI data - Subsequent Address Family Identifier
   *        1 B        MP_REACH_NLRI data - Length of Next Hop Network Address
   *        var        MP_REACH_NLRI data - Network Address of Next Hop
   *        1 B        MP_REACH_NLRI data - Reserved (zero)
   *        var        MP_REACH_NLRI data - Network Layer Reachability Information
   *        var        Rest of Path Attributes
   *        ---        IPv4 Network Layer Reachability Information (unused)
   */

  /* MP_REACH_NLRI attribute header; the data length is filled in later */
  buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
  buf[5] = BA_MP_REACH_NLRI;
  put_u16(buf+6, 0);
  put_af3(buf+8, s->channel->afi);

  /* The other attributes are first encoded into a temporary buffer */
  byte *scratch = alloca(MAX_ATTRS_LENGTH);
  int attrs_len = bgp_encode_attrs(s, buck->eattrs, scratch, scratch + MAX_ATTRS_LENGTH);

  /* Attribute list too long */
  if (attrs_len < 0)
  {
    bgp_withdraw_bucket(s->channel, buck);
    return NULL;
  }

  /* Next hop: one length byte followed by the encoded address */
  byte *pos = buf+11;
  int nh_len = bgp_encode_next_hop(s, s->mp_next_hop, pos+1);
  pos[0] = nh_len;
  pos += 1 + nh_len;

  /* Reserved field */
  *pos = 0;
  pos++;

  /* NLRI, leaving room for the attributes copied behind it */
  int nlri_len = bgp_encode_nlri(s, buck, pos, end - attrs_len);
  pos += nlri_len;

  /* End of MP_REACH_NLRI attribute, fix up its data length */
  put_u16(buf+6, pos-buf-8);

  /* Append the remaining attributes */
  memcpy(pos, scratch, attrs_len);
  pos += attrs_len;

  /* Initial UPDATE fields */
  put_u16(buf+0, 0);
  put_u16(buf+2, pos-buf-4);

  return pos;
}
1998

    
1999
#undef MAX_ATTRS_LENGTH
2000

    
2001
/*
 * Build the body of an UPDATE withdrawing routes of @buck using the plain
 * IPv4 Withdrawn Routes field. Returns the end of the written data.
 */
static byte *
bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   * Layout:
   *        2 B        Withdrawn Routes Length
   *        var        IPv4 Withdrawn Routes NLRI
   *        2 B        Total Path Attribute Length (zero)
   *        ---        Path Attributes (unused)
   *        ---        IPv4 Network Layer Reachability Information (unused)
   */

  byte *nlri = buf + 2;
  uint nlri_len = bgp_encode_nlri(s, buck, nlri, end);

  /* Withdrawn length in front, zero attribute length behind the NLRI */
  put_u16(buf+0, nlri_len);
  put_u16(nlri + nlri_len, 0);

  return buf + 4 + nlri_len;
}
2019

    
2020
/*
 * Build the body of an UPDATE withdrawing routes of @buck inside an
 * MP_UNREACH_NLRI attribute. Returns the end of the written data.
 */
static byte *
bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   * Layout:
   *        2 B        Withdrawn Routes Length (zero)
   *        ---        IPv4 Withdrawn Routes NLRI (unused)
   *        2 B        Total Path Attribute Length
   *        1 B        MP_UNREACH_NLRI hdr - Attribute Flags
   *        1 B        MP_UNREACH_NLRI hdr - Attribute Type Code
   *        2 B        MP_UNREACH_NLRI hdr - Length of Attribute Data
   *        2 B        MP_UNREACH_NLRI data - Address Family Identifier
   *        1 B        MP_UNREACH_NLRI data - Subsequent Address Family Identifier
   *        var        MP_UNREACH_NLRI data - Network Layer Reachability Information
   *        ---        IPv4 Network Layer Reachability Information (unused)
   */

  /* NLRI is encoded first, as the header fields depend on its length */
  byte *nlri = buf + 11;
  uint nlri_len = bgp_encode_nlri(s, buck, nlri, end);

  /* UPDATE header: no IPv4 withdraws, attribute length = 7 B header + NLRI */
  put_u16(buf+0, 0);
  put_u16(buf+2, 7 + nlri_len);

  /* MP_UNREACH_NLRI attribute header */
  buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
  buf[5] = BA_MP_UNREACH_NLRI;
  put_u16(buf+6, 3 + nlri_len);
  put_af3(buf+8, s->channel->afi);

  return nlri + nlri_len;
}
2049

    
2050
/*
 * Create the body of the next UPDATE message for channel @c in @buf.
 * Pending withdraws are sent first, then the first bucket from the bucket
 * queue. Returns the end of the written data, or NULL when there is nothing
 * more to send.
 */
static byte *
bgp_create_update(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH);
  struct bgp_bucket *buck;
  byte *res = NULL;

again: ;

  /* The write state is set up anew for every attempt */
  struct bgp_write_state s = {
    .proto = p,
    .channel = c,
    .pool = bgp_linpool,
    .as4_session = p->as4_session,
    .add_path = c->add_path_tx,
    .mpls = c->desc->mpls,
  };

  /* Pending withdraws go first */
  buck = c->withdraw_bucket;
  if (buck && !EMPTY_LIST(buck->prefixes))
  {
    if ((c->afi == BGP_AF_IPV4) && !c->ext_next_hop)
      res = bgp_create_ip_unreach(&s, buck, buf, end);
    else
      res = bgp_create_mp_unreach(&s, buck, buf, end);

    goto done;
  }

  /* No more prefixes to send */
  if (EMPTY_LIST(c->bucket_queue))
    return NULL;

  /* Take the first reachable bucket */
  buck = HEAD(c->bucket_queue);

  /* Empty buckets are just cleaned up */
  if (EMPTY_LIST(buck->prefixes))
  {
    bgp_free_bucket(c, buck);
    goto again;
  }

  if ((c->afi == BGP_AF_IPV4) && !c->ext_next_hop)
    res = bgp_create_ip_reach(&s, buck, buf, end);
  else
    res = bgp_create_mp_reach(&s, buck, buf, end);

  /* A drained bucket is freed, otherwise it is deferred */
  if (EMPTY_LIST(buck->prefixes))
    bgp_free_bucket(c, buck);
  else
    bgp_defer_bucket(c, buck);

  /* Nothing was produced for this bucket, try again */
  if (!res)
    goto again;

done:
  BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
  lp_flush(s.pool);

  return res;
}
2116

    
2117
static byte *
2118
bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf)
2119
{
2120
  /* Empty update packet */
2121
  put_u32(buf, 0);
2122

    
2123
  return buf+4;
2124
}
2125

    
2126
/*
 * Write an End-of-RIB marker in the multiprotocol form: an UPDATE body with
 * an empty MP_UNREACH_NLRI attribute for the address family of @c.
 * Returns the end of the written data.
 */
static byte *
bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf)
{
  /* No withdrawn routes; attributes occupy bytes 4--9 */
  put_u16(buf+0, 0);
  put_u16(buf+2, 6);

  /* Empty MP_UNREACH_NLRI attribute, its data occupy bytes 7--9 */
  buf[4] = BAF_OPTIONAL;
  buf[5] = BA_MP_UNREACH_NLRI;
  buf[6] = 3;
  put_af3(buf+7, c->afi);

  byte *tail = buf + 10;
  return tail;
}
2140

    
2141
/*
 * Build an End-of-RIB marker for channel @c in @buf. The plain IPv4 form is
 * used for BGP_AF_IPV4, the multiprotocol form for every other AFI/SAFI.
 * Returns the position just behind the written data.
 */
static byte *
bgp_create_end_mark(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");

  if (c->afi == BGP_AF_IPV4)
    return bgp_create_ip_end_mark(c, buf);

  return bgp_create_mp_end_mark(c, buf);
}
2152

    
2153
static inline void
2154
bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi)
2155
{
2156
  struct bgp_proto *p = s->proto;
2157
  struct bgp_channel *c = bgp_get_channel(p, afi);
2158

    
2159
  BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
2160

    
2161
  if (!c)
2162
    DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2163

    
2164
  if (c->load_state == BFS_LOADING)
2165
    c->load_state = BFS_NONE;
2166

    
2167
  if (p->p.gr_recovery)
2168
    channel_graceful_restart_unlock(&c->c);
2169

    
2170
  if (c->gr_active)
2171
    bgp_graceful_restart_done(c);
2172
}
2173

    
2174
static inline void
2175
bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len)
2176
{
2177
  struct bgp_channel *c = bgp_get_channel(s->proto, afi);
2178
  rta *a = NULL;
2179

    
2180
  if (!c)
2181
    DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2182

    
2183
  s->channel = c;
2184
  s->add_path = c->add_path_rx;
2185
  s->mpls = c->desc->mpls;
2186

    
2187
  s->last_id = 0;
2188
  s->last_src = s->proto->p.main_source;
2189

    
2190
  /*
2191
   * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
2192
   * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
2193
   * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
2194
   * decode_next_hop hooks) by restoring a->eattrs afterwards.
2195
   */
2196

    
2197
  if (ea)
2198
  {
2199
    a = allocz(RTA_MAX_SIZE);
2200

    
2201
    a->source = RTS_BGP;
2202
    a->scope = SCOPE_UNIVERSE;
2203
    a->from = s->proto->cf->remote_ip;
2204
    a->eattrs = ea;
2205

    
2206
    c->desc->decode_next_hop(s, nh, nh_len, a);
2207

    
2208
    /* Handle withdraw during next hop decoding */
2209
    if (s->err_withdraw)
2210
      a = NULL;
2211
  }
2212

    
2213
  c->desc->decode_nlri(s, nlri, len, a);
2214

    
2215
  rta_free(s->cached_rta);
2216
  s->cached_rta = NULL;
2217
}
2218

    
2219
static void
2220
bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
2221
{
2222
  struct bgp_proto *p = conn->bgp;
2223
  ea_list *ea = NULL;
2224

    
2225
  BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
2226

    
2227
  /* Workaround for some BGP implementations that skip initial KEEPALIVE */
2228
  if (conn->state == BS_OPENCONFIRM)
2229
    bgp_conn_enter_established_state(conn);
2230

    
2231
  if (conn->state != BS_ESTABLISHED)
2232
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2233

    
2234
  bgp_start_timer(conn->hold_timer, conn->hold_time);
2235

    
2236
  /* Initialize parse state */
2237
  struct bgp_parse_state s = {
2238
    .proto = p,
2239
    .pool = bgp_linpool,
2240
    .as4_session = p->as4_session,
2241
  };
2242

    
2243
  /* Parse error handler */
2244
  if (setjmp(s.err_jmpbuf))
2245
  {
2246
    bgp_error(conn, 3, s.err_subcode, NULL, 0);
2247
    goto done;
2248
  }
2249

    
2250
  /* Check minimal length */
2251
  if (len < 23)
2252
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2253

    
2254
  /* Skip fixed header */
2255
  uint pos = 19;
2256

    
2257
  /*
2258
   *        UPDATE message format
2259
   *
2260
   *        2 B        IPv4 Withdrawn Routes Length
2261
   *        var        IPv4 Withdrawn Routes NLRI
2262
   *        2 B        Total Path Attribute Length
2263
   *        var        Path Attributes
2264
   *        var        IPv4 Reachable Routes NLRI
2265
   */
2266

    
2267
  s.ip_unreach_len = get_u16(pkt + pos);
2268
  s.ip_unreach_nlri = pkt + pos + 2;
2269
  pos += 2 + s.ip_unreach_len;
2270

    
2271
  if (pos + 2 > len)
2272
    bgp_parse_error(&s, 1);
2273

    
2274
  s.attr_len = get_u16(pkt + pos);
2275
  s.attrs = pkt + pos + 2;
2276
  pos += 2 + s.attr_len;
2277

    
2278
  if (pos > len)
2279
    bgp_parse_error(&s, 1);
2280

    
2281
  s.ip_reach_len = len - pos;
2282
  s.ip_reach_nlri = pkt + pos;
2283

    
2284

    
2285
  if (s.attr_len)
2286
    ea = bgp_decode_attrs(&s, s.attrs, s.attr_len);
2287

    
2288
  /* Check for End-of-RIB marker */
2289
  if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len)
2290
  { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; }
2291

    
2292
  /* Check for MP End-of-RIB marker */
2293
  if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len &&
2294
      !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af)
2295
  { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; }
2296

    
2297
  if (s.ip_unreach_len)
2298
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0);
2299

    
2300
  if (s.mp_unreach_len)
2301
    bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0);
2302

    
2303
  if (s.ip_reach_len)
2304
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len,
2305
                    ea, s.ip_next_hop_data, s.ip_next_hop_len);
2306

    
2307
  if (s.mp_reach_len)
2308
    bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len,
2309
                    ea, s.mp_next_hop_data, s.mp_next_hop_len);
2310

    
2311
done:
2312
  rta_free(s.cached_rta);
2313
  lp_flush(s.pool);
2314
  return;
2315
}
2316

    
2317

    
2318
/*
2319
 *        ROUTE-REFRESH
2320
 */
2321

    
2322
/*
 * Build the body of a ROUTE-REFRESH request for channel @c in @buf.
 * Returns the position just behind the four written bytes.
 */
static inline byte *
bgp_create_route_refresh(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *tail = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");

  /* Original route refresh request, RFC 2918 */
  put_af4(buf, c->afi);

  /* The subtype is stored last, into byte 2 */
  buf[2] = BGP_RR_REQUEST;

  return tail;
}
2335

    
2336
/*
 * Build the body of a Begin-of-Route-Refresh message for channel @c in @buf.
 * Returns the position just behind the four written bytes.
 */
static inline byte *
bgp_create_begin_refresh(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *tail = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");

  /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
  put_af4(buf, c->afi);

  /* The subtype is stored last, into byte 2 */
  buf[2] = BGP_RR_BEGIN;

  return tail;
}
2349

    
2350
/*
 * Build the body of an End-of-Route-Refresh message for channel @c in @buf.
 * Returns the position just behind the four written bytes.
 */
static inline byte *
bgp_create_end_refresh(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *tail = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RR");

  /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
  put_af4(buf, c->afi);

  /* The subtype is stored last, into byte 2 */
  buf[2] = BGP_RR_END;

  return tail;
}
2363

    
2364
static void
2365
bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
2366
{
2367
  struct bgp_proto *p = conn->bgp;
2368

    
2369
  if (conn->state != BS_ESTABLISHED)
2370
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2371

    
2372
  if (!conn->local_caps->route_refresh)
2373
  { bgp_error(conn, 1, 3, pkt+18, 1); return; }
2374

    
2375
  if (len < (BGP_HEADER_LENGTH + 4))
2376
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2377

    
2378
  if (len > (BGP_HEADER_LENGTH + 4))
2379
  { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
2380

    
2381
  struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19));
2382
  if (!c)
2383
  {
2384
    log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
2385
        p->p.name, pkt[21], get_u16(pkt+19), pkt[22]);
2386
    return;
2387
  }
2388

    
2389
  /* RFC 7313 redefined reserved field as RR message subtype */
2390
  uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST;
2391

    
2392
  switch (subtype)
2393
  {
2394
  case BGP_RR_REQUEST:
2395
    BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
2396
    channel_request_feeding(&c->c);
2397
    break;
2398

    
2399
  case BGP_RR_BEGIN:
2400
    BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
2401
    bgp_refresh_begin(c);
2402
    break;
2403

    
2404
  case BGP_RR_END:
2405
    BGP_TRACE(D_PACKETS, "Got END-OF-RR");
2406
    bgp_refresh_end(c);
2407
    break;
2408

    
2409
  default:
2410
    log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
2411
        p->p.name, subtype);
2412
    break;
2413
  }
2414
}
2415

    
2416
static inline struct bgp_channel *
2417
bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn)
2418
{
2419
  uint i = conn->last_channel;
2420

    
2421
  /* Try the last channel, but at most several times */
2422
  if ((conn->channels_to_send & (1 << i)) &&
2423
      (conn->last_channel_count < 16))
2424
    goto found;
2425

    
2426
  /* Find channel with non-zero channels_to_send */
2427
  do
2428
  {
2429
    i++;
2430
    if (i >= p->channel_count)
2431
      i = 0;
2432
  }
2433
  while (! (conn->channels_to_send & (1 << i)));
2434

    
2435
  /* Use that channel */
2436
  conn->last_channel = i;
2437
  conn->last_channel_count = 0;
2438

    
2439
found:
2440
  conn->last_channel_count++;
2441
  return p->channel_map[i];
2442
}
2443

    
2444
static inline int
2445
bgp_send(struct bgp_conn *conn, uint type, uint len)
2446
{
2447
  sock *sk = conn->sk;
2448
  byte *buf = sk->tbuf;
2449

    
2450
  memset(buf, 0xff, 16);                /* Marker */
2451
  put_u16(buf+16, len);
2452
  buf[18] = type;
2453

    
2454
  return sk_send(sk, len);
2455
}
2456

    
2457
/**
2458
 * bgp_fire_tx - transmit packets
2459
 * @conn: connection
2460
 *
2461
 * Whenever the transmit buffers of the underlying TCP connection
2462
 * are free and we have any packets queued for sending, the socket functions
2463
 * call bgp_fire_tx() which takes care of selecting the highest priority packet
2464
 * queued (Notification > Keepalive > Open > Update), assembling its header
2465
 * and body and sending it to the connection.
2466
 */
2467
static int
2468
bgp_fire_tx(struct bgp_conn *conn)
2469
{
2470
  struct bgp_proto *p = conn->bgp;
2471
  struct bgp_channel *c;
2472
  byte *buf, *pkt, *end;
2473
  uint s;
2474

    
2475
  if (!conn->sk)
2476
    return 0;
2477

    
2478
  buf = conn->sk->tbuf;
2479
  pkt = buf + BGP_HEADER_LENGTH;
2480
  s = conn->packets_to_send;
2481

    
2482
  if (s & (1 << PKT_SCHEDULE_CLOSE))
2483
  {
2484
    /* We can finally close connection and enter idle state */
2485
    bgp_conn_enter_idle_state(conn);
2486
    return 0;
2487
  }
2488
  if (s & (1 << PKT_NOTIFICATION))
2489
  {
2490
    conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE;
2491
    end = bgp_create_notification(conn, pkt);
2492
    return bgp_send(conn, PKT_NOTIFICATION, end - buf);
2493
  }
2494
  else if (s & (1 << PKT_KEEPALIVE))
2495
  {
2496
    conn->packets_to_send &= ~(1 << PKT_KEEPALIVE);
2497
    BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
2498
    bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
2499
    return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH);
2500
  }
2501
  else if (s & (1 << PKT_OPEN))
2502
  {
2503
    conn->packets_to_send &= ~(1 << PKT_OPEN);
2504
    end = bgp_create_open(conn, pkt);
2505
    return bgp_send(conn, PKT_OPEN, end - buf);
2506
  }
2507
  else while (conn->channels_to_send)
2508
  {
2509
    c = bgp_get_channel_to_send(p, conn);
2510
    s = c->packets_to_send;
2511

    
2512
    if (s & (1 << PKT_ROUTE_REFRESH))
2513
    {
2514
      c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH);
2515
      end = bgp_create_route_refresh(c, pkt);
2516
      return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2517
    }
2518
    else if (s & (1 << PKT_BEGIN_REFRESH))
2519
    {
2520
      /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
2521
      c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH);
2522
      end = bgp_create_begin_refresh(c, pkt);
2523
      return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2524
    }
2525
    else if (s & (1 << PKT_UPDATE))
2526
    {
2527
      end = bgp_create_update(c, pkt);
2528
      if (end)
2529
        return bgp_send(conn, PKT_UPDATE, end - buf);
2530

    
2531
      /* No update to send, perhaps we need to send End-of-RIB or EoRR */
2532
      c->packets_to_send = 0;
2533
      conn->channels_to_send &= ~(1 << c->index);
2534

    
2535
      if (c->feed_state == BFS_LOADED)
2536
      {
2537
        c->feed_state = BFS_NONE;
2538
        end = bgp_create_end_mark(c, pkt);
2539
        return bgp_send(conn, PKT_UPDATE, end - buf);
2540
      }
2541

    
2542
      else if (c->feed_state == BFS_REFRESHED)
2543
      {
2544
        c->feed_state = BFS_NONE;
2545
        end = bgp_create_end_refresh(c, pkt);
2546
        return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2547
      }
2548
    }
2549
    else if (s)
2550
      bug("Channel packets_to_send: %x", s);
2551

    
2552
    c->packets_to_send = 0;
2553
    conn->channels_to_send &= ~(1 << c->index);
2554
  }
2555

    
2556
  return 0;
2557
}
2558

    
2559
/**
2560
 * bgp_schedule_packet - schedule a packet for transmission
2561
 * @conn: connection
2562
 * @c: channel
2563
 * @type: packet type
2564
 *
2565
 * Schedule a packet of type @type to be sent as soon as possible.
2566
 */
2567
void
2568
bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
2569
{
2570
  ASSERT(conn->sk);
2571

    
2572
  DBG("BGP: Scheduling packet type %d\n", type);
2573

    
2574
  if (c)
2575
  {
2576
    if (! conn->channels_to_send)
2577
    {
2578
      conn->last_channel = c->index;
2579
      conn->last_channel_count = 0;
2580
    }
2581

    
2582
    c->packets_to_send |= 1 << type;
2583
    conn->channels_to_send |= 1 << c->index;
2584
  }
2585
  else
2586
    conn->packets_to_send |= 1 << type;
2587

    
2588
  if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev))
2589
    ev_schedule(conn->tx_ev);
2590
}
2591

    
2592
/*
 * TX kick hook taking the connection as an untyped pointer @vconn: keep
 * calling bgp_fire_tx() for as long as it returns a positive value.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;
  int rv;

  DBG("BGP: kicking TX\n");

  do
    rv = bgp_fire_tx(conn);
  while (rv > 0);
}
2601

    
2602
void
2603
bgp_tx(sock *sk)
2604
{
2605
  struct bgp_conn *conn = sk->data;
2606

    
2607
  DBG("BGP: TX hook\n");
2608
  while (bgp_fire_tx(conn) > 0)
2609
    ;
2610
}
2611

    
2612

    
2613
static struct {
2614
  byte major, minor;
2615
  byte *msg;
2616
} bgp_msg_table[] = {
2617
  { 1, 0, "Invalid message header" },
2618
  { 1, 1, "Connection not synchronized" },
2619
  { 1, 2, "Bad message length" },
2620
  { 1, 3, "Bad message type" },
2621
  { 2, 0, "Invalid OPEN message" },
2622
  { 2, 1, "Unsupported version number" },
2623
  { 2, 2, "Bad peer AS" },
2624
  { 2, 3, "Bad BGP identifier" },
2625
  { 2, 4, "Unsupported optional parameter" },
2626
  { 2, 5, "Authentication failure" },
2627
  { 2, 6, "Unacceptable hold time" },
2628
  { 2, 7, "Required capability missing" }, /* [RFC5492] */
2629
  { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
2630
  { 3, 0, "Invalid UPDATE message" },
2631
  { 3, 1, "Malformed attribute list" },
2632
  { 3, 2, "Unrecognized well-known attribute" },
2633
  { 3, 3, "Missing mandatory attribute" },
2634
  { 3, 4, "Invalid attribute flags" },
2635
  { 3, 5, "Invalid attribute length" },
2636
  { 3, 6, "Invalid ORIGIN attribute" },
2637
  { 3, 7, "AS routing loop" },                /* Deprecated */
2638
  { 3, 8, "Invalid NEXT_HOP attribute" },
2639
  { 3, 9, "Optional attribute error" },
2640
  { 3, 10, "Invalid network field" },
2641
  { 3, 11, "Malformed AS_PATH" },
2642
  { 4, 0, "Hold timer expired" },
2643
  { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
2644
  { 5, 1, "Unexpected message in OpenSent state" },
2645
  { 5, 2, "Unexpected message in OpenConfirm state" },
2646
  { 5, 3, "Unexpected message in Established state" },
2647
  { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
2648
  { 6, 1, "Maximum number of prefixes reached" },
2649
  { 6, 2, "Administrative shutdown" },
2650
  { 6, 3, "Peer de-configured" },
2651
  { 6, 4, "Administrative reset" },
2652
  { 6, 5, "Connection rejected" },
2653
  { 6, 6, "Other configuration change" },
2654
  { 6, 7, "Connection collision resolution" },
2655
  { 6, 8, "Out of Resources" },
2656
  { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
2657
  { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
2658
};
2659

    
2660
/**
2661
 * bgp_error_dsc - return BGP error description
2662
 * @code: BGP error code
2663
 * @subcode: BGP error subcode
2664
 *
2665
 * bgp_error_dsc() returns error description for BGP errors
2666
 * which might be static string or given temporary buffer.
2667
 */
2668
const char *
2669
bgp_error_dsc(uint code, uint subcode)
2670
{
2671
  static char buff[32];
2672
  uint i;
2673

    
2674
  for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
2675
    if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
2676
      return bgp_msg_table[i].msg;
2677

    
2678
  bsprintf(buff, "Unknown error %u.%u", code, subcode);
2679
  return buff;
2680
}
2681

    
2682
/* RFC 8203 - shutdown communication message */
2683
static int
2684
bgp_handle_message(struct bgp_proto *p, byte *data, uint len, byte **bp)
2685
{
2686
  byte *msg = data + 1;
2687
  uint msg_len = data[0];
2688
  uint i;
2689

    
2690
  /* Handle zero length message */
2691
  if (msg_len == 0)
2692
    return 1;
2693

    
2694
  /* Handle proper message */
2695
  if ((msg_len > 128) && (msg_len + 1 > len))
2696
    return 0;
2697

    
2698
  /* Some elementary cleanup */
2699
  for (i = 0; i < msg_len; i++)
2700
    if (msg[i] < ' ')
2701
      msg[i] = ' ';
2702

    
2703
  proto_set_message(&p->p, msg, msg_len);
2704
  *bp += bsprintf(*bp, ": \"%s\"", p->p.message);
2705
  return 1;
2706
}
2707

    
2708
void
2709
bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len)
2710
{
2711
  byte argbuf[256], *t = argbuf;
2712
  uint i;
2713

    
2714
  /* Don't report Cease messages generated by myself */
2715
  if (code == 6 && class == BE_BGP_TX)
2716
    return;
2717

    
2718
  /* Reset shutdown message */
2719
  if ((code == 6) && ((subcode == 2) || (subcode == 4)))
2720
    proto_set_message(&p->p, NULL, 0);
2721

    
2722
  if (len)
2723
    {
2724
      /* Bad peer AS - we would like to print the AS */
2725
      if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
2726
        {
2727
          t += bsprintf(t, ": %u", (len == 2) ? get_u16(data) : get_u32(data));
2728
          goto done;
2729
        }
2730

    
2731
      /* RFC 8203 - shutdown communication */
2732
      if (((code == 6) && ((subcode == 2) || (subcode == 4))))
2733
        if (bgp_handle_message(p, data, len, &t))
2734
          goto done;
2735

    
2736
      *t++ = ':';
2737
      *t++ = ' ';
2738
      if (len > 16)
2739
        len = 16;
2740
      for (i=0; i<len; i++)
2741
        t += bsprintf(t, "%02x", data[i]);
2742
    }
2743

    
2744
done:
2745
  *t = 0;
2746
  const byte *dsc = bgp_error_dsc(code, subcode);
2747
  log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, dsc, argbuf);
2748
}
2749

    
2750
static void
2751
bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
2752
{
2753
  struct bgp_proto *p = conn->bgp;
2754

    
2755
  if (len < 21)
2756
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2757

    
2758
  uint code = pkt[19];
2759
  uint subcode = pkt[20];
2760
  int err = (code != 6);
2761

    
2762
  bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
2763
  bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
2764

    
2765
  bgp_conn_enter_close_state(conn);
2766
  bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
2767

    
2768
  if (err)
2769
  {
2770
    bgp_update_startup_delay(p);
2771
    bgp_stop(p, 0, NULL, 0);
2772
  }
2773
}
2774

    
2775
static void
2776
bgp_rx_keepalive(struct bgp_conn *conn)
2777
{
2778
  struct bgp_proto *p = conn->bgp;
2779

    
2780
  BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
2781
  bgp_start_timer(conn->hold_timer, conn->hold_time);
2782

    
2783
  if (conn->state == BS_OPENCONFIRM)
2784
  { bgp_conn_enter_established_state(conn); return; }
2785

    
2786
  if (conn->state != BS_ESTABLISHED)
2787
    bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
2788
}
2789

    
2790

    
2791
/**
2792
 * bgp_rx_packet - handle a received packet
2793
 * @conn: BGP connection
2794
 * @pkt: start of the packet
2795
 * @len: packet size
2796
 *
2797
 * bgp_rx_packet() takes a newly received packet and calls the corresponding
2798
 * packet handler according to the packet type.
2799
 */
2800
static void
2801
bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len)
2802
{
2803
  byte type = pkt[18];
2804

    
2805
  DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
2806

    
2807
  if (conn->bgp->p.mrtdump & MD_MESSAGES)
2808
    mrt_dump_bgp_packet(conn, pkt, len);
2809

    
2810
  switch (type)
2811
  {
2812
  case PKT_OPEN:                return bgp_rx_open(conn, pkt, len);
2813
  case PKT_UPDATE:                return bgp_rx_update(conn, pkt, len);
2814
  case PKT_NOTIFICATION:        return bgp_rx_notification(conn, pkt, len);
2815
  case PKT_KEEPALIVE:                return bgp_rx_keepalive(conn);
2816
  case PKT_ROUTE_REFRESH:        return bgp_rx_route_refresh(conn, pkt, len);
2817
  default:                        bgp_error(conn, 1, 3, pkt+18, 1);
2818
  }
2819
}
2820

    
2821
/**
2822
 * bgp_rx - handle received data
2823
 * @sk: socket
2824
 * @size: amount of data received
2825
 *
2826
 * bgp_rx() is called by the socket layer whenever new data arrive from
2827
 * the underlying TCP connection. It assembles the data fragments to packets,
2828
 * checks their headers and framing and passes complete packets to
2829
 * bgp_rx_packet().
2830
 */
2831
int
2832
bgp_rx(sock *sk, uint size)
2833
{
2834
  struct bgp_conn *conn = sk->data;
2835
  byte *pkt_start = sk->rbuf;
2836
  byte *end = pkt_start + size;
2837
  uint i, len;
2838

    
2839
  DBG("BGP: RX hook: Got %d bytes\n", size);
2840
  while (end >= pkt_start + BGP_HEADER_LENGTH)
2841
    {
2842
      if ((conn->state == BS_CLOSE) || (conn->sk != sk))
2843
        return 0;
2844
      for(i=0; i<16; i++)
2845
        if (pkt_start[i] != 0xff)
2846
          {
2847
            bgp_error(conn, 1, 1, NULL, 0);
2848
            break;
2849
          }
2850
      len = get_u16(pkt_start+16);
2851
      if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn)))
2852
        {
2853
          bgp_error(conn, 1, 2, pkt_start+16, 2);
2854
          break;
2855
        }
2856
      if (end < pkt_start + len)
2857
        break;
2858
      bgp_rx_packet(conn, pkt_start, len);
2859
      pkt_start += len;
2860
    }
2861
  if (pkt_start != sk->rbuf)
2862
    {
2863
      memmove(sk->rbuf, pkt_start, end - pkt_start);
2864
      sk->rpos = sk->rbuf + (end - pkt_start);
2865
    }
2866
  return 0;
2867
}