Statistics
| Branch: | Revision:

iof-bird-daemon / proto / bgp / packets.c @ 7e5f769d

History | View | Annotate | Download (71.5 KB)

1
/*
2
 *        BIRD -- BGP Packet Processing
3
 *
4
 *        (c) 2000 Martin Mares <mj@ucw.cz>
5
 *        (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6
 *        (c) 2008--2016 CZ.NIC z.s.p.o.
7
 *
8
 *        Can be freely distributed and used under the terms of the GNU GPL.
9
 */
10

    
11
#undef LOCAL_DEBUG
12

    
13
#include <stdlib.h>
14

    
15
#include "nest/bird.h"
16
#include "nest/iface.h"
17
#include "nest/protocol.h"
18
#include "nest/route.h"
19
#include "nest/attrs.h"
20
#include "proto/mrt/mrt.h"
21
#include "conf/conf.h"
22
#include "lib/unaligned.h"
23
#include "lib/flowspec.h"
24
#include "lib/socket.h"
25

    
26
#include "nest/cli.h"
27

    
28
#include "bgp.h"
29

    
30

    
31
/* BGP_RR_* codes; presumably the route-refresh message subtypes -- TODO confirm against their users */
#define BGP_RR_REQUEST		0
#define BGP_RR_BEGIN		1
#define BGP_RR_END		2

/* NOTE(review): the meaning of the individual 4 + 1 + 32 terms is not visible here */
#define BGP_NLRI_MAX		(4 + 1 + 32)

/* MPLS label handling */
#define BGP_MPLS_BOS		1	 /* Bottom-of-stack bit */
#define BGP_MPLS_MAX		10	 /* Max number of labels such that 24*n <= 255 */
#define BGP_MPLS_NULL		3	 /* Implicit NULL label */
#define BGP_MPLS_MAGIC		0x800000 /* Magic withdraw label value, RFC 3107 3 */
41

    
42

    
43
static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
44
static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
45

    
46
/* Table for state -> RFC 6608 FSM error subcodes */
47
static byte fsm_err_subcode[BS_MAX] = {
48
  [BS_OPENSENT] = 1,
49
  [BS_OPENCONFIRM] = 2,
50
  [BS_ESTABLISHED] = 3
51
};
52

    
53

    
54
static struct bgp_channel *
55
bgp_get_channel(struct bgp_proto *p, u32 afi)
56
{
57
  uint i;
58

    
59
  for (i = 0; i < p->channel_count; i++)
60
    if (p->afi_map[i] == afi)
61
      return p->channel_map[i];
62

    
63
  return NULL;
64
}
65

    
66
static inline void
67
put_af3(byte *buf, u32 id)
68
{
69
  put_u16(buf, id >> 16);
70
  buf[2] = id & 0xff;
71
}
72

    
73
static inline void
74
put_af4(byte *buf, u32 id)
75
{
76
  put_u16(buf, id >> 16);
77
  buf[2] = 0;
78
  buf[3] = id & 0xff;
79
}
80

    
81
static inline u32
82
get_af3(byte *buf)
83
{
84
  return (get_u16(buf) << 16) | buf[2];
85
}
86

    
87
static inline u32
88
get_af4(byte *buf)
89
{
90
  return (get_u16(buf) << 16) | buf[3];
91
}
92

    
93
static void
94
init_mrt_bgp_data(struct bgp_conn *conn, struct mrt_bgp_data *d)
95
{
96
  struct bgp_proto *p = conn->bgp;
97
  int p_ok = conn->state >= BS_OPENCONFIRM;
98

    
99
  memset(d, 0, sizeof(struct mrt_bgp_data));
100
  d->peer_as = p->remote_as;
101
  d->local_as = p->local_as;
102
  d->index = (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0;
103
  d->af = ipa_is_ip4(p->cf->remote_ip) ? BGP_AFI_IPV4 : BGP_AFI_IPV6;
104
  d->peer_ip = conn->sk ? conn->sk->daddr : IPA_NONE;
105
  d->local_ip = conn->sk ? conn->sk->saddr : IPA_NONE;
106
  d->as4 = p_ok ? p->as4_session : 0;
107
}
108

    
109
static uint bgp_find_update_afi(byte *pos, uint len);
110

    
111
static int
112
bgp_estimate_add_path(struct bgp_proto *p, byte *pkt, uint len)
113
{
114
  /* No need to estimate it for other messages than UPDATE */
115
  if (pkt[18] != PKT_UPDATE)
116
    return 0;
117

    
118
  /* 1 -> no channel, 2 -> all channels, 3 -> some channels */
119
  if (p->summary_add_path_rx < 3)
120
    return p->summary_add_path_rx == 2;
121

    
122
  uint afi = bgp_find_update_afi(pkt, len);
123
  struct bgp_channel *c = bgp_get_channel(p, afi);
124
  if (!c)
125
  {
126
    /* Either frame error (if !afi) or unknown AFI/SAFI,
127
       will be reported later in regular parsing */
128
    BGP_TRACE(D_PACKETS, "MRT processing noticed invalid packet");
129
    return 0;
130
  }
131

    
132
  return c->add_path_rx;
133
}
134

    
135
static void
136
bgp_dump_message(struct bgp_conn *conn, byte *pkt, uint len)
137
{
138
  struct mrt_bgp_data d;
139
  init_mrt_bgp_data(conn, &d);
140

    
141
  d.message = pkt;
142
  d.msg_len = len;
143
  d.add_path = bgp_estimate_add_path(conn->bgp, pkt, len);
144

    
145
  mrt_dump_bgp_message(&d);
146
}
147

    
148
void
149
bgp_dump_state_change(struct bgp_conn *conn, uint old, uint new)
150
{
151
  struct mrt_bgp_data d;
152
  init_mrt_bgp_data(conn, &d);
153

    
154
  d.old_state = old;
155
  d.new_state = new;
156

    
157
  mrt_dump_bgp_state_change(&d);
158
}
159

    
160
/*
 * Write the body of a NOTIFICATION message for @conn to @buf: error code,
 * error subcode and notify_size bytes of notify_data. Returns a pointer just
 * behind the written data.
 *
 * The data are copied only when notify_size is nonzero. Callers in this file
 * report errors with a NULL data pointer and zero length; presumably that
 * pointer ends up in notify_data, and calling memcpy() with a NULL source is
 * undefined even for zero length.
 */
static byte *
bgp_create_notification(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);
  buf[0] = conn->notify_code;
  buf[1] = conn->notify_subcode;

  if (conn->notify_size)
    memcpy(buf+2, conn->notify_data, conn->notify_size);

  return buf + 2 + conn->notify_size;
}
171

    
172

    
173
/* Capability negotiation as per RFC 5492 */
174

    
175
const struct bgp_af_caps *
176
bgp_find_af_caps(struct bgp_caps *caps, u32 afi)
177
{
178
  struct bgp_af_caps *ac;
179

    
180
  WALK_AF_CAPS(caps, ac)
181
    if (ac->afi == afi)
182
      return ac;
183

    
184
  return NULL;
185
}
186

    
187
static struct bgp_af_caps *
188
bgp_get_af_caps(struct bgp_caps *caps, u32 afi)
189
{
190
  struct bgp_af_caps *ac;
191

    
192
  WALK_AF_CAPS(caps, ac)
193
    if (ac->afi == afi)
194
      return ac;
195

    
196
  ac = &caps->af_data[caps->af_count++];
197
  memset(ac, 0, sizeof(struct bgp_af_caps));
198
  ac->afi = afi;
199

    
200
  return ac;
201
}
202

    
203
static int
204
bgp_af_caps_cmp(const void *X, const void *Y)
205
{
206
  const struct bgp_af_caps *x = X, *y = Y;
207
  return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0;
208
}
209

    
210

    
211
/*
 * Build the local capability set of @conn and serialize it to @buf.
 *
 * A struct bgp_caps with one per-AF record for every channel of the protocol
 * is allocated from the protocol pool, filled from the configuration and
 * stored in conn->local_caps. The per-AF records are sorted by AFI/SAFI and
 * the capabilities are then written to @buf as a sequence of
 * (code, length, data) items. The number of bytes written is stored in
 * caps->length.
 *
 * Returns a pointer just behind the written data.
 */
static byte *
bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  struct bgp_channel *c;
  struct bgp_caps *caps;
  struct bgp_af_caps *ac;
  byte *start = buf;
  byte *hdr;			/* Header of the capability being written */
  uint any_ext_next_hop = 0;
  uint any_add_path = 0;

  /* Prepare bgp_caps structure */

  int n = list_length(&p->p.channels);
  caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps));
  conn->local_caps = caps;

  caps->as4_support = p->cf->enable_as4;
  caps->ext_messages = p->cf->enable_extended_messages;
  caps->route_refresh = p->cf->enable_refresh;
  caps->enhanced_refresh = p->cf->enable_refresh;

  if (caps->as4_support)
    caps->as4_number = p->public_as;

  if (p->cf->gr_mode)
  {
    caps->gr_aware = 1;
    caps->gr_time = p->cf->gr_time;
    caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
  }

  if (p->cf->llgr_mode)
    caps->llgr_aware = 1;

  /* Allocate and fill per-AF fields */
  WALK_LIST(c, p->p.channels)
  {
    ac = caps->af_data + caps->af_count;
    caps->af_count++;

    ac->afi = c->afi;
    ac->ready = 1;

    ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop;
    any_ext_next_hop |= ac->ext_next_hop;

    ac->add_path = c->cf->add_path;
    any_add_path |= ac->add_path;

    if (c->cf->gr_able)
    {
      ac->gr_able = 1;

      if (p->p.gr_recovery)
	ac->gr_af_flags |= BGP_GRF_FORWARDING;
    }

    if (c->cf->llgr_able)
    {
      ac->llgr_able = 1;
      ac->llgr_time = c->cf->llgr_time;

      if (p->p.gr_recovery)
	ac->llgr_flags |= BGP_LLGRF_FORWARDING;
    }
  }

  /* Sort capability fields by AFI/SAFI */
  qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp);


  /* Create capability list in buffer */

  /*
   * NOTE: The original estimate for the maximal length is ~ 22+21*af_count,
   * i.e. 274 for 12 channels, while the limit for one option is 253 (the
   * buffer size is 4096). Nothing in this function checks the length, so the
   * option limit may be exceeded with many AFs or additional capabilities.
   * XXXXX
   */

  /* Capability 1: Multiprotocol extensions, one item per AF */
  WALK_AF_CAPS(caps, ac)
  {
    if (ac->ready)
    {
      buf[0] = 1;		/* Capability code */
      buf[1] = 4;		/* Capability data length */
      put_af4(buf + 2, ac->afi);
      buf += 6;
    }
  }

  /* Capability 2: Support for route refresh */
  if (caps->route_refresh)
  {
    buf[0] = 2;			/* Capability code */
    buf[1] = 0;			/* Capability data length */
    buf += 2;
  }

  /* Capability 5: Support for extended next hop */
  if (any_ext_next_hop)
  {
    hdr = buf;
    hdr[0] = 5;			/* Capability code */
    buf += 2;

    WALK_AF_CAPS(caps, ac)
    {
      if (ac->ext_next_hop)
      {
	put_af4(buf, ac->afi);
	put_u16(buf+4, BGP_AFI_IPV6);
	buf += 6;
      }
    }

    hdr[1] = buf - (hdr + 2);	/* Capability data length */
  }

  /* Capability 6: Support for extended messages */
  if (caps->ext_messages)
  {
    buf[0] = 6;			/* Capability code */
    buf[1] = 0;			/* Capability data length */
    buf += 2;
  }

  /* Capability 64: Support for graceful restart */
  if (caps->gr_aware)
  {
    hdr = buf;
    hdr[0] = 64;		/* Capability code */
    buf += 2;

    /* Restart time, with the restart flags ORed into its first byte */
    put_u16(buf, caps->gr_time);
    buf[0] |= caps->gr_flags;
    buf += 2;

    WALK_AF_CAPS(caps, ac)
    {
      if (ac->gr_able)
      {
	put_af3(buf, ac->afi);
	buf[3] = ac->gr_af_flags;
	buf += 4;
      }
    }

    hdr[1] = buf - (hdr + 2);	/* Capability data length */
  }

  /* Capability 65: Support for 4-octet AS number */
  if (caps->as4_support)
  {
    buf[0] = 65;		/* Capability code */
    buf[1] = 4;			/* Capability data length */
    put_u32(buf + 2, p->public_as);
    buf += 6;
  }

  /* Capability 69: Support for ADD-PATH */
  if (any_add_path)
  {
    hdr = buf;
    hdr[0] = 69;		/* Capability code */
    buf += 2;

    WALK_AF_CAPS(caps, ac)
    {
      if (ac->add_path)
      {
	put_af3(buf, ac->afi);
	buf[3] = ac->add_path;
	buf += 4;
      }
    }

    hdr[1] = buf - (hdr + 2);	/* Capability data length */
  }

  /* Capability 70: Support for enhanced route refresh */
  if (caps->enhanced_refresh)
  {
    buf[0] = 70;		/* Capability code */
    buf[1] = 0;			/* Capability data length */
    buf += 2;
  }

  /* Capability 71: Support for long-lived graceful restart */
  if (caps->llgr_aware)
  {
    hdr = buf;
    hdr[0] = 71;		/* Capability code */
    buf += 2;

    WALK_AF_CAPS(caps, ac)
    {
      if (ac->llgr_able)
      {
	put_af3(buf, ac->afi);
	buf[3] = ac->llgr_flags;
	put_u24(buf+4, ac->llgr_time);
	buf += 7;
      }
    }

    hdr[1] = buf - (hdr + 2);	/* Capability data length */
  }

  caps->length = buf - start;

  return buf;
}
402

    
403
static void
404
bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len)
405
{
406
  struct bgp_proto *p = conn->bgp;
407
  struct bgp_af_caps *ac;
408
  int i, cl;
409
  u32 af;
410

    
411
  caps->length += len;
412

    
413
  while (len > 0)
414
  {
415
    if (len < 2 || len < (2 + pos[1]))
416
      goto err;
417

    
418
    /* Capability length */
419
    cl = pos[1];
420

    
421
    /* Capability type */
422
    switch (pos[0])
423
    {
424
    case  1: /* Multiprotocol capability, RFC 4760 */
425
      if (cl != 4)
426
        goto err;
427

    
428
      af = get_af4(pos+2);
429
      ac = bgp_get_af_caps(caps, af);
430
      ac->ready = 1;
431
      break;
432

    
433
    case  2: /* Route refresh capability, RFC 2918 */
434
      if (cl != 0)
435
        goto err;
436

    
437
      caps->route_refresh = 1;
438
      break;
439

    
440
    case  5: /* Extended next hop encoding capability, RFC 5549 */
441
      if (cl % 6)
442
        goto err;
443

    
444
      for (i = 0; i < cl; i += 6)
445
      {
446
        /* Specified only for IPv4 prefixes with IPv6 next hops */
447
        if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) ||
448
            (get_u16(pos+2+i+4) != BGP_AFI_IPV6))
449
          continue;
450

    
451
        af = get_af4(pos+2+i);
452
        ac = bgp_get_af_caps(caps, af);
453
        ac->ext_next_hop = 1;
454
      }
455
      break;
456

    
457
    case  6: /* Extended message length capability, RFC draft */
458
      if (cl != 0)
459
        goto err;
460

    
461
      caps->ext_messages = 1;
462
      break;
463

    
464
    case 64: /* Graceful restart capability, RFC 4724 */
465
      if (cl % 4 != 2)
466
        goto err;
467

    
468
      /* Only the last instance is valid */
469
      WALK_AF_CAPS(caps, ac)
470
      {
471
        ac->gr_able = 0;
472
        ac->gr_af_flags = 0;
473
      }
474

    
475
      caps->gr_aware = 1;
476
      caps->gr_flags = pos[2] & 0xf0;
477
      caps->gr_time = get_u16(pos + 2) & 0x0fff;
478

    
479
      for (i = 2; i < cl; i += 4)
480
      {
481
        af = get_af3(pos+2+i);
482
        ac = bgp_get_af_caps(caps, af);
483
        ac->gr_able = 1;
484
        ac->gr_af_flags = pos[2+i+3];
485
      }
486
      break;
487

    
488
    case 65: /* AS4 capability, RFC 6793 */
489
      if (cl != 4)
490
        goto err;
491

    
492
      caps->as4_support = 1;
493
      caps->as4_number = get_u32(pos + 2);
494
      break;
495

    
496
    case 69: /* ADD-PATH capability, RFC 7911 */
497
      if (cl % 4)
498
        goto err;
499

    
500
      for (i = 0; i < cl; i += 4)
501
      {
502
        byte val = pos[2+i+3];
503
        if (!val || (val > BGP_ADD_PATH_FULL))
504
        {
505
          log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring",
506
              p->p.name, val);
507
          break;
508
        }
509
      }
510

    
511
      for (i = 0; i < cl; i += 4)
512
      {
513
        af = get_af3(pos+2+i);
514
        ac = bgp_get_af_caps(caps, af);
515
        ac->add_path = pos[2+i+3];
516
      }
517
      break;
518

    
519
    case 70: /* Enhanced route refresh capability, RFC 7313 */
520
      if (cl != 0)
521
        goto err;
522

    
523
      caps->enhanced_refresh = 1;
524
      break;
525

    
526
    case 71: /* Long lived graceful restart capability, RFC draft */
527
      if (cl % 7)
528
        goto err;
529

    
530
      /* Presumably, only the last instance is valid */
531
      WALK_AF_CAPS(caps, ac)
532
      {
533
        ac->llgr_able = 0;
534
        ac->llgr_flags = 0;
535
        ac->llgr_time = 0;
536
      }
537

    
538
      caps->llgr_aware = 1;
539

    
540
      for (i = 0; i < cl; i += 7)
541
      {
542
        af = get_af3(pos+2+i);
543
        ac = bgp_get_af_caps(caps, af);
544
        ac->llgr_able = 1;
545
        ac->llgr_flags = pos[2+i+3];
546
        ac->llgr_time = get_u24(pos + 2+i+4);
547
      }
548
      break;
549

    
550
      /* We can safely ignore all other capabilities */
551
    }
552

    
553
    ADVANCE(pos, len, 2 + cl);
554
  }
555

    
556
  /* The LLGR capability must be advertised together with the GR capability,
557
     otherwise it must be disregarded */
558
  if (!caps->gr_aware && caps->llgr_aware)
559
  {
560
    caps->llgr_aware = 0;
561
    WALK_AF_CAPS(caps, ac)
562
    {
563
      ac->llgr_able = 0;
564
      ac->llgr_flags = 0;
565
      ac->llgr_time = 0;
566
    }
567
  }
568

    
569
  return;
570

    
571
err:
572
  bgp_error(conn, 2, 0, NULL, 0);
573
  return;
574
}
575

    
576
static int
577
bgp_read_options(struct bgp_conn *conn, byte *pos, int len)
578
{
579
  struct bgp_proto *p = conn->bgp;
580
  struct bgp_caps *caps;
581
  int ol;
582

    
583
  /* Max number of announced AFIs is limited by max option length (255) */
584
  caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps));
585
  memset(caps, 0, sizeof(struct bgp_caps));
586

    
587
  while (len > 0)
588
  {
589
    if ((len < 2) || (len < (2 + pos[1])))
590
    { bgp_error(conn, 2, 0, NULL, 0); return -1; }
591

    
592
    ol = pos[1];
593
    if (pos[0] == 2)
594
    {
595
      /* BGP capabilities, RFC 5492 */
596
      if (p->cf->capabilities)
597
        bgp_read_capabilities(conn, caps, pos + 2, ol);
598
    }
599
    else
600
    {
601
      /* Unknown option */
602
      bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */
603
      return -1;
604
    }
605

    
606
    ADVANCE(pos, len, 2 + ol);
607
  }
608

    
609
  uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps);
610
  conn->remote_caps = mb_allocz(p->p.pool, n);
611
  memcpy(conn->remote_caps, caps, n);
612

    
613
  return 0;
614
}
615

    
616
/*
 * Write the body of an OPEN message for @conn to @buf and prepare
 * conn->local_caps.
 *
 * Layout written here: buf[0] version, buf[1..2] AS number (AS_TRANS is used
 * unless the public AS number is below 0xFFFF), buf[3..4] hold time,
 * buf[5..8] local ID, buf[9] length of optional parameters. With capabilities
 * enabled, one option of type 2 follows, whose data are written by
 * bgp_write_capabilities(). Otherwise there are no options and local_caps is
 * an empty zeroed structure.
 *
 * Returns a pointer just behind the written data.
 *
 * NOTE(review): both length bytes are stored without any range check on the
 * length of the capability list.
 */
static byte *
bgp_create_open(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
	    BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id);

  buf[0] = BGP_VERSION;
  put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS);
  put_u16(buf+3, p->cf->hold_time);
  put_u32(buf+5, p->local_id);

  if (!p->cf->capabilities)
  {
    /* Prepare empty local_caps */
    conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));

    buf[9] = 0;			/* No optional parameters */
    return buf + 10;
  }

  /* Prepare local_caps and write capabilities to buffer */
  byte *caps_start = buf + 12;
  byte *end = bgp_write_capabilities(conn, caps_start);
  uint len = end - caps_start;

  buf[9] = len + 2;		/* Optional parameters length */
  buf[10] = 2;			/* Option 2: Capability list */
  buf[11] = len;		/* Option data length */

  return end;
}
652

    
653
static void
654
bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
655
{
656
  struct bgp_proto *p = conn->bgp;
657
  struct bgp_conn *other;
658
  u32 asn, hold, id;
659

    
660
  /* Check state */
661
  if (conn->state != BS_OPENSENT)
662
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
663

    
664
  /* Check message contents */
665
  if (len < 29 || len != 29 + (uint) pkt[28])
666
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
667

    
668
  if (pkt[19] != BGP_VERSION)
669
  { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; }
670

    
671
  asn = get_u16(pkt+20);
672
  hold = get_u16(pkt+22);
673
  id = get_u32(pkt+24);
674
  BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id);
675

    
676
  if (bgp_read_options(conn, pkt+29, pkt[28]) < 0)
677
    return;
678

    
679
  if (hold > 0 && hold < 3)
680
  { bgp_error(conn, 2, 6, pkt+22, 2); return; }
681

    
682
  /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
683
  if (!id || (p->is_internal && id == p->local_id))
684
  { bgp_error(conn, 2, 3, pkt+24, -4); return; }
685

    
686
  struct bgp_caps *caps = conn->remote_caps;
687

    
688
  if (caps->as4_support)
689
  {
690
    u32 as4 = caps->as4_number;
691

    
692
    if ((as4 != asn) && (asn != AS_TRANS))
693
      log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
694

    
695
    if (as4 != p->remote_as)
696
    { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; }
697
  }
698
  else
699
  {
700
    if (asn != p->remote_as)
701
    { bgp_error(conn, 2, 2, pkt+20, 2); return; }
702
  }
703

    
704
  /* Check the other connection */
705
  other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
706
  switch (other->state)
707
  {
708
  case BS_CONNECT:
709
  case BS_ACTIVE:
710
    /* Stop outgoing connection attempts */
711
    bgp_conn_enter_idle_state(other);
712
    break;
713

    
714
  case BS_IDLE:
715
  case BS_OPENSENT:
716
  case BS_CLOSE:
717
    break;
718

    
719
  case BS_OPENCONFIRM:
720
    /*
721
     * Description of collision detection rules in RFC 4271 is confusing and
722
     * contradictory, but it is essentially:
723
     *
724
     * 1. Router with higher ID is dominant
725
     * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
726
     * 3. When both connections are in OpenConfirm state, one initiated by
727
     *    the dominant router is kept.
728
     *
729
     * The first line in the expression below evaluates whether the neighbor
730
     * is dominant, the second line whether the new connection was initiated
731
     * by the neighbor. If both are true (or both are false), we keep the new
732
     * connection, otherwise we keep the old one.
733
     */
734
    if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as)))
735
        == (conn == &p->incoming_conn))
736
    {
737
      /* Should close the other connection */
738
      BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
739
      bgp_error(other, 6, 7, NULL, 0);
740
      break;
741
    }
742
    /* Fall thru */
743
  case BS_ESTABLISHED:
744
    /* Should close this connection */
745
    BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
746
    bgp_error(conn, 6, 7, NULL, 0);
747
    return;
748

    
749
  default:
750
    bug("bgp_rx_open: Unknown state");
751
  }
752

    
753
  /* Update our local variables */
754
  conn->hold_time = MIN(hold, p->cf->hold_time);
755
  conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
756
  conn->as4_session = conn->local_caps->as4_support && caps->as4_support;
757
  conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages;
758
  p->remote_id = id;
759

    
760
  DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
761
      conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session);
762

    
763
  bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
764
  bgp_start_timer(conn->hold_timer, conn->hold_time);
765
  bgp_conn_enter_openconfirm_state(conn);
766
}
767

    
768

    
769
/*
770
 *        Next hop handling
771
 */
772

    
773
/*
 * Error reporting helpers for functions having a state variable 's' in scope.
 * All of them log the message with class L_REMOTE, prefixed by the name of
 * the protocol s->proto. DISCARD() and WITHDRAW() also return from the
 * calling function, hence they can be used in void functions only.
 */

/* Log the message */
#define REPORT(msg, args...) \
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })

/* Log the message and leave the calling function */
#define DISCARD(msg, args...) \
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); return; })

/* Log the message, set s->err_withdraw and leave the calling function */
#define WITHDRAW(msg, args...) \
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); s->err_withdraw = 1; return; })

/* Messages for the macros above */
#define BAD_AFI		"Unexpected AF <%u/%u> in UPDATE"
#define BAD_NEXT_HOP	"Invalid NEXT_HOP attribute"
#define NO_NEXT_HOP	"Missing NEXT_HOP attribute"
#define NO_LABEL_STACK	"Missing MPLS stack"
786

    
787

    
788
static void
789
bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
790
{
791
  struct bgp_proto *p = s->proto;
792
  struct bgp_channel *c = s->channel;
793

    
794
  if (c->cf->gw_mode == GW_DIRECT)
795
  {
796
    neighbor *nbr = NULL;
797

    
798
    /* GW_DIRECT -> single_hop -> p->neigh != NULL */
799
    if (ipa_nonzero(gw))
800
      nbr = neigh_find(&p->p, gw, NULL, 0);
801
    else if (ipa_nonzero(ll))
802
      nbr = neigh_find(&p->p, ll, p->neigh->iface, 0);
803

    
804
    if (!nbr || (nbr->scope == SCOPE_HOST))
805
      WITHDRAW(BAD_NEXT_HOP);
806

    
807
    a->dest = RTD_UNICAST;
808
    a->nh.gw = nbr->addr;
809
    a->nh.iface = nbr->iface;
810
  }
811
  else /* GW_RECURSIVE */
812
  {
813
    if (ipa_zero(gw))
814
      WITHDRAW(BAD_NEXT_HOP);
815

    
816
    rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6;
817
    s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table);
818

    
819
    if (!s->mpls)
820
      rta_apply_hostentry(a, s->hostentry, NULL);
821

    
822
    /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */
823
  }
824
}
825

    
826
static void
827
bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
828
{
829
  if (lnum > MPLS_MAX_LABEL_STACK)
830
  {
831
    REPORT("Too many MPLS labels ($u)", lnum);
832

    
833
    a->dest = RTD_UNREACHABLE;
834
    a->hostentry = NULL;
835
    a->nh = (struct nexthop) { };
836
    return;
837
  }
838

    
839
  /* Handle implicit NULL as empty MPLS stack */
840
  if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL))
841
    lnum = 0;
842

    
843
  if (s->channel->cf->gw_mode == GW_DIRECT)
844
  {
845
    a->nh.labels = lnum;
846
    memcpy(a->nh.label, labels, 4*lnum);
847
  }
848
  else /* GW_RECURSIVE */
849
  {
850
    mpls_label_stack ms;
851

    
852
    ms.len = lnum;
853
    memcpy(ms.stack, labels, 4*lnum);
854
    rta_apply_hostentry(a, s->hostentry, &ms);
855
  }
856
}
857

    
858

    
859
static int
860
bgp_match_src(struct bgp_export_state *s, int mode)
861
{
862
  switch (mode)
863
  {
864
  case NH_NO:                return 0;
865
  case NH_ALL:                return 1;
866
  case NH_IBGP:                return s->src && s->src->is_internal;
867
  case NH_EBGP:                return s->src && !s->src->is_internal;
868
  default:                return 0;
869
  }
870
}
871

    
872
static inline int
873
bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
874
{
875
  struct bgp_proto *p = s->proto;
876
  struct bgp_channel *c = s->channel;
877
  ip_addr *nh = (void *) a->u.ptr->data;
878

    
879
  /* Handle next hop self option */
880
  if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self))
881
    return 0;
882

    
883
  /* Handle next hop keep option */
884
  if (c->cf->next_hop_keep && bgp_match_src(s, c->cf->next_hop_keep))
885
    return 1;
886

    
887
  /* Keep it when explicitly set in export filter */
888
  if (a->type & EAF_FRESH)
889
    return 1;
890

    
891
  /* Check for non-matching AF */
892
  if ((ipa_is_ip4(*nh) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop)
893
    return 0;
894

    
895
  /* Keep it when exported to internal peers */
896
  if (p->is_interior && ipa_nonzero(*nh))
897
    return 1;
898

    
899
  /* Keep it when forwarded between single-hop BGPs on the same iface */
900
  struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL;
901
  return p->neigh && (p->neigh->iface == ifa);
902
}
903

    
904
static inline int
905
bgp_use_gateway(struct bgp_export_state *s)
906
{
907
  struct bgp_proto *p = s->proto;
908
  struct bgp_channel *c = s->channel;
909
  rta *ra = s->route->attrs;
910

    
911
  /* Handle next hop self option - also applies to gateway */
912
  if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self))
913
    return 0;
914

    
915
  /* We need one valid global gateway */
916
  if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw))
917
    return 0;
918

    
919
  /* Check for non-matching AF */
920
  if ((ipa_is_ip4(ra->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop)
921
    return 0;
922

    
923
  /* Use it when exported to internal peers */
924
  if (p->is_interior)
925
    return 1;
926

    
927
  /* Use it when forwarded to single-hop BGP peer on on the same iface */
928
  return p->neigh && (p->neigh->iface == ra->nh.iface);
929
}
930

    
931
static void
932
bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
933
{
934
  if (!a || !bgp_use_next_hop(s, a))
935
  {
936
    if (bgp_use_gateway(s))
937
    {
938
      rta *ra = s->route->attrs;
939
      ip_addr nh[1] = { ra->nh.gw };
940
      bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
941

    
942
      if (s->mpls)
943
      {
944
        u32 implicit_null = BGP_MPLS_NULL;
945
        u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null;
946
        uint lnum = ra->nh.labels ? ra->nh.labels : 1;
947
        bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4);
948
      }
949
    }
950
    else
951
    {
952
      ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
953
      bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
954

    
955
      /* TODO: Use local MPLS assigned label */
956
      if (s->mpls)
957
      {
958
        u32 implicit_null = BGP_MPLS_NULL;
959
        bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4);
960
      }
961
    }
962
  }
963

    
964
  /* Check if next hop is valid */
965
  a = bgp_find_attr(*to, BA_NEXT_HOP);
966
  if (!a)
967
    WITHDRAW(NO_NEXT_HOP);
968

    
969
  ip_addr *nh = (void *) a->u.ptr->data;
970
  ip_addr peer = s->proto->cf->remote_ip;
971
  uint len = a->u.ptr->length;
972

    
973
  /* Forbid zero next hop */
974
  if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1])))
975
    WITHDRAW(BAD_NEXT_HOP);
976

    
977
  /* Forbid next hop equal to neighbor IP */
978
  if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
979
    WITHDRAW(BAD_NEXT_HOP);
980

    
981
  /* Forbid next hop with non-matching AF */
982
  if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) &&
983
      !s->channel->ext_next_hop)
984
    WITHDRAW(BAD_NEXT_HOP);
985

    
986
  /* Just check if MPLS stack */
987
  if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK))
988
    WITHDRAW(NO_LABEL_STACK);
989
}
990

    
991
static uint
992
bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
993
{
994
  /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
995
  ip_addr *nh = (void *) a->u.ptr->data;
996
  uint len = a->u.ptr->length;
997

    
998
  ASSERT((len == 16) || (len == 32));
999

    
1000
  /*
1001
   * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1002
   * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference
1003
   * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped
1004
   * IPv6 address with IPv6 NLRI.
1005
   */
1006

    
1007
  if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1008
  {
1009
    put_ip4(buf, ipa_to_ip4(nh[0]));
1010
    return 4;
1011
  }
1012

    
1013
  put_ip6(buf, ipa_to_ip6(nh[0]));
1014

    
1015
  if (len == 32)
1016
    put_ip6(buf+16, ipa_to_ip6(nh[1]));
1017

    
1018
  return len;
1019
}
1020

    
1021
static void
1022
bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1023
{
1024
  struct bgp_channel *c = s->channel;
1025
  struct adata *ad = lp_alloc_adata(s->pool, 32);
1026
  ip_addr *nh = (void *) ad->data;
1027

    
1028
  if (len == 4)
1029
  {
1030
    nh[0] = ipa_from_ip4(get_ip4(data));
1031
    nh[1] = IPA_NONE;
1032
  }
1033
  else if (len == 16)
1034
  {
1035
    nh[0] = ipa_from_ip6(get_ip6(data));
1036
    nh[1] = IPA_NONE;
1037

    
1038
    if (ipa_is_link_local(nh[0]))
1039
    { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1040
  }
1041
  else if (len == 32)
1042
  {
1043
    nh[0] = ipa_from_ip6(get_ip6(data));
1044
    nh[1] = ipa_from_ip6(get_ip6(data+16));
1045

    
1046
    if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1047
      nh[1] = IPA_NONE;
1048
  }
1049
  else
1050
    bgp_parse_error(s, 9);
1051

    
1052
  if (ipa_zero(nh[1]))
1053
    ad->length = 16;
1054

    
1055
  if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1056
    WITHDRAW(BAD_NEXT_HOP);
1057

    
1058
  // XXXX validate next hop
1059

    
1060
  bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1061
  bgp_apply_next_hop(s, a, nh[0], nh[1]);
1062
}
1063

    
1064
static uint
1065
bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
1066
{
1067
  ip_addr *nh = (void *) a->u.ptr->data;
1068
  uint len = a->u.ptr->length;
1069

    
1070
  ASSERT((len == 16) || (len == 32));
1071

    
1072
  /*
1073
   * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1074
   * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference
1075
   * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped
1076
   * IPv6 address with VPNv6 NLRI.
1077
   */
1078

    
1079
  if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1080
  {
1081
    put_u64(buf, 0); /* VPN RD is 0 */
1082
    put_ip4(buf+8, ipa_to_ip4(nh[0]));
1083
    return 12;
1084
  }
1085

    
1086
  put_u64(buf, 0); /* VPN RD is 0 */
1087
  put_ip6(buf+8, ipa_to_ip6(nh[0]));
1088

    
1089
  if (len == 16)
1090
    return 24;
1091

    
1092
  put_u64(buf+24, 0); /* VPN RD is 0 */
1093
  put_ip6(buf+32, ipa_to_ip6(nh[1]));
1094

    
1095
  return 48;
1096
}
1097

    
1098
static void
1099
bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1100
{
1101
  struct bgp_channel *c = s->channel;
1102
  struct adata *ad = lp_alloc_adata(s->pool, 32);
1103
  ip_addr *nh = (void *) ad->data;
1104

    
1105
  if (len == 12)
1106
  {
1107
    nh[0] = ipa_from_ip4(get_ip4(data+8));
1108
    nh[1] = IPA_NONE;
1109
  }
1110
  else if (len == 24)
1111
  {
1112
    nh[0] = ipa_from_ip6(get_ip6(data+8));
1113
    nh[1] = IPA_NONE;
1114

    
1115
    if (ipa_is_link_local(nh[0]))
1116
    { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1117
  }
1118
  else if (len == 48)
1119
  {
1120
    nh[0] = ipa_from_ip6(get_ip6(data+8));
1121
    nh[1] = ipa_from_ip6(get_ip6(data+32));
1122

    
1123
    if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1124
      nh[1] = IPA_NONE;
1125
  }
1126
  else
1127
    bgp_parse_error(s, 9);
1128

    
1129
  if (ipa_zero(nh[1]))
1130
    ad->length = 16;
1131

    
1132
  /* XXXX which error */
1133
  if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0)))
1134
    bgp_parse_error(s, 9);
1135

    
1136
  if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1137
    WITHDRAW(BAD_NEXT_HOP);
1138

    
1139
  // XXXX validate next hop
1140

    
1141
  bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1142
  bgp_apply_next_hop(s, a, nh[0], nh[1]);
1143
}
1144

    
1145

    
1146

    
1147
static uint
1148
bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED)
1149
{
1150
  return 0;
1151
}
1152

    
1153
static void
1154
bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED)
1155
{
1156
  /*
1157
   * Although we expect no next hop and RFC 7606 7.11 states that attribute
1158
   * MP_REACH_NLRI with unexpected next hop length is considered malformed,
1159
   * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt.
1160
   */
1161

    
1162
  return;
1163
}
1164

    
1165
static void
1166
bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to)
1167
{
1168
  /* NEXT_HOP shall not pass */
1169
  if (a)
1170
    bgp_unset_attr(to, s->pool, BA_NEXT_HOP);
1171
}
1172

    
1173

    
1174
/*
1175
 *        UPDATE
1176
 */
1177

    
1178
static void
1179
bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
1180
{
1181
  if (path_id != s->last_id)
1182
  {
1183
    s->last_src = rt_get_source(&s->proto->p, path_id);
1184
    s->last_id = path_id;
1185

    
1186
    rta_free(s->cached_rta);
1187
    s->cached_rta = NULL;
1188
  }
1189

    
1190
  if (!a0)
1191
  {
1192
    /* Route withdraw */
1193
    rte_update3(&s->channel->c, n, NULL, s->last_src);
1194
    return;
1195
  }
1196

    
1197
  /* Prepare cached route attributes */
1198
  if (s->cached_rta == NULL)
1199
  {
1200
    a0->src = s->last_src;
1201

    
1202
    /* Workaround for rta_lookup() breaking eattrs */
1203
    ea_list *ea = a0->eattrs;
1204
    s->cached_rta = rta_lookup(a0);
1205
    a0->eattrs = ea;
1206
  }
1207

    
1208
  rta *a = rta_clone(s->cached_rta);
1209
  rte *e = rte_get_temp(a);
1210

    
1211
  e->pflags = 0;
1212
  e->u.bgp.suppressed = 0;
1213
  e->u.bgp.stale = -1;
1214
  rte_update3(&s->channel->c, n, e, s->last_src);
1215
}
1216

    
1217
static void
1218
bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen)
1219
{
1220
  u32 dummy = 0;
1221
  u32 *labels = mpls ? (u32 *) mpls->data : &dummy;
1222
  uint lnum = mpls ? (mpls->length / 4) : 1;
1223

    
1224
  for (uint i = 0; i < lnum; i++)
1225
  {
1226
    put_u24(*pos, labels[i] << 4);
1227
    ADVANCE(*pos, *size, 3);
1228
  }
1229

    
1230
  /* Add bottom-of-stack flag */
1231
  (*pos)[-1] |= BGP_MPLS_BOS;
1232

    
1233
  *pxlen += 24 * lnum;
1234
}
1235

    
1236
static void
1237
bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a)
1238
{
1239
  u32 labels[BGP_MPLS_MAX], label;
1240
  uint lnum = 0;
1241

    
1242
  do {
1243
    if (*pxlen < 24)
1244
      bgp_parse_error(s, 1);
1245

    
1246
    label = get_u24(*pos);
1247
    labels[lnum++] = label >> 4;
1248
    ADVANCE(*pos, *len, 3);
1249
    *pxlen -= 24;
1250

    
1251
    /* RFC 8277 2.4 - withdraw does not have variable-size MPLS stack but
1252
       fixed-size 24-bit Compatibility field, which MUST be ignored */
1253
    if (!a && !s->err_withdraw)
1254
      return;
1255
  }
1256
  while (!(label & BGP_MPLS_BOS));
1257

    
1258
  if (!a)
1259
    return;
1260

    
1261
  /* Attach MPLS attribute unless we already have one */
1262
  if (!s->mpls_labels)
1263
  {
1264
    s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX);
1265
    bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels);
1266
  }
1267

    
1268
  /* Overwrite data in the attribute */
1269
  s->mpls_labels->length = 4*lnum;
1270
  memcpy(s->mpls_labels->data, labels, 4*lnum);
1271

    
1272
  /* Update next hop entry in rta */
1273
  bgp_apply_mpls_labels(s, a, labels, lnum);
1274

    
1275
  /* Attributes were changed, invalidate cached entry */
1276
  rta_free(s->cached_rta);
1277
  s->cached_rta = NULL;
1278

    
1279
  return;
1280
}
1281

    
1282
static uint
1283
bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1284
{
1285
  byte *pos = buf;
1286

    
1287
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1288
  {
1289
    struct bgp_prefix *px = HEAD(buck->prefixes);
1290
    struct net_addr_ip4 *net = (void *) px->net;
1291

    
1292
    /* Encode path ID */
1293
    if (s->add_path)
1294
    {
1295
      put_u32(pos, px->path_id);
1296
      ADVANCE(pos, size, 4);
1297
    }
1298

    
1299
    /* Encode prefix length */
1300
    *pos = net->pxlen;
1301
    ADVANCE(pos, size, 1);
1302

    
1303
    /* Encode MPLS labels */
1304
    if (s->mpls)
1305
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1306

    
1307
    /* Encode prefix body */
1308
    ip4_addr a = ip4_hton(net->prefix);
1309
    uint b = (net->pxlen + 7) / 8;
1310
    memcpy(pos, &a, b);
1311
    ADVANCE(pos, size, b);
1312

    
1313
    bgp_free_prefix(s->channel, px);
1314
  }
1315

    
1316
  return pos - buf;
1317
}
1318

    
1319
static void
1320
bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1321
{
1322
  while (len)
1323
  {
1324
    net_addr_ip4 net;
1325
    u32 path_id = 0;
1326

    
1327
    /* Decode path ID */
1328
    if (s->add_path)
1329
    {
1330
      if (len < 5)
1331
        bgp_parse_error(s, 1);
1332

    
1333
      path_id = get_u32(pos);
1334
      ADVANCE(pos, len, 4);
1335
    }
1336

    
1337
    /* Decode prefix length */
1338
    uint l = *pos;
1339
    ADVANCE(pos, len, 1);
1340

    
1341
    if (len < ((l + 7) / 8))
1342
      bgp_parse_error(s, 1);
1343

    
1344
    /* Decode MPLS labels */
1345
    if (s->mpls)
1346
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1347

    
1348
    if (l > IP4_MAX_PREFIX_LENGTH)
1349
      bgp_parse_error(s, 10);
1350

    
1351
    /* Decode prefix body */
1352
    ip4_addr addr = IP4_NONE;
1353
    uint b = (l + 7) / 8;
1354
    memcpy(&addr, pos, b);
1355
    ADVANCE(pos, len, b);
1356

    
1357
    net = NET_ADDR_IP4(ip4_ntoh(addr), l);
1358
    net_normalize_ip4(&net);
1359

    
1360
    // XXXX validate prefix
1361

    
1362
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1363
  }
1364
}
1365

    
1366

    
1367
static uint
1368
bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1369
{
1370
  byte *pos = buf;
1371

    
1372
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1373
  {
1374
    struct bgp_prefix *px = HEAD(buck->prefixes);
1375
    struct net_addr_ip6 *net = (void *) px->net;
1376

    
1377
    /* Encode path ID */
1378
    if (s->add_path)
1379
    {
1380
      put_u32(pos, px->path_id);
1381
      ADVANCE(pos, size, 4);
1382
    }
1383

    
1384
    /* Encode prefix length */
1385
    *pos = net->pxlen;
1386
    ADVANCE(pos, size, 1);
1387

    
1388
    /* Encode MPLS labels */
1389
    if (s->mpls)
1390
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1391

    
1392
    /* Encode prefix body */
1393
    ip6_addr a = ip6_hton(net->prefix);
1394
    uint b = (net->pxlen + 7) / 8;
1395
    memcpy(pos, &a, b);
1396
    ADVANCE(pos, size, b);
1397

    
1398
    bgp_free_prefix(s->channel, px);
1399
  }
1400

    
1401
  return pos - buf;
1402
}
1403

    
1404
static void
1405
bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1406
{
1407
  while (len)
1408
  {
1409
    net_addr_ip6 net;
1410
    u32 path_id = 0;
1411

    
1412
    /* Decode path ID */
1413
    if (s->add_path)
1414
    {
1415
      if (len < 5)
1416
        bgp_parse_error(s, 1);
1417

    
1418
      path_id = get_u32(pos);
1419
      ADVANCE(pos, len, 4);
1420
    }
1421

    
1422
    /* Decode prefix length */
1423
    uint l = *pos;
1424
    ADVANCE(pos, len, 1);
1425

    
1426
    if (len < ((l + 7) / 8))
1427
      bgp_parse_error(s, 1);
1428

    
1429
    /* Decode MPLS labels */
1430
    if (s->mpls)
1431
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1432

    
1433
    if (l > IP6_MAX_PREFIX_LENGTH)
1434
      bgp_parse_error(s, 10);
1435

    
1436
    /* Decode prefix body */
1437
    ip6_addr addr = IP6_NONE;
1438
    uint b = (l + 7) / 8;
1439
    memcpy(&addr, pos, b);
1440
    ADVANCE(pos, len, b);
1441

    
1442
    net = NET_ADDR_IP6(ip6_ntoh(addr), l);
1443
    net_normalize_ip6(&net);
1444

    
1445
    // XXXX validate prefix
1446

    
1447
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1448
  }
1449
}
1450

    
1451
static uint
1452
bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1453
{
1454
  byte *pos = buf;
1455

    
1456
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1457
  {
1458
    struct bgp_prefix *px = HEAD(buck->prefixes);
1459
    struct net_addr_vpn4 *net = (void *) px->net;
1460

    
1461
    /* Encode path ID */
1462
    if (s->add_path)
1463
    {
1464
      put_u32(pos, px->path_id);
1465
      ADVANCE(pos, size, 4);
1466
    }
1467

    
1468
    /* Encode prefix length */
1469
    *pos = 64 + net->pxlen;
1470
    ADVANCE(pos, size, 1);
1471

    
1472
    /* Encode MPLS labels */
1473
    if (s->mpls)
1474
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1475

    
1476
    /* Encode route distinguisher */
1477
    put_u64(pos, net->rd);
1478
    ADVANCE(pos, size, 8);
1479

    
1480
    /* Encode prefix body */
1481
    ip4_addr a = ip4_hton(net->prefix);
1482
    uint b = (net->pxlen + 7) / 8;
1483
    memcpy(pos, &a, b);
1484
    ADVANCE(pos, size, b);
1485

    
1486
    bgp_free_prefix(s->channel, px);
1487
  }
1488

    
1489
  return pos - buf;
1490
}
1491

    
1492
static void
1493
bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1494
{
1495
  while (len)
1496
  {
1497
    net_addr_vpn4 net;
1498
    u32 path_id = 0;
1499

    
1500
    /* Decode path ID */
1501
    if (s->add_path)
1502
    {
1503
      if (len < 5)
1504
        bgp_parse_error(s, 1);
1505

    
1506
      path_id = get_u32(pos);
1507
      ADVANCE(pos, len, 4);
1508
    }
1509

    
1510
    /* Decode prefix length */
1511
    uint l = *pos;
1512
    ADVANCE(pos, len, 1);
1513

    
1514
    if (len < ((l + 7) / 8))
1515
      bgp_parse_error(s, 1);
1516

    
1517
    /* Decode MPLS labels */
1518
    if (s->mpls)
1519
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1520

    
1521
    /* Decode route distinguisher */
1522
    if (l < 64)
1523
      bgp_parse_error(s, 1);
1524

    
1525
    u64 rd = get_u64(pos);
1526
    ADVANCE(pos, len, 8);
1527
    l -= 64;
1528

    
1529
    if (l > IP4_MAX_PREFIX_LENGTH)
1530
      bgp_parse_error(s, 10);
1531

    
1532
    /* Decode prefix body */
1533
    ip4_addr addr = IP4_NONE;
1534
    uint b = (l + 7) / 8;
1535
    memcpy(&addr, pos, b);
1536
    ADVANCE(pos, len, b);
1537

    
1538
    net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd);
1539
    net_normalize_vpn4(&net);
1540

    
1541
    // XXXX validate prefix
1542

    
1543
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1544
  }
1545
}
1546

    
1547

    
1548
static uint
1549
bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1550
{
1551
  byte *pos = buf;
1552

    
1553
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1554
  {
1555
    struct bgp_prefix *px = HEAD(buck->prefixes);
1556
    struct net_addr_vpn6 *net = (void *) px->net;
1557

    
1558
    /* Encode path ID */
1559
    if (s->add_path)
1560
    {
1561
      put_u32(pos, px->path_id);
1562
      ADVANCE(pos, size, 4);
1563
    }
1564

    
1565
    /* Encode prefix length */
1566
    *pos = 64 + net->pxlen;
1567
    ADVANCE(pos, size, 1);
1568

    
1569
    /* Encode MPLS labels */
1570
    if (s->mpls)
1571
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1572

    
1573
    /* Encode route distinguisher */
1574
    put_u64(pos, net->rd);
1575
    ADVANCE(pos, size, 8);
1576

    
1577
    /* Encode prefix body */
1578
    ip6_addr a = ip6_hton(net->prefix);
1579
    uint b = (net->pxlen + 7) / 8;
1580
    memcpy(pos, &a, b);
1581
    ADVANCE(pos, size, b);
1582

    
1583
    bgp_free_prefix(s->channel, px);
1584
  }
1585

    
1586
  return pos - buf;
1587
}
1588

    
1589
static void
1590
bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1591
{
1592
  while (len)
1593
  {
1594
    net_addr_vpn6 net;
1595
    u32 path_id = 0;
1596

    
1597
    /* Decode path ID */
1598
    if (s->add_path)
1599
    {
1600
      if (len < 5)
1601
        bgp_parse_error(s, 1);
1602

    
1603
      path_id = get_u32(pos);
1604
      ADVANCE(pos, len, 4);
1605
    }
1606

    
1607
    /* Decode prefix length */
1608
    uint l = *pos;
1609
    ADVANCE(pos, len, 1);
1610

    
1611
    if (len < ((l + 7) / 8))
1612
      bgp_parse_error(s, 1);
1613

    
1614
    /* Decode MPLS labels */
1615
    if (s->mpls)
1616
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1617

    
1618
    /* Decode route distinguisher */
1619
    if (l < 64)
1620
      bgp_parse_error(s, 1);
1621

    
1622
    u64 rd = get_u64(pos);
1623
    ADVANCE(pos, len, 8);
1624
    l -= 64;
1625

    
1626
    if (l > IP6_MAX_PREFIX_LENGTH)
1627
      bgp_parse_error(s, 10);
1628

    
1629
    /* Decode prefix body */
1630
    ip6_addr addr = IP6_NONE;
1631
    uint b = (l + 7) / 8;
1632
    memcpy(&addr, pos, b);
1633
    ADVANCE(pos, len, b);
1634

    
1635
    net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd);
1636
    net_normalize_vpn6(&net);
1637

    
1638
    // XXXX validate prefix
1639

    
1640
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1641
  }
1642
}
1643

    
1644

    
1645
static uint
1646
bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1647
{
1648
  byte *pos = buf;
1649

    
1650
  while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1651
  {
1652
    struct bgp_prefix *px = HEAD(buck->prefixes);
1653
    struct net_addr_flow4 *net = (void *) px->net;
1654
    uint flen = net->length - sizeof(net_addr_flow4);
1655

    
1656
    /* Encode path ID */
1657
    if (s->add_path)
1658
    {
1659
      put_u32(pos, px->path_id);
1660
      ADVANCE(pos, size, 4);
1661
    }
1662

    
1663
    if (flen > size)
1664
      break;
1665

    
1666
    /* Copy whole flow data including length */
1667
    memcpy(pos, net->data, flen);
1668
    ADVANCE(pos, size, flen);
1669

    
1670
    bgp_free_prefix(s->channel, px);
1671
  }
1672

    
1673
  return pos - buf;
1674
}
1675

    
1676
static void
1677
bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1678
{
1679
  while (len)
1680
  {
1681
    u32 path_id = 0;
1682

    
1683
    /* Decode path ID */
1684
    if (s->add_path)
1685
    {
1686
      if (len < 4)
1687
        bgp_parse_error(s, 1);
1688

    
1689
      path_id = get_u32(pos);
1690
      ADVANCE(pos, len, 4);
1691
    }
1692

    
1693
    if (len < 2)
1694
      bgp_parse_error(s, 1);
1695

    
1696
    /* Decode flow length */
1697
    uint hlen = flow_hdr_length(pos);
1698
    uint dlen = flow_read_length(pos);
1699
    uint flen = hlen + dlen;
1700
    byte *data = pos + hlen;
1701

    
1702
    if (len < flen)
1703
      bgp_parse_error(s, 1);
1704

    
1705
    /* Validate flow data */
1706
    enum flow_validated_state r = flow4_validate(data, dlen);
1707
    if (r != FLOW_ST_VALID)
1708
    {
1709
      log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1710
      bgp_parse_error(s, 1);
1711
    }
1712

    
1713
    if (data[0] != FLOW_TYPE_DST_PREFIX)
1714
    {
1715
      log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1716
      bgp_parse_error(s, 1);
1717
    }
1718

    
1719
    /* Decode dst prefix */
1720
    ip4_addr px = IP4_NONE;
1721
    uint pxlen = data[1];
1722

    
1723
    // FIXME: Use some generic function
1724
    memcpy(&px, data+2, BYTES(pxlen));
1725
    px = ip4_and(ip4_ntoh(px), ip4_mkmask(pxlen));
1726

    
1727
    /* Prepare the flow */
1728
    net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen);
1729
    net_fill_flow4(n, px, pxlen, pos, flen);
1730
    ADVANCE(pos, len, flen);
1731

    
1732
    bgp_rte_update(s, n, path_id, a);
1733
  }
1734
}
1735

    
1736

    
1737
static uint
1738
bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1739
{
1740
  byte *pos = buf;
1741

    
1742
  while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1743
  {
1744
    struct bgp_prefix *px = HEAD(buck->prefixes);
1745
    struct net_addr_flow6 *net = (void *) px->net;
1746
    uint flen = net->length - sizeof(net_addr_flow6);
1747

    
1748
    /* Encode path ID */
1749
    if (s->add_path)
1750
    {
1751
      put_u32(pos, px->path_id);
1752
      ADVANCE(pos, size, 4);
1753
    }
1754

    
1755
    if (flen > size)
1756
      break;
1757

    
1758
    /* Copy whole flow data including length */
1759
    memcpy(pos, net->data, flen);
1760
    ADVANCE(pos, size, flen);
1761

    
1762
    bgp_free_prefix(s->channel, px);
1763
  }
1764

    
1765
  return pos - buf;
1766
}
1767

    
1768
static void
1769
bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1770
{
1771
  while (len)
1772
  {
1773
    u32 path_id = 0;
1774

    
1775
    /* Decode path ID */
1776
    if (s->add_path)
1777
    {
1778
      if (len < 4)
1779
        bgp_parse_error(s, 1);
1780

    
1781
      path_id = get_u32(pos);
1782
      ADVANCE(pos, len, 4);
1783
    }
1784

    
1785
    if (len < 2)
1786
      bgp_parse_error(s, 1);
1787

    
1788
    /* Decode flow length */
1789
    uint hlen = flow_hdr_length(pos);
1790
    uint dlen = flow_read_length(pos);
1791
    uint flen = hlen + dlen;
1792
    byte *data = pos + hlen;
1793

    
1794
    if (len < flen)
1795
      bgp_parse_error(s, 1);
1796

    
1797
    /* Validate flow data */
1798
    enum flow_validated_state r = flow6_validate(data, dlen);
1799
    if (r != FLOW_ST_VALID)
1800
    {
1801
      log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1802
      bgp_parse_error(s, 1);
1803
    }
1804

    
1805
    if (data[0] != FLOW_TYPE_DST_PREFIX)
1806
    {
1807
      log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1808
      bgp_parse_error(s, 1);
1809
    }
1810

    
1811
    /* Decode dst prefix */
1812
    ip6_addr px = IP6_NONE;
1813
    uint pxlen = data[1];
1814

    
1815
    // FIXME: Use some generic function
1816
    memcpy(&px, data+2, BYTES(pxlen));
1817
    px = ip6_and(ip6_ntoh(px), ip6_mkmask(pxlen));
1818

    
1819
    /* Prepare the flow */
1820
    net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen);
1821
    net_fill_flow6(n, px, pxlen, pos, flen);
1822
    ADVANCE(pos, len, flen);
1823

    
1824
    bgp_rte_update(s, n, path_id, a);
1825
  }
1826
}
1827

    
1828

    
1829
static const struct bgp_af_desc bgp_af_table[] = {
1830
  {
1831
    .afi = BGP_AF_IPV4,
1832
    .net = NET_IP4,
1833
    .name = "ipv4",
1834
    .encode_nlri = bgp_encode_nlri_ip4,
1835
    .decode_nlri = bgp_decode_nlri_ip4,
1836
    .encode_next_hop = bgp_encode_next_hop_ip,
1837
    .decode_next_hop = bgp_decode_next_hop_ip,
1838
    .update_next_hop = bgp_update_next_hop_ip,
1839
  },
1840
  {
1841
    .afi = BGP_AF_IPV4_MC,
1842
    .net = NET_IP4,
1843
    .name = "ipv4-mc",
1844
    .encode_nlri = bgp_encode_nlri_ip4,
1845
    .decode_nlri = bgp_decode_nlri_ip4,
1846
    .encode_next_hop = bgp_encode_next_hop_ip,
1847
    .decode_next_hop = bgp_decode_next_hop_ip,
1848
    .update_next_hop = bgp_update_next_hop_ip,
1849
  },
1850
  {
1851
    .afi = BGP_AF_IPV4_MPLS,
1852
    .net = NET_IP4,
1853
    .mpls = 1,
1854
    .name = "ipv4-mpls",
1855
    .encode_nlri = bgp_encode_nlri_ip4,
1856
    .decode_nlri = bgp_decode_nlri_ip4,
1857
    .encode_next_hop = bgp_encode_next_hop_ip,
1858
    .decode_next_hop = bgp_decode_next_hop_ip,
1859
    .update_next_hop = bgp_update_next_hop_ip,
1860
  },
1861
  {
1862
    .afi = BGP_AF_IPV6,
1863
    .net = NET_IP6,
1864
    .name = "ipv6",
1865
    .encode_nlri = bgp_encode_nlri_ip6,
1866
    .decode_nlri = bgp_decode_nlri_ip6,
1867
    .encode_next_hop = bgp_encode_next_hop_ip,
1868
    .decode_next_hop = bgp_decode_next_hop_ip,
1869
    .update_next_hop = bgp_update_next_hop_ip,
1870
  },
1871
  {
1872
    .afi = BGP_AF_IPV6_MC,
1873
    .net = NET_IP6,
1874
    .name = "ipv6-mc",
1875
    .encode_nlri = bgp_encode_nlri_ip6,
1876
    .decode_nlri = bgp_decode_nlri_ip6,
1877
    .encode_next_hop = bgp_encode_next_hop_ip,
1878
    .decode_next_hop = bgp_decode_next_hop_ip,
1879
    .update_next_hop = bgp_update_next_hop_ip,
1880
  },
1881
  {
1882
    .afi = BGP_AF_IPV6_MPLS,
1883
    .net = NET_IP6,
1884
    .mpls = 1,
1885
    .name = "ipv6-mpls",
1886
    .encode_nlri = bgp_encode_nlri_ip6,
1887
    .decode_nlri = bgp_decode_nlri_ip6,
1888
    .encode_next_hop = bgp_encode_next_hop_ip,
1889
    .decode_next_hop = bgp_decode_next_hop_ip,
1890
    .update_next_hop = bgp_update_next_hop_ip,
1891
  },
1892
  {
1893
    .afi = BGP_AF_VPN4_MPLS,
1894
    .net = NET_VPN4,
1895
    .mpls = 1,
1896
    .name = "vpn4-mpls",
1897
    .encode_nlri = bgp_encode_nlri_vpn4,
1898
    .decode_nlri = bgp_decode_nlri_vpn4,
1899
    .encode_next_hop = bgp_encode_next_hop_vpn,
1900
    .decode_next_hop = bgp_decode_next_hop_vpn,
1901
    .update_next_hop = bgp_update_next_hop_ip,
1902
  },
1903
  {
1904
    .afi = BGP_AF_VPN6_MPLS,
1905
    .net = NET_VPN6,
1906
    .mpls = 1,
1907
    .name = "vpn6-mpls",
1908
    .encode_nlri = bgp_encode_nlri_vpn6,
1909
    .decode_nlri = bgp_decode_nlri_vpn6,
1910
    .encode_next_hop = bgp_encode_next_hop_vpn,
1911
    .decode_next_hop = bgp_decode_next_hop_vpn,
1912
    .update_next_hop = bgp_update_next_hop_ip,
1913
  },
1914
  {
1915
    .afi = BGP_AF_VPN4_MC,
1916
    .net = NET_VPN4,
1917
    .name = "vpn4-mc",
1918
    .encode_nlri = bgp_encode_nlri_vpn4,
1919
    .decode_nlri = bgp_decode_nlri_vpn4,
1920
    .encode_next_hop = bgp_encode_next_hop_vpn,
1921
    .decode_next_hop = bgp_decode_next_hop_vpn,
1922
    .update_next_hop = bgp_update_next_hop_ip,
1923
  },
1924
  {
1925
    .afi = BGP_AF_VPN6_MC,
1926
    .net = NET_VPN6,
1927
    .name = "vpn6-mc",
1928
    .encode_nlri = bgp_encode_nlri_vpn6,
1929
    .decode_nlri = bgp_decode_nlri_vpn6,
1930
    .encode_next_hop = bgp_encode_next_hop_vpn,
1931
    .decode_next_hop = bgp_decode_next_hop_vpn,
1932
    .update_next_hop = bgp_update_next_hop_ip,
1933
  },
1934
  {
1935
    .afi = BGP_AF_FLOW4,
1936
    .net = NET_FLOW4,
1937
    .no_igp = 1,
1938
    .name = "flow4",
1939
    .encode_nlri = bgp_encode_nlri_flow4,
1940
    .decode_nlri = bgp_decode_nlri_flow4,
1941
    .encode_next_hop = bgp_encode_next_hop_none,
1942
    .decode_next_hop = bgp_decode_next_hop_none,
1943
    .update_next_hop = bgp_update_next_hop_none,
1944
  },
1945
  {
1946
    .afi = BGP_AF_FLOW6,
1947
    .net = NET_FLOW6,
1948
    .no_igp = 1,
1949
    .name = "flow6",
1950
    .encode_nlri = bgp_encode_nlri_flow6,
1951
    .decode_nlri = bgp_decode_nlri_flow6,
1952
    .encode_next_hop = bgp_encode_next_hop_none,
1953
    .decode_next_hop = bgp_decode_next_hop_none,
1954
    .update_next_hop = bgp_update_next_hop_none,
1955
  },
1956
};
1957

    
1958
const struct bgp_af_desc *
1959
bgp_get_af_desc(u32 afi)
1960
{
1961
  uint i;
1962
  for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++)
1963
    if (bgp_af_table[i].afi == afi)
1964
      return &bgp_af_table[i];
1965

    
1966
  return NULL;
1967
}
1968

    
1969
static inline uint
1970
bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
1971
{
1972
  return s->channel->desc->encode_nlri(s, buck, buf, end - buf);
1973
}
1974

    
1975
static inline uint
1976
bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf)
1977
{
1978
  return s->channel->desc->encode_next_hop(s, nh, buf, 255);
1979
}
1980

    
1981
void
1982
bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to)
1983
{
1984
  s->channel->desc->update_next_hop(s, a, to);
1985
}
1986

    
1987
#define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
1988

    
1989
/*
 * Build the body of an UPDATE announcing prefixes of @buck in the plain
 * (non-MP) IPv4 format. Returns the end of the written data, or NULL when the
 * attributes could not be encoded; the bucket is then passed to
 * bgp_withdraw_bucket().
 */
static byte *
bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   *	2 B	Withdrawn Routes Length (zero)
   *	---	IPv4 Withdrawn Routes NLRI (unused)
   *	2 B	Total Path Attribute Length
   *	var	Path Attributes
   *	var	IPv4 Network Layer Reachability Information
   */

  byte *attrs = buf + 4;

  int la = bgp_encode_attrs(s, buck->eattrs, attrs, buf + MAX_ATTRS_LENGTH);
  if (la < 0)
  {
    /* Attribute list too long */
    bgp_withdraw_bucket(s->channel, buck);
    return NULL;
  }

  /* The two length fields in front of the attributes */
  put_u16(buf+0, 0);
  put_u16(buf+2, la);

  /* NLRI follows right after the attributes */
  byte *nlri = attrs + la;
  int lr = bgp_encode_nlri(s, buck, nlri, end);

  return nlri + lr;
}
2017

    
2018
/*
 * Build the body of an UPDATE announcing prefixes of @buck inside an
 * MP_REACH_NLRI attribute, followed by the remaining path attributes.
 * Returns the end of the written data, or NULL when the attributes could not
 * be encoded; the bucket is then passed to bgp_withdraw_bucket().
 */
static byte *
bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   *	2 B	IPv4 Withdrawn Routes Length (zero)
   *	---	IPv4 Withdrawn Routes NLRI (unused)
   *	2 B	Total Path Attribute Length
   *	1 B	MP_REACH_NLRI hdr - Attribute Flags
   *	1 B	MP_REACH_NLRI hdr - Attribute Type Code
   *	2 B	MP_REACH_NLRI hdr - Length of Attribute Data
   *	2 B	MP_REACH_NLRI data - Address Family Identifier
   *	1 B	MP_REACH_NLRI data - Subsequent Address Family Identifier
   *	1 B	MP_REACH_NLRI data - Length of Next Hop Network Address
   *	var	MP_REACH_NLRI data - Network Address of Next Hop
   *	1 B	MP_REACH_NLRI data - Reserved (zero)
   *	var	MP_REACH_NLRI data - Network Layer Reachability Information
   *	var	Rest of Path Attributes
   *	---	IPv4 Network Layer Reachability Information (unused)
   */

  /* Begin of MP_REACH_NLRI attribute */
  buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
  buf[5] = BA_MP_REACH_NLRI;
  put_u16(buf+6, 0);		/* Will be fixed later */
  put_af3(buf+8, s->channel->afi);

  byte *nh_len = buf + 11;	/* Next hop length byte */
  byte *nh = nh_len + 1;	/* Next hop address */

  /* Encode attributes to temporary buffer, they are placed behind the NLRI */
  byte *tmp_attrs = alloca(MAX_ATTRS_LENGTH);
  int la = bgp_encode_attrs(s, buck->eattrs, tmp_attrs, tmp_attrs + MAX_ATTRS_LENGTH);
  if (la < 0)
  {
    /* Attribute list too long */
    bgp_withdraw_bucket(s->channel, buck);
    return NULL;
  }

  /* Encode the next hop and store its length */
  int lh = bgp_encode_next_hop(s, s->mp_next_hop, nh);
  *nh_len = lh;
  byte *wr = nh + lh;

  /* Reserved field */
  *wr++ = 0;

  /* Encode the NLRI, keeping room for the attributes behind it */
  int lr = bgp_encode_nlri(s, buck, wr, end - la);
  wr += lr;

  /* End of MP_REACH_NLRI attribute, update data length */
  put_u16(buf+6, wr-buf-8);

  /* Copy remaining attributes */
  memcpy(wr, tmp_attrs, la);
  wr += la;

  /* Initial UPDATE fields */
  put_u16(buf+0, 0);
  put_u16(buf+2, wr-buf-4);

  return wr;
}
2082

    
2083
#undef MAX_ATTRS_LENGTH
2084

    
2085
/*
 * Build the body of an UPDATE withdrawing prefixes of @buck in the plain
 * (non-MP) IPv4 format. Returns the end of the written data.
 */
static byte *
bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   *	2 B	Withdrawn Routes Length
   *	var	IPv4 Withdrawn Routes NLRI
   *	2 B	Total Path Attribute Length (zero)
   *	---	Path Attributes (unused)
   *	---	IPv4 Network Layer Reachability Information (unused)
   */

  byte *nlri = buf + 2;
  uint nlri_len = bgp_encode_nlri(s, buck, nlri, end);

  /* Withdrawn routes length in front, zero attribute length behind the NLRI */
  put_u16(buf+0, nlri_len);
  put_u16(nlri + nlri_len, 0);

  return nlri + nlri_len + 2;
}
2103

    
2104
/*
 * Build the body of an UPDATE withdrawing prefixes of @buck inside an
 * MP_UNREACH_NLRI attribute. Returns the end of the written data.
 */
static byte *
bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   *	2 B	Withdrawn Routes Length (zero)
   *	---	IPv4 Withdrawn Routes NLRI (unused)
   *	2 B	Total Path Attribute Length
   *	1 B	MP_UNREACH_NLRI hdr - Attribute Flags
   *	1 B	MP_UNREACH_NLRI hdr - Attribute Type Code
   *	2 B	MP_UNREACH_NLRI hdr - Length of Attribute Data
   *	2 B	MP_UNREACH_NLRI data - Address Family Identifier
   *	1 B	MP_UNREACH_NLRI data - Subsequent Address Family Identifier
   *	var	MP_UNREACH_NLRI data - Network Layer Reachability Information
   *	---	IPv4 Network Layer Reachability Information (unused)
   */

  /* NLRI goes first, as all the length fields depend on its size */
  byte *nlri = buf + 11;
  uint nlri_len = bgp_encode_nlri(s, buck, nlri, end);

  /* Initial UPDATE fields; 7 = attribute header (4 B) + AFI/SAFI (3 B) */
  put_u16(buf+0, 0);
  put_u16(buf+2, 7+nlri_len);

  /* Begin of MP_UNREACH_NLRI attribute */
  buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
  buf[5] = BA_MP_UNREACH_NLRI;
  put_u16(buf+6, 3+nlri_len);
  put_af3(buf+8, s->channel->afi);

  return nlri + nlri_len;
}
2133

    
2134
/*
 * Build the body of the next UPDATE for channel @c into @buf. Pending
 * withdrawals take precedence over reachable buckets. Returns the end of the
 * written data, or NULL when the channel has no more prefixes to send.
 */
static byte *
bgp_create_update(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  struct bgp_bucket *buck;
  struct bgp_write_state s;
  byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH);
  byte *res = NULL;
  int use_mp = (c->afi != BGP_AF_IPV4) || c->ext_next_hop;

  for (;;)
  {
    /* Start every attempt with a freshly initialized write state */
    s = (struct bgp_write_state) {
      .proto = p,
      .channel = c,
      .pool = bgp_linpool,
      .mp_reach = use_mp,
      .as4_session = p->as4_session,
      .add_path = c->add_path_tx,
      .mpls = c->desc->mpls,
    };

    /* Try unreachable bucket */
    buck = c->withdraw_bucket;
    if (buck && !EMPTY_LIST(buck->prefixes))
    {
      if (use_mp)
        res = bgp_create_mp_unreach(&s, buck, buf, end);
      else
        res = bgp_create_ip_unreach(&s, buck, buf, end);

      break;
    }

    /* No more prefixes to send */
    if (EMPTY_LIST(c->bucket_queue))
      return NULL;

    /* Try reachable buckets */
    buck = HEAD(c->bucket_queue);

    /* Cleanup empty buckets */
    if (EMPTY_LIST(buck->prefixes))
    {
      bgp_free_bucket(c, buck);
      continue;
    }

    if (s.mp_reach)
      res = bgp_create_mp_reach(&s, buck, buf, end);
    else
      res = bgp_create_ip_reach(&s, buck, buf, end);

    /* Drop the bucket once drained, otherwise defer it */
    if (EMPTY_LIST(buck->prefixes))
      bgp_free_bucket(c, buck);
    else
      bgp_defer_bucket(c, buck);

    /* Nothing was produced for this bucket, try the next one */
    if (res)
      break;
  }

  BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
  lp_flush(s.pool);

  return res;
}
2201

    
2202
static byte *
2203
bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf)
2204
{
2205
  /* Empty update packet */
2206
  put_u32(buf, 0);
2207

    
2208
  return buf+4;
2209
}
2210

    
2211
/*
 * MP End-of-RIB marker: an UPDATE body whose only attribute is an
 * MP_UNREACH_NLRI carrying just the AFI/SAFI of channel @c and no NLRI.
 */
static byte *
bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf)
{
  byte *attr = buf + 4;

  put_u16(buf + 0, 0);                /* No IPv4 withdrawn routes */
  put_u16(buf + 2, 6);                /* Attributes occupy bytes 4--9 */

  /* Empty MP_UNREACH_NLRI attribute */
  attr[0] = BAF_OPTIONAL;
  attr[1] = BA_MP_UNREACH_NLRI;
  attr[2] = 3;                        /* Attribute data occupy bytes 7--9 */
  put_af3(attr + 3, c->afi);

  return buf + 10;
}
2225

    
2226
/*
 * Build an End-of-RIB marker for channel @c, using the plain IPv4 form for
 * the IPv4 address family and the MP form for everything else.
 */
static byte *
bgp_create_end_mark(struct bgp_channel *c, byte *buf)
{
  /* Referenced by name from BGP_TRACE() */
  struct bgp_proto *p = (void *) c->c.proto;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");

  if (c->afi == BGP_AF_IPV4)
    return bgp_create_ip_end_mark(c, buf);

  return bgp_create_mp_end_mark(c, buf);
}
2237

    
2238
static inline void
2239
bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi)
2240
{
2241
  struct bgp_proto *p = s->proto;
2242
  struct bgp_channel *c = bgp_get_channel(p, afi);
2243

    
2244
  BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
2245

    
2246
  if (!c)
2247
    DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2248

    
2249
  if (c->load_state == BFS_LOADING)
2250
    c->load_state = BFS_NONE;
2251

    
2252
  if (p->p.gr_recovery)
2253
    channel_graceful_restart_unlock(&c->c);
2254

    
2255
  if (c->gr_active)
2256
    bgp_graceful_restart_done(c);
2257
}
2258

    
2259
static inline void
2260
bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len)
2261
{
2262
  struct bgp_channel *c = bgp_get_channel(s->proto, afi);
2263
  rta *a = NULL;
2264

    
2265
  if (!c)
2266
    DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2267

    
2268
  s->channel = c;
2269
  s->add_path = c->add_path_rx;
2270
  s->mpls = c->desc->mpls;
2271

    
2272
  s->last_id = 0;
2273
  s->last_src = s->proto->p.main_source;
2274

    
2275
  /*
2276
   * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
2277
   * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
2278
   * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
2279
   * decode_next_hop hooks) by restoring a->eattrs afterwards.
2280
   */
2281

    
2282
  if (ea)
2283
  {
2284
    a = allocz(RTA_MAX_SIZE);
2285

    
2286
    a->source = RTS_BGP;
2287
    a->scope = SCOPE_UNIVERSE;
2288
    a->from = s->proto->cf->remote_ip;
2289
    a->eattrs = ea;
2290

    
2291
    c->desc->decode_next_hop(s, nh, nh_len, a);
2292

    
2293
    /* Handle withdraw during next hop decoding */
2294
    if (s->err_withdraw)
2295
      a = NULL;
2296
  }
2297

    
2298
  c->desc->decode_nlri(s, nlri, len, a);
2299

    
2300
  rta_free(s->cached_rta);
2301
  s->cached_rta = NULL;
2302
}
2303

    
2304
static void
2305
bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
2306
{
2307
  struct bgp_proto *p = conn->bgp;
2308
  ea_list *ea = NULL;
2309

    
2310
  BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
2311

    
2312
  /* Workaround for some BGP implementations that skip initial KEEPALIVE */
2313
  if (conn->state == BS_OPENCONFIRM)
2314
    bgp_conn_enter_established_state(conn);
2315

    
2316
  if (conn->state != BS_ESTABLISHED)
2317
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2318

    
2319
  bgp_start_timer(conn->hold_timer, conn->hold_time);
2320

    
2321
  /* Initialize parse state */
2322
  struct bgp_parse_state s = {
2323
    .proto = p,
2324
    .pool = bgp_linpool,
2325
    .as4_session = p->as4_session,
2326
  };
2327

    
2328
  /* Parse error handler */
2329
  if (setjmp(s.err_jmpbuf))
2330
  {
2331
    bgp_error(conn, 3, s.err_subcode, NULL, 0);
2332
    goto done;
2333
  }
2334

    
2335
  /* Check minimal length */
2336
  if (len < 23)
2337
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2338

    
2339
  /* Skip fixed header */
2340
  uint pos = 19;
2341

    
2342
  /*
2343
   *        UPDATE message format
2344
   *
2345
   *        2 B        IPv4 Withdrawn Routes Length
2346
   *        var        IPv4 Withdrawn Routes NLRI
2347
   *        2 B        Total Path Attribute Length
2348
   *        var        Path Attributes
2349
   *        var        IPv4 Reachable Routes NLRI
2350
   */
2351

    
2352
  s.ip_unreach_len = get_u16(pkt + pos);
2353
  s.ip_unreach_nlri = pkt + pos + 2;
2354
  pos += 2 + s.ip_unreach_len;
2355

    
2356
  if (pos + 2 > len)
2357
    bgp_parse_error(&s, 1);
2358

    
2359
  s.attr_len = get_u16(pkt + pos);
2360
  s.attrs = pkt + pos + 2;
2361
  pos += 2 + s.attr_len;
2362

    
2363
  if (pos > len)
2364
    bgp_parse_error(&s, 1);
2365

    
2366
  s.ip_reach_len = len - pos;
2367
  s.ip_reach_nlri = pkt + pos;
2368

    
2369

    
2370
  if (s.attr_len)
2371
    ea = bgp_decode_attrs(&s, s.attrs, s.attr_len);
2372
  else
2373
    ea = NULL;
2374

    
2375
  /* Check for End-of-RIB marker */
2376
  if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len)
2377
  { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; }
2378

    
2379
  /* Check for MP End-of-RIB marker */
2380
  if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len &&
2381
      !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af)
2382
  { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; }
2383

    
2384
  if (s.ip_unreach_len)
2385
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0);
2386

    
2387
  if (s.mp_unreach_len)
2388
    bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0);
2389

    
2390
  if (s.ip_reach_len)
2391
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len,
2392
                    ea, s.ip_next_hop_data, s.ip_next_hop_len);
2393

    
2394
  if (s.mp_reach_len)
2395
    bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len,
2396
                    ea, s.mp_next_hop_data, s.mp_next_hop_len);
2397

    
2398
done:
2399
  rta_free(s.cached_rta);
2400
  lp_flush(s.pool);
2401
  return;
2402
}
2403

    
2404
static uint
2405
bgp_find_update_afi(byte *pos, uint len)
2406
{
2407
  /*
2408
   * This is stripped-down version of bgp_rx_update(), bgp_decode_attrs() and
2409
   * bgp_decode_mp_[un]reach_nlri() used by MRT code in order to find out which
2410
   * AFI/SAFI is associated with incoming UPDATE. Returns 0 for framing errors.
2411
   */
2412
  if (len < 23)
2413
    return 0;
2414

    
2415
  /* Assume there is no withrawn NLRI, read lengths and move to attribute list */
2416
  uint wlen = get_u16(pos + 19);
2417
  uint alen = get_u16(pos + 21);
2418
  ADVANCE(pos, len, 23);
2419

    
2420
  /* Either non-zero withdrawn NLRI, non-zero reachable NLRI, or IPv4 End-of-RIB */
2421
  if ((wlen != 0) || (alen < len) || !alen)
2422
    return BGP_AF_IPV4;
2423

    
2424
  if (alen > len)
2425
    return 0;
2426

    
2427
  /* Process attribute list (alen == len) */
2428
  while (len)
2429
  {
2430
    if (len < 2)
2431
      return 0;
2432

    
2433
    uint flags = pos[0];
2434
    uint code = pos[1];
2435
    ADVANCE(pos, len, 2);
2436

    
2437
    uint ll = !(flags & BAF_EXT_LEN) ? 1 : 2;
2438
    if (len < ll)
2439
      return 0;
2440

    
2441
    /* Read attribute length and move to attribute body */
2442
    alen = (ll == 1) ? get_u8(pos) : get_u16(pos);
2443
    ADVANCE(pos, len, ll);
2444

    
2445
    if (len < alen)
2446
      return 0;
2447

    
2448
    /* Found MP NLRI */
2449
    if ((code == BA_MP_REACH_NLRI) || (code == BA_MP_UNREACH_NLRI))
2450
    {
2451
      if (alen < 3)
2452
        return 0;
2453

    
2454
      return BGP_AF(get_u16(pos), pos[2]);
2455
    }
2456

    
2457
    /* Move to the next attribute */
2458
    ADVANCE(pos, len, alen);
2459
  }
2460

    
2461
  /* No basic or MP NLRI, but there are some attributes -> error */
2462
  return 0;
2463
}
2464

    
2465

    
2466
/*
2467
 *        ROUTE-REFRESH
2468
 */
2469

    
2470
/*
 * Build the body of a ROUTE-REFRESH request (RFC 2918) for channel @c:
 * the channel's address family with the subtype byte set to "request".
 */
static inline byte *
bgp_create_route_refresh(struct bgp_channel *c, byte *buf)
{
  /* Referenced by name from BGP_TRACE() */
  struct bgp_proto *p = (void *) c->c.proto;
  byte *body_end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");

  put_af4(buf, c->afi);
  buf[2] = BGP_RR_REQUEST;

  return body_end;
}
2483

    
2484
/*
 * Build the body of a Begin-of-Route-Refresh (BoRR, RFC 7313) message for
 * channel @c: the channel's address family with the subtype byte set to
 * "begin".
 */
static inline byte *
bgp_create_begin_refresh(struct bgp_channel *c, byte *buf)
{
  /* Referenced by name from BGP_TRACE() */
  struct bgp_proto *p = (void *) c->c.proto;
  byte *body_end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");

  put_af4(buf, c->afi);
  buf[2] = BGP_RR_BEGIN;

  return body_end;
}
2497

    
2498
/*
 * Build the body of an End-of-Route-Refresh (EoRR, RFC 7313) message for
 * channel @c: the channel's address family with the subtype byte set to
 * "end".
 */
static inline byte *
bgp_create_end_refresh(struct bgp_channel *c, byte *buf)
{
  /* Referenced by name from BGP_TRACE() */
  struct bgp_proto *p = (void *) c->c.proto;
  byte *body_end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RR");

  put_af4(buf, c->afi);
  buf[2] = BGP_RR_END;

  return body_end;
}
2511

    
2512
static void
2513
bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
2514
{
2515
  struct bgp_proto *p = conn->bgp;
2516

    
2517
  if (conn->state != BS_ESTABLISHED)
2518
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2519

    
2520
  if (!conn->local_caps->route_refresh)
2521
  { bgp_error(conn, 1, 3, pkt+18, 1); return; }
2522

    
2523
  if (len < (BGP_HEADER_LENGTH + 4))
2524
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2525

    
2526
  if (len > (BGP_HEADER_LENGTH + 4))
2527
  { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
2528

    
2529
  struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19));
2530
  if (!c)
2531
  {
2532
    log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
2533
        p->p.name, pkt[21], get_u16(pkt+19), pkt[22]);
2534
    return;
2535
  }
2536

    
2537
  /* RFC 7313 redefined reserved field as RR message subtype */
2538
  uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST;
2539

    
2540
  switch (subtype)
2541
  {
2542
  case BGP_RR_REQUEST:
2543
    BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
2544
    channel_request_feeding(&c->c);
2545
    break;
2546

    
2547
  case BGP_RR_BEGIN:
2548
    BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
2549
    bgp_refresh_begin(c);
2550
    break;
2551

    
2552
  case BGP_RR_END:
2553
    BGP_TRACE(D_PACKETS, "Got END-OF-RR");
2554
    bgp_refresh_end(c);
2555
    break;
2556

    
2557
  default:
2558
    log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
2559
        p->p.name, subtype);
2560
    break;
2561
  }
2562
}
2563

    
2564
static inline struct bgp_channel *
2565
bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn)
2566
{
2567
  uint i = conn->last_channel;
2568

    
2569
  /* Try the last channel, but at most several times */
2570
  if ((conn->channels_to_send & (1 << i)) &&
2571
      (conn->last_channel_count < 16))
2572
    goto found;
2573

    
2574
  /* Find channel with non-zero channels_to_send */
2575
  do
2576
  {
2577
    i++;
2578
    if (i >= p->channel_count)
2579
      i = 0;
2580
  }
2581
  while (! (conn->channels_to_send & (1 << i)));
2582

    
2583
  /* Use that channel */
2584
  conn->last_channel = i;
2585
  conn->last_channel_count = 0;
2586

    
2587
found:
2588
  conn->last_channel_count++;
2589
  return p->channel_map[i];
2590
}
2591

    
2592
static inline int
2593
bgp_send(struct bgp_conn *conn, uint type, uint len)
2594
{
2595
  sock *sk = conn->sk;
2596
  byte *buf = sk->tbuf;
2597

    
2598
  memset(buf, 0xff, 16);                /* Marker */
2599
  put_u16(buf+16, len);
2600
  buf[18] = type;
2601

    
2602
  return sk_send(sk, len);
2603
}
2604

    
2605
/**
2606
 * bgp_fire_tx - transmit packets
2607
 * @conn: connection
2608
 *
2609
 * Whenever the transmit buffers of the underlying TCP connection
2610
 * are free and we have any packets queued for sending, the socket functions
2611
 * call bgp_fire_tx() which takes care of selecting the highest priority packet
2612
 * queued (Notification > Keepalive > Open > Update), assembling its header
2613
 * and body and sending it to the connection.
2614
 */
2615
static int
2616
bgp_fire_tx(struct bgp_conn *conn)
2617
{
2618
  struct bgp_proto *p = conn->bgp;
2619
  struct bgp_channel *c;
2620
  byte *buf, *pkt, *end;
2621
  uint s;
2622

    
2623
  if (!conn->sk)
2624
    return 0;
2625

    
2626
  buf = conn->sk->tbuf;
2627
  pkt = buf + BGP_HEADER_LENGTH;
2628
  s = conn->packets_to_send;
2629

    
2630
  if (s & (1 << PKT_SCHEDULE_CLOSE))
2631
  {
2632
    /* We can finally close connection and enter idle state */
2633
    bgp_conn_enter_idle_state(conn);
2634
    return 0;
2635
  }
2636
  if (s & (1 << PKT_NOTIFICATION))
2637
  {
2638
    conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE;
2639
    end = bgp_create_notification(conn, pkt);
2640
    return bgp_send(conn, PKT_NOTIFICATION, end - buf);
2641
  }
2642
  else if (s & (1 << PKT_KEEPALIVE))
2643
  {
2644
    conn->packets_to_send &= ~(1 << PKT_KEEPALIVE);
2645
    BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
2646
    bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
2647
    return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH);
2648
  }
2649
  else if (s & (1 << PKT_OPEN))
2650
  {
2651
    conn->packets_to_send &= ~(1 << PKT_OPEN);
2652
    end = bgp_create_open(conn, pkt);
2653
    return bgp_send(conn, PKT_OPEN, end - buf);
2654
  }
2655
  else while (conn->channels_to_send)
2656
  {
2657
    c = bgp_get_channel_to_send(p, conn);
2658
    s = c->packets_to_send;
2659

    
2660
    if (s & (1 << PKT_ROUTE_REFRESH))
2661
    {
2662
      c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH);
2663
      end = bgp_create_route_refresh(c, pkt);
2664
      return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2665
    }
2666
    else if (s & (1 << PKT_BEGIN_REFRESH))
2667
    {
2668
      /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
2669
      c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH);
2670
      end = bgp_create_begin_refresh(c, pkt);
2671
      return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2672
    }
2673
    else if (s & (1 << PKT_UPDATE))
2674
    {
2675
      end = bgp_create_update(c, pkt);
2676
      if (end)
2677
        return bgp_send(conn, PKT_UPDATE, end - buf);
2678

    
2679
      /* No update to send, perhaps we need to send End-of-RIB or EoRR */
2680
      c->packets_to_send = 0;
2681
      conn->channels_to_send &= ~(1 << c->index);
2682

    
2683
      if (c->feed_state == BFS_LOADED)
2684
      {
2685
        c->feed_state = BFS_NONE;
2686
        end = bgp_create_end_mark(c, pkt);
2687
        return bgp_send(conn, PKT_UPDATE, end - buf);
2688
      }
2689

    
2690
      else if (c->feed_state == BFS_REFRESHED)
2691
      {
2692
        c->feed_state = BFS_NONE;
2693
        end = bgp_create_end_refresh(c, pkt);
2694
        return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2695
      }
2696
    }
2697
    else if (s)
2698
      bug("Channel packets_to_send: %x", s);
2699

    
2700
    c->packets_to_send = 0;
2701
    conn->channels_to_send &= ~(1 << c->index);
2702
  }
2703

    
2704
  return 0;
2705
}
2706

    
2707
/**
2708
 * bgp_schedule_packet - schedule a packet for transmission
2709
 * @conn: connection
2710
 * @c: channel
2711
 * @type: packet type
2712
 *
2713
 * Schedule a packet of type @type to be sent as soon as possible.
2714
 */
2715
void
2716
bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
2717
{
2718
  ASSERT(conn->sk);
2719

    
2720
  DBG("BGP: Scheduling packet type %d\n", type);
2721

    
2722
  if (c)
2723
  {
2724
    if (! conn->channels_to_send)
2725
    {
2726
      conn->last_channel = c->index;
2727
      conn->last_channel_count = 0;
2728
    }
2729

    
2730
    c->packets_to_send |= 1 << type;
2731
    conn->channels_to_send |= 1 << c->index;
2732
  }
2733
  else
2734
    conn->packets_to_send |= 1 << type;
2735

    
2736
  if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev))
2737
    ev_schedule(conn->tx_ev);
2738
}
2739

    
2740
/*
 * Hook taking the connection as an untyped pointer: keep calling
 * bgp_fire_tx() for as long as it returns a positive value.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;
  int more;

  DBG("BGP: kicking TX\n");

  do
    more = (bgp_fire_tx(conn) > 0);
  while (more);
}
2749

    
2750
void
2751
bgp_tx(sock *sk)
2752
{
2753
  struct bgp_conn *conn = sk->data;
2754

    
2755
  DBG("BGP: TX hook\n");
2756
  while (bgp_fire_tx(conn) > 0)
2757
    ;
2758
}
2759

    
2760

    
2761
static struct {
2762
  byte major, minor;
2763
  byte *msg;
2764
} bgp_msg_table[] = {
2765
  { 1, 0, "Invalid message header" },
2766
  { 1, 1, "Connection not synchronized" },
2767
  { 1, 2, "Bad message length" },
2768
  { 1, 3, "Bad message type" },
2769
  { 2, 0, "Invalid OPEN message" },
2770
  { 2, 1, "Unsupported version number" },
2771
  { 2, 2, "Bad peer AS" },
2772
  { 2, 3, "Bad BGP identifier" },
2773
  { 2, 4, "Unsupported optional parameter" },
2774
  { 2, 5, "Authentication failure" },
2775
  { 2, 6, "Unacceptable hold time" },
2776
  { 2, 7, "Required capability missing" }, /* [RFC5492] */
2777
  { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
2778
  { 3, 0, "Invalid UPDATE message" },
2779
  { 3, 1, "Malformed attribute list" },
2780
  { 3, 2, "Unrecognized well-known attribute" },
2781
  { 3, 3, "Missing mandatory attribute" },
2782
  { 3, 4, "Invalid attribute flags" },
2783
  { 3, 5, "Invalid attribute length" },
2784
  { 3, 6, "Invalid ORIGIN attribute" },
2785
  { 3, 7, "AS routing loop" },                /* Deprecated */
2786
  { 3, 8, "Invalid NEXT_HOP attribute" },
2787
  { 3, 9, "Optional attribute error" },
2788
  { 3, 10, "Invalid network field" },
2789
  { 3, 11, "Malformed AS_PATH" },
2790
  { 4, 0, "Hold timer expired" },
2791
  { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
2792
  { 5, 1, "Unexpected message in OpenSent state" },
2793
  { 5, 2, "Unexpected message in OpenConfirm state" },
2794
  { 5, 3, "Unexpected message in Established state" },
2795
  { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
2796
  { 6, 1, "Maximum number of prefixes reached" },
2797
  { 6, 2, "Administrative shutdown" },
2798
  { 6, 3, "Peer de-configured" },
2799
  { 6, 4, "Administrative reset" },
2800
  { 6, 5, "Connection rejected" },
2801
  { 6, 6, "Other configuration change" },
2802
  { 6, 7, "Connection collision resolution" },
2803
  { 6, 8, "Out of Resources" },
2804
  { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
2805
  { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
2806
};
2807

    
2808
/**
2809
 * bgp_error_dsc - return BGP error description
2810
 * @code: BGP error code
2811
 * @subcode: BGP error subcode
2812
 *
2813
 * bgp_error_dsc() returns error description for BGP errors
2814
 * which might be static string or given temporary buffer.
2815
 */
2816
const char *
2817
bgp_error_dsc(uint code, uint subcode)
2818
{
2819
  static char buff[32];
2820
  uint i;
2821

    
2822
  for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
2823
    if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
2824
      return bgp_msg_table[i].msg;
2825

    
2826
  bsprintf(buff, "Unknown error %u.%u", code, subcode);
2827
  return buff;
2828
}
2829

    
2830
/* RFC 8203 - shutdown communication message */
2831
static int
2832
bgp_handle_message(struct bgp_proto *p, byte *data, uint len, byte **bp)
2833
{
2834
  byte *msg = data + 1;
2835
  uint msg_len = data[0];
2836
  uint i;
2837

    
2838
  /* Handle zero length message */
2839
  if (msg_len == 0)
2840
    return 1;
2841

    
2842
  /* Handle proper message */
2843
  if ((msg_len > 128) && (msg_len + 1 > len))
2844
    return 0;
2845

    
2846
  /* Some elementary cleanup */
2847
  for (i = 0; i < msg_len; i++)
2848
    if (msg[i] < ' ')
2849
      msg[i] = ' ';
2850

    
2851
  proto_set_message(&p->p, msg, msg_len);
2852
  *bp += bsprintf(*bp, ": \"%s\"", p->p.message);
2853
  return 1;
2854
}
2855

    
2856
void
2857
bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len)
2858
{
2859
  byte argbuf[256], *t = argbuf;
2860
  uint i;
2861

    
2862
  /* Don't report Cease messages generated by myself */
2863
  if (code == 6 && class == BE_BGP_TX)
2864
    return;
2865

    
2866
  /* Reset shutdown message */
2867
  if ((code == 6) && ((subcode == 2) || (subcode == 4)))
2868
    proto_set_message(&p->p, NULL, 0);
2869

    
2870
  if (len)
2871
    {
2872
      /* Bad peer AS - we would like to print the AS */
2873
      if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
2874
        {
2875
          t += bsprintf(t, ": %u", (len == 2) ? get_u16(data) : get_u32(data));
2876
          goto done;
2877
        }
2878

    
2879
      /* RFC 8203 - shutdown communication */
2880
      if (((code == 6) && ((subcode == 2) || (subcode == 4))))
2881
        if (bgp_handle_message(p, data, len, &t))
2882
          goto done;
2883

    
2884
      *t++ = ':';
2885
      *t++ = ' ';
2886
      if (len > 16)
2887
        len = 16;
2888
      for (i=0; i<len; i++)
2889
        t += bsprintf(t, "%02x", data[i]);
2890
    }
2891

    
2892
done:
2893
  *t = 0;
2894
  const byte *dsc = bgp_error_dsc(code, subcode);
2895
  log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, dsc, argbuf);
2896
}
2897

    
2898
static void
2899
bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
2900
{
2901
  struct bgp_proto *p = conn->bgp;
2902

    
2903
  if (len < 21)
2904
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2905

    
2906
  uint code = pkt[19];
2907
  uint subcode = pkt[20];
2908
  int err = (code != 6);
2909

    
2910
  bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
2911
  bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
2912

    
2913
  bgp_conn_enter_close_state(conn);
2914
  bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
2915

    
2916
  if (err)
2917
  {
2918
    bgp_update_startup_delay(p);
2919
    bgp_stop(p, 0, NULL, 0);
2920
  }
2921
  else
2922
  {
2923
    uint subcode_bit = 1 << ((subcode <= 8) ? subcode : 0);
2924
    if (p->cf->disable_after_cease & subcode_bit)
2925
    {
2926
      log(L_INFO "%s: Disabled after Cease notification", p->p.name);
2927
      p->startup_delay = 0;
2928
      p->p.disabled = 1;
2929
    }
2930
  }
2931
}
2932

    
2933
static void
2934
bgp_rx_keepalive(struct bgp_conn *conn)
2935
{
2936
  struct bgp_proto *p = conn->bgp;
2937

    
2938
  BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
2939
  bgp_start_timer(conn->hold_timer, conn->hold_time);
2940

    
2941
  if (conn->state == BS_OPENCONFIRM)
2942
  { bgp_conn_enter_established_state(conn); return; }
2943

    
2944
  if (conn->state != BS_ESTABLISHED)
2945
    bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
2946
}
2947

    
2948

    
2949
/**
2950
 * bgp_rx_packet - handle a received packet
2951
 * @conn: BGP connection
2952
 * @pkt: start of the packet
2953
 * @len: packet size
2954
 *
2955
 * bgp_rx_packet() takes a newly received packet and calls the corresponding
2956
 * packet handler according to the packet type.
2957
 */
2958
static void
2959
bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len)
2960
{
2961
  byte type = pkt[18];
2962

    
2963
  DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
2964

    
2965
  if (conn->bgp->p.mrtdump & MD_MESSAGES)
2966
    bgp_dump_message(conn, pkt, len);
2967

    
2968
  switch (type)
2969
  {
2970
  case PKT_OPEN:                return bgp_rx_open(conn, pkt, len);
2971
  case PKT_UPDATE:                return bgp_rx_update(conn, pkt, len);
2972
  case PKT_NOTIFICATION:        return bgp_rx_notification(conn, pkt, len);
2973
  case PKT_KEEPALIVE:                return bgp_rx_keepalive(conn);
2974
  case PKT_ROUTE_REFRESH:        return bgp_rx_route_refresh(conn, pkt, len);
2975
  default:                        bgp_error(conn, 1, 3, pkt+18, 1);
2976
  }
2977
}
2978

    
2979
/**
2980
 * bgp_rx - handle received data
2981
 * @sk: socket
2982
 * @size: amount of data received
2983
 *
2984
 * bgp_rx() is called by the socket layer whenever new data arrive from
2985
 * the underlying TCP connection. It assembles the data fragments to packets,
2986
 * checks their headers and framing and passes complete packets to
2987
 * bgp_rx_packet().
2988
 */
2989
int
2990
bgp_rx(sock *sk, uint size)
2991
{
2992
  struct bgp_conn *conn = sk->data;
2993
  byte *pkt_start = sk->rbuf;
2994
  byte *end = pkt_start + size;
2995
  uint i, len;
2996

    
2997
  DBG("BGP: RX hook: Got %d bytes\n", size);
2998
  while (end >= pkt_start + BGP_HEADER_LENGTH)
2999
    {
3000
      if ((conn->state == BS_CLOSE) || (conn->sk != sk))
3001
        return 0;
3002
      for(i=0; i<16; i++)
3003
        if (pkt_start[i] != 0xff)
3004
          {
3005
            bgp_error(conn, 1, 1, NULL, 0);
3006
            break;
3007
          }
3008
      len = get_u16(pkt_start+16);
3009
      if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn)))
3010
        {
3011
          bgp_error(conn, 1, 2, pkt_start+16, 2);
3012
          break;
3013
        }
3014
      if (end < pkt_start + len)
3015
        break;
3016
      bgp_rx_packet(conn, pkt_start, len);
3017
      pkt_start += len;
3018
    }
3019
  if (pkt_start != sk->rbuf)
3020
    {
3021
      memmove(sk->rbuf, pkt_start, end - pkt_start);
3022
      sk->rpos = sk->rbuf + (end - pkt_start);
3023
    }
3024
  return 0;
3025
}