Statistics
| Branch: | Revision:

iof-bird-daemon / proto / bgp / packets.c @ 7ff34ca2

History | View | Annotate | Download (72.9 KB)

1
/*
2
 *        BIRD -- BGP Packet Processing
3
 *
4
 *        (c) 2000 Martin Mares <mj@ucw.cz>
5
 *        (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6
 *        (c) 2008--2016 CZ.NIC z.s.p.o.
7
 *
8
 *        Can be freely distributed and used under the terms of the GNU GPL.
9
 */
10

    
11
#undef LOCAL_DEBUG
12

    
13
#include <stdlib.h>
14

    
15
#include "nest/bird.h"
16
#include "nest/iface.h"
17
#include "nest/protocol.h"
18
#include "nest/route.h"
19
#include "nest/attrs.h"
20
#include "proto/mrt/mrt.h"
21
#include "conf/conf.h"
22
#include "lib/unaligned.h"
23
#include "lib/flowspec.h"
24
#include "lib/socket.h"
25

    
26
#include "nest/cli.h"
27

    
28
#include "bgp.h"
29

    
30

    
31
/* Constants named after the three route-refresh message kinds
   (presumably the ROUTE-REFRESH subtype values -- their users are not
   part of this excerpt) */
#define BGP_RR_REQUEST		0
#define BGP_RR_BEGIN		1
#define BGP_RR_END		2

/* Upper bound used for one NLRI entry; the 4 + 1 + 32 split presumably
   stands for path ID, prefix length and prefix bytes -- TODO confirm */
#define BGP_NLRI_MAX		(4 + 1 + 32)

/* MPLS label handling */
#define BGP_MPLS_BOS		1		/* Bottom-of-stack bit */
#define BGP_MPLS_MAX		10		/* Max number of labels that 24*n <= 255 */
#define BGP_MPLS_NULL		3		/* Implicit NULL label */
#define BGP_MPLS_MAGIC		0x800000	/* Magic withdraw label value, RFC 3107 3 */
41

    
42

    
43
static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
44
static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
45

    
46
/* Table for state -> RFC 6608 FSM error subcodes */
47
static byte fsm_err_subcode[BS_MAX] = {
48
  [BS_OPENSENT] = 1,
49
  [BS_OPENCONFIRM] = 2,
50
  [BS_ESTABLISHED] = 3
51
};
52

    
53

    
54
static struct bgp_channel *
55
bgp_get_channel(struct bgp_proto *p, u32 afi)
56
{
57
  uint i;
58

    
59
  for (i = 0; i < p->channel_count; i++)
60
    if (p->afi_map[i] == afi)
61
      return p->channel_map[i];
62

    
63
  return NULL;
64
}
65

    
66
static inline void
67
put_af3(byte *buf, u32 id)
68
{
69
  put_u16(buf, id >> 16);
70
  buf[2] = id & 0xff;
71
}
72

    
73
static inline void
74
put_af4(byte *buf, u32 id)
75
{
76
  put_u16(buf, id >> 16);
77
  buf[2] = 0;
78
  buf[3] = id & 0xff;
79
}
80

    
81
static inline u32
82
get_af3(byte *buf)
83
{
84
  return (get_u16(buf) << 16) | buf[2];
85
}
86

    
87
static inline u32
88
get_af4(byte *buf)
89
{
90
  return (get_u16(buf) << 16) | buf[3];
91
}
92

    
93
static void
94
init_mrt_bgp_data(struct bgp_conn *conn, struct mrt_bgp_data *d)
95
{
96
  struct bgp_proto *p = conn->bgp;
97
  int p_ok = conn->state >= BS_OPENCONFIRM;
98

    
99
  memset(d, 0, sizeof(struct mrt_bgp_data));
100
  d->peer_as = p->remote_as;
101
  d->local_as = p->local_as;
102
  d->index = (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0;
103
  d->af = ipa_is_ip4(p->remote_ip) ? BGP_AFI_IPV4 : BGP_AFI_IPV6;
104
  d->peer_ip = conn->sk ? conn->sk->daddr : IPA_NONE;
105
  d->local_ip = conn->sk ? conn->sk->saddr : IPA_NONE;
106
  d->as4 = p_ok ? p->as4_session : 0;
107
}
108

    
109
static uint bgp_find_update_afi(byte *pos, uint len);

/*
 * Guess whether a packet about to be dumped uses ADD-PATH encoding.
 *
 * Returns 0 for anything but UPDATE (byte 18 of the packet is compared with
 * PKT_UPDATE). For UPDATEs the per-protocol summary is consulted first; only
 * when it says "some channels" is the AFI of the UPDATE looked up and the
 * add_path_rx value of the matching channel returned.
 */
static int
bgp_estimate_add_path(struct bgp_proto *p, byte *pkt, uint len)
{
  struct bgp_channel *c;

  /* No need to estimate it for other messages than UPDATE */
  if (pkt[18] != PKT_UPDATE)
    return 0;

  /* 1 -> no channel, 2 -> all channels, 3 -> some channels */
  if (p->summary_add_path_rx >= 3)
  {
    c = bgp_get_channel(p, bgp_find_update_afi(pkt, len));
    if (c)
      return c->add_path_rx;

    /* Either frame error (if !afi) or unknown AFI/SAFI,
       will be reported later in regular parsing */
    BGP_TRACE(D_PACKETS, "MRT processing noticed invalid packet");
    return 0;
  }

  return p->summary_add_path_rx == 2;
}
134

    
135
static void
136
bgp_dump_message(struct bgp_conn *conn, byte *pkt, uint len)
137
{
138
  struct mrt_bgp_data d;
139
  init_mrt_bgp_data(conn, &d);
140

    
141
  d.message = pkt;
142
  d.msg_len = len;
143
  d.add_path = bgp_estimate_add_path(conn->bgp, pkt, len);
144

    
145
  mrt_dump_bgp_message(&d);
146
}
147

    
148
void
149
bgp_dump_state_change(struct bgp_conn *conn, uint old, uint new)
150
{
151
  struct mrt_bgp_data d;
152
  init_mrt_bgp_data(conn, &d);
153

    
154
  d.old_state = old;
155
  d.new_state = new;
156

    
157
  mrt_dump_bgp_state_change(&d);
158
}
159

    
160
/*
 * Write the body of a NOTIFICATION message into @buf: error code, error
 * subcode and conn->notify_size bytes of conn->notify_data.
 *
 * Returns the position just after the written data.
 */
static byte *
bgp_create_notification(struct bgp_conn *conn, byte *buf)
{
  /* Not referenced directly below; presumably needed by BGP_TRACE() */
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);
  buf[0] = conn->notify_code;
  buf[1] = conn->notify_subcode;

  /* Errors are commonly raised with (NULL, 0) as data, so notify_data may be
     NULL here; memcpy() must not be given a NULL pointer even for size 0 */
  if (conn->notify_size)
    memcpy(buf+2, conn->notify_data, conn->notify_size);

  return buf + 2 + conn->notify_size;
}
171

    
172

    
173
/* Capability negotiation as per RFC 5492 */
174

    
175
const struct bgp_af_caps *
176
bgp_find_af_caps(struct bgp_caps *caps, u32 afi)
177
{
178
  struct bgp_af_caps *ac;
179

    
180
  WALK_AF_CAPS(caps, ac)
181
    if (ac->afi == afi)
182
      return ac;
183

    
184
  return NULL;
185
}
186

    
187
static struct bgp_af_caps *
188
bgp_get_af_caps(struct bgp_caps *caps, u32 afi)
189
{
190
  struct bgp_af_caps *ac;
191

    
192
  WALK_AF_CAPS(caps, ac)
193
    if (ac->afi == afi)
194
      return ac;
195

    
196
  ac = &caps->af_data[caps->af_count++];
197
  memset(ac, 0, sizeof(struct bgp_af_caps));
198
  ac->afi = afi;
199

    
200
  return ac;
201
}
202

    
203
static int
204
bgp_af_caps_cmp(const void *X, const void *Y)
205
{
206
  const struct bgp_af_caps *x = X, *y = Y;
207
  return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0;
208
}
209

    
210

    
211
void
212
bgp_prepare_capabilities(struct bgp_conn *conn)
213
{
214
  struct bgp_proto *p = conn->bgp;
215
  struct bgp_channel *c;
216
  struct bgp_caps *caps;
217
  struct bgp_af_caps *ac;
218

    
219
  if (!p->cf->capabilities)
220
  {
221
    /* Just prepare empty local_caps */
222
    conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));
223
    return;
224
  }
225

    
226
  /* Prepare bgp_caps structure */
227
  int n = list_length(&p->p.channels);
228
  caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps));
229
  conn->local_caps = caps;
230

    
231
  caps->as4_support = p->cf->enable_as4;
232
  caps->ext_messages = p->cf->enable_extended_messages;
233
  caps->route_refresh = p->cf->enable_refresh;
234
  caps->enhanced_refresh = p->cf->enable_refresh;
235

    
236
  if (caps->as4_support)
237
    caps->as4_number = p->public_as;
238

    
239
  if (p->cf->gr_mode)
240
  {
241
    caps->gr_aware = 1;
242
    caps->gr_time = p->cf->gr_time;
243
    caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
244
  }
245

    
246
  if (p->cf->llgr_mode)
247
    caps->llgr_aware = 1;
248

    
249
  /* Allocate and fill per-AF fields */
250
  WALK_LIST(c, p->p.channels)
251
  {
252
    ac = &caps->af_data[caps->af_count++];
253
    ac->afi = c->afi;
254
    ac->ready = 1;
255

    
256
    ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop;
257
    caps->any_ext_next_hop |= ac->ext_next_hop;
258

    
259
    ac->add_path = c->cf->add_path;
260
    caps->any_add_path |= ac->add_path;
261

    
262
    if (c->cf->gr_able)
263
    {
264
      ac->gr_able = 1;
265

    
266
      if (p->p.gr_recovery)
267
        ac->gr_af_flags |= BGP_GRF_FORWARDING;
268
    }
269

    
270
    if (c->cf->llgr_able)
271
    {
272
      ac->llgr_able = 1;
273
      ac->llgr_time = c->cf->llgr_time;
274

    
275
      if (p->p.gr_recovery)
276
        ac->llgr_flags |= BGP_LLGRF_FORWARDING;
277
    }
278
  }
279

    
280
  /* Sort capability fields by AFI/SAFI */
281
  qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp);
282
}
283

    
284
/*
 * Serialize conn->local_caps into @buf as a sequence of capability TLVs
 * (one byte code, one byte data length, data).
 *
 * Capabilities are emitted in a fixed order: 1 (one TLV per ready AF), 2, 5,
 * 6, 64, 65, 69, 70, 71. For capabilities with per-AF data the length byte is
 * written as zero first and patched once the data has been emitted. The
 * total number of written bytes is stored in caps->length.
 *
 * Returns the position just after the written data.
 */
static byte *
bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  struct bgp_caps *caps = conn->local_caps;
  struct bgp_af_caps *ac;
  byte *start = buf;
  byte *body;			/* First data byte of the current TLV */

  /*
   * Note that max length is ~ 22+21*af_count. With max 12 channels that is
   * 274. Option limit is 253 and buffer size is 4096, so we cannot overflow
   * the buffer unless we add new capabilities or more AFs. XXX: nothing here
   * enforces the option limit.
   */

  /* Capability 1: Multiprotocol extensions, one TLV per ready AF */
  WALK_AF_CAPS(caps, ac)
    if (ac->ready)
    {
      buf[0] = 1;
      buf[1] = 4;			/* Capability data length */
      put_af4(buf + 2, ac->afi);
      buf += 6;
    }

  /* Capability 2: Support for route refresh */
  if (caps->route_refresh)
  {
    buf[0] = 2;
    buf[1] = 0;				/* Capability data length */
    buf += 2;
  }

  /* Capability 5: Support for extended next hop */
  if (caps->any_ext_next_hop)
  {
    buf[0] = 5;
    buf[1] = 0;				/* Capability data length, fixed below */
    buf += 2;
    body = buf;

    WALK_AF_CAPS(caps, ac)
      if (ac->ext_next_hop)
      {
        put_af4(buf, ac->afi);
        put_u16(buf+4, BGP_AFI_IPV6);
        buf += 6;
      }

    body[-1] = buf - body;
  }

  /* Capability 6: Support for extended messages */
  if (caps->ext_messages)
  {
    buf[0] = 6;
    buf[1] = 0;				/* Capability data length */
    buf += 2;
  }

  /* Capability 64: Support for graceful restart */
  if (caps->gr_aware)
  {
    buf[0] = 64;
    buf[1] = 0;				/* Capability data length, fixed below */
    buf += 2;
    body = buf;

    /* Restart time, with the flags OR-ed into its first byte */
    put_u16(buf, caps->gr_time);
    buf[0] |= caps->gr_flags;
    buf += 2;

    WALK_AF_CAPS(caps, ac)
      if (ac->gr_able)
      {
        put_af3(buf, ac->afi);
        buf[3] = ac->gr_af_flags;
        buf += 4;
      }

    body[-1] = buf - body;
  }

  /* Capability 65: Support for 4-octet AS number */
  if (caps->as4_support)
  {
    buf[0] = 65;
    buf[1] = 4;				/* Capability data length */
    put_u32(buf + 2, p->public_as);
    buf += 6;
  }

  /* Capability 69: Support for ADD-PATH */
  if (caps->any_add_path)
  {
    buf[0] = 69;
    buf[1] = 0;				/* Capability data length, fixed below */
    buf += 2;
    body = buf;

    WALK_AF_CAPS(caps, ac)
      if (ac->add_path)
      {
        put_af3(buf, ac->afi);
        buf[3] = ac->add_path;
        buf += 4;
      }

    body[-1] = buf - body;
  }

  /* Capability 70: Support for enhanced route refresh */
  if (caps->enhanced_refresh)
  {
    buf[0] = 70;
    buf[1] = 0;				/* Capability data length */
    buf += 2;
  }

  /* Capability 71: Support for long-lived graceful restart */
  if (caps->llgr_aware)
  {
    buf[0] = 71;
    buf[1] = 0;				/* Capability data length, fixed below */
    buf += 2;
    body = buf;

    WALK_AF_CAPS(caps, ac)
      if (ac->llgr_able)
      {
        put_af3(buf, ac->afi);
        buf[3] = ac->llgr_flags;
        put_u24(buf+4, ac->llgr_time);
        buf += 7;
      }

    body[-1] = buf - body;
  }

  caps->length = buf - start;

  return buf;
}
413

    
414
static void
415
bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len)
416
{
417
  struct bgp_proto *p = conn->bgp;
418
  struct bgp_af_caps *ac;
419
  int i, cl;
420
  u32 af;
421

    
422
  caps->length += len;
423

    
424
  while (len > 0)
425
  {
426
    if (len < 2 || len < (2 + pos[1]))
427
      goto err;
428

    
429
    /* Capability length */
430
    cl = pos[1];
431

    
432
    /* Capability type */
433
    switch (pos[0])
434
    {
435
    case  1: /* Multiprotocol capability, RFC 4760 */
436
      if (cl != 4)
437
        goto err;
438

    
439
      af = get_af4(pos+2);
440
      ac = bgp_get_af_caps(caps, af);
441
      ac->ready = 1;
442
      break;
443

    
444
    case  2: /* Route refresh capability, RFC 2918 */
445
      if (cl != 0)
446
        goto err;
447

    
448
      caps->route_refresh = 1;
449
      break;
450

    
451
    case  5: /* Extended next hop encoding capability, RFC 5549 */
452
      if (cl % 6)
453
        goto err;
454

    
455
      for (i = 0; i < cl; i += 6)
456
      {
457
        /* Specified only for IPv4 prefixes with IPv6 next hops */
458
        if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) ||
459
            (get_u16(pos+2+i+4) != BGP_AFI_IPV6))
460
          continue;
461

    
462
        af = get_af4(pos+2+i);
463
        ac = bgp_get_af_caps(caps, af);
464
        ac->ext_next_hop = 1;
465
      }
466
      break;
467

    
468
    case  6: /* Extended message length capability, RFC draft */
469
      if (cl != 0)
470
        goto err;
471

    
472
      caps->ext_messages = 1;
473
      break;
474

    
475
    case 64: /* Graceful restart capability, RFC 4724 */
476
      if (cl % 4 != 2)
477
        goto err;
478

    
479
      /* Only the last instance is valid */
480
      WALK_AF_CAPS(caps, ac)
481
      {
482
        ac->gr_able = 0;
483
        ac->gr_af_flags = 0;
484
      }
485

    
486
      caps->gr_aware = 1;
487
      caps->gr_flags = pos[2] & 0xf0;
488
      caps->gr_time = get_u16(pos + 2) & 0x0fff;
489

    
490
      for (i = 2; i < cl; i += 4)
491
      {
492
        af = get_af3(pos+2+i);
493
        ac = bgp_get_af_caps(caps, af);
494
        ac->gr_able = 1;
495
        ac->gr_af_flags = pos[2+i+3];
496
      }
497
      break;
498

    
499
    case 65: /* AS4 capability, RFC 6793 */
500
      if (cl != 4)
501
        goto err;
502

    
503
      caps->as4_support = 1;
504
      caps->as4_number = get_u32(pos + 2);
505
      break;
506

    
507
    case 69: /* ADD-PATH capability, RFC 7911 */
508
      if (cl % 4)
509
        goto err;
510

    
511
      for (i = 0; i < cl; i += 4)
512
      {
513
        byte val = pos[2+i+3];
514
        if (!val || (val > BGP_ADD_PATH_FULL))
515
        {
516
          log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring",
517
              p->p.name, val);
518
          break;
519
        }
520
      }
521

    
522
      for (i = 0; i < cl; i += 4)
523
      {
524
        af = get_af3(pos+2+i);
525
        ac = bgp_get_af_caps(caps, af);
526
        ac->add_path = pos[2+i+3];
527
      }
528
      break;
529

    
530
    case 70: /* Enhanced route refresh capability, RFC 7313 */
531
      if (cl != 0)
532
        goto err;
533

    
534
      caps->enhanced_refresh = 1;
535
      break;
536

    
537
    case 71: /* Long lived graceful restart capability, RFC draft */
538
      if (cl % 7)
539
        goto err;
540

    
541
      /* Presumably, only the last instance is valid */
542
      WALK_AF_CAPS(caps, ac)
543
      {
544
        ac->llgr_able = 0;
545
        ac->llgr_flags = 0;
546
        ac->llgr_time = 0;
547
      }
548

    
549
      caps->llgr_aware = 1;
550

    
551
      for (i = 0; i < cl; i += 7)
552
      {
553
        af = get_af3(pos+2+i);
554
        ac = bgp_get_af_caps(caps, af);
555
        ac->llgr_able = 1;
556
        ac->llgr_flags = pos[2+i+3];
557
        ac->llgr_time = get_u24(pos + 2+i+4);
558
      }
559
      break;
560

    
561
      /* We can safely ignore all other capabilities */
562
    }
563

    
564
    ADVANCE(pos, len, 2 + cl);
565
  }
566

    
567
  /* The LLGR capability must be advertised together with the GR capability,
568
     otherwise it must be disregarded */
569
  if (!caps->gr_aware && caps->llgr_aware)
570
  {
571
    caps->llgr_aware = 0;
572
    WALK_AF_CAPS(caps, ac)
573
    {
574
      ac->llgr_able = 0;
575
      ac->llgr_flags = 0;
576
      ac->llgr_time = 0;
577
    }
578
  }
579

    
580
  return;
581

    
582
err:
583
  bgp_error(conn, 2, 0, NULL, 0);
584
  return;
585
}
586

    
587
static int
588
bgp_check_capabilities(struct bgp_conn *conn)
589
{
590
  struct bgp_proto *p = conn->bgp;
591
  struct bgp_caps *local = conn->local_caps;
592
  struct bgp_caps *remote = conn->remote_caps;
593
  struct bgp_channel *c;
594
  int count = 0;
595

    
596
  /* This is partially overlapping with bgp_conn_enter_established_state(),
597
     but we need to run this just after we receive OPEN message */
598

    
599
  WALK_LIST(c, p->p.channels)
600
  {
601
    const struct bgp_af_caps *loc = bgp_find_af_caps(local,  c->afi);
602
    const struct bgp_af_caps *rem = bgp_find_af_caps(remote, c->afi);
603

    
604
    /* Find out whether this channel will be active */
605
    int active = loc && loc->ready &&
606
      ((rem && rem->ready) || (!remote->length && (c->afi == BGP_AF_IPV4)));
607

    
608
    /* Mandatory must be active */
609
    if (c->cf->mandatory && !active)
610
      return 0;
611

    
612
    if (active)
613
      count++;
614
  }
615

    
616
  /* We need at least one channel active */
617
  if (!count)
618
    return 0;
619

    
620
  return 1;
621
}
622

    
623
static int
624
bgp_read_options(struct bgp_conn *conn, byte *pos, int len)
625
{
626
  struct bgp_proto *p = conn->bgp;
627
  struct bgp_caps *caps;
628
  int ol;
629

    
630
  /* Max number of announced AFIs is limited by max option length (255) */
631
  caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps));
632
  memset(caps, 0, sizeof(struct bgp_caps));
633

    
634
  while (len > 0)
635
  {
636
    if ((len < 2) || (len < (2 + pos[1])))
637
    { bgp_error(conn, 2, 0, NULL, 0); return -1; }
638

    
639
    ol = pos[1];
640
    if (pos[0] == 2)
641
    {
642
      /* BGP capabilities, RFC 5492 */
643
      if (p->cf->capabilities)
644
        bgp_read_capabilities(conn, caps, pos + 2, ol);
645
    }
646
    else
647
    {
648
      /* Unknown option */
649
      bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */
650
      return -1;
651
    }
652

    
653
    ADVANCE(pos, len, 2 + ol);
654
  }
655

    
656
  uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps);
657
  conn->remote_caps = mb_allocz(p->p.pool, n);
658
  memcpy(conn->remote_caps, caps, n);
659

    
660
  return 0;
661
}
662

    
663
/*
 * Write the body of an OPEN message into @buf: version, 2-byte AS number
 * (AS_TRANS when the public AS is not below 0xFFFF), hold time, router ID
 * and the optional parameters.
 *
 * With capabilities enabled, a single option 2 carrying the output of
 * bgp_write_capabilities() is appended; otherwise the optional parameters
 * are empty. Returns the position just after the written data.
 */
static byte *
bgp_create_open(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  byte *opt_data, *end;
  uint len;

  BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
            BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id);

  buf[0] = BGP_VERSION;
  put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS);
  put_u16(buf+3, p->cf->hold_time);
  put_u32(buf+5, p->local_id);

  if (!p->cf->capabilities)
  {
    buf[9] = 0;			/* No optional parameters */
    return buf + 10;
  }

  /* Write capabilities to buffer behind the 3 bytes of option headers */
  opt_data = buf + 12;
  end = bgp_write_capabilities(conn, opt_data);
  len = end - opt_data;

  buf[9] = len + 2;		/* Optional parameters length */
  buf[10] = 2;			/* Option 2: Capability list */
  buf[11] = len;		/* Option data length */

  return end;
}
696

    
697
static void
698
bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
699
{
700
  struct bgp_proto *p = conn->bgp;
701
  struct bgp_conn *other;
702
  u32 asn, hold, id;
703

    
704
  /* Check state */
705
  if (conn->state != BS_OPENSENT)
706
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
707

    
708
  /* Check message contents */
709
  if (len < 29 || len != 29 + (uint) pkt[28])
710
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
711

    
712
  if (pkt[19] != BGP_VERSION)
713
  { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; }
714

    
715
  asn = get_u16(pkt+20);
716
  hold = get_u16(pkt+22);
717
  id = get_u32(pkt+24);
718
  BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id);
719

    
720
  if (bgp_read_options(conn, pkt+29, pkt[28]) < 0)
721
    return;
722

    
723
  if (hold > 0 && hold < 3)
724
  { bgp_error(conn, 2, 6, pkt+22, 2); return; }
725

    
726
  /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
727
  if (!id || (p->is_internal && id == p->local_id))
728
  { bgp_error(conn, 2, 3, pkt+24, -4); return; }
729

    
730
  /* RFC 5492 4 - check for required capabilities */
731
  if (p->cf->capabilities && !bgp_check_capabilities(conn))
732
  { bgp_error(conn, 2, 7, NULL, 0); return; }
733

    
734
  struct bgp_caps *caps = conn->remote_caps;
735

    
736
  if (caps->as4_support)
737
  {
738
    u32 as4 = caps->as4_number;
739

    
740
    if ((as4 != asn) && (asn != AS_TRANS))
741
      log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
742

    
743
    /* When remote ASN is unspecified, it must be external one */
744
    if (p->remote_as ? (as4 != p->remote_as) : (as4 == p->local_as))
745
    { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; }
746

    
747
    conn->received_as = as4;
748
  }
749
  else
750
  {
751
    if (p->remote_as ? (asn != p->remote_as) : (asn == p->local_as))
752
    { bgp_error(conn, 2, 2, pkt+20, 2); return; }
753

    
754
    conn->received_as = asn;
755
  }
756

    
757
  /* Check the other connection */
758
  other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
759
  switch (other->state)
760
  {
761
  case BS_CONNECT:
762
  case BS_ACTIVE:
763
    /* Stop outgoing connection attempts */
764
    bgp_conn_enter_idle_state(other);
765
    break;
766

    
767
  case BS_IDLE:
768
  case BS_OPENSENT:
769
  case BS_CLOSE:
770
    break;
771

    
772
  case BS_OPENCONFIRM:
773
    /*
774
     * Description of collision detection rules in RFC 4271 is confusing and
775
     * contradictory, but it is essentially:
776
     *
777
     * 1. Router with higher ID is dominant
778
     * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
779
     * 3. When both connections are in OpenConfirm state, one initiated by
780
     *    the dominant router is kept.
781
     *
782
     * The first line in the expression below evaluates whether the neighbor
783
     * is dominant, the second line whether the new connection was initiated
784
     * by the neighbor. If both are true (or both are false), we keep the new
785
     * connection, otherwise we keep the old one.
786
     */
787
    if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as)))
788
        == (conn == &p->incoming_conn))
789
    {
790
      /* Should close the other connection */
791
      BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
792
      bgp_error(other, 6, 7, NULL, 0);
793
      break;
794
    }
795
    /* Fall thru */
796
  case BS_ESTABLISHED:
797
    /* Should close this connection */
798
    BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
799
    bgp_error(conn, 6, 7, NULL, 0);
800
    return;
801

    
802
  default:
803
    bug("bgp_rx_open: Unknown state");
804
  }
805

    
806
  /* Update our local variables */
807
  conn->hold_time = MIN(hold, p->cf->hold_time);
808
  conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
809
  conn->as4_session = conn->local_caps->as4_support && caps->as4_support;
810
  conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages;
811
  p->remote_id = id;
812

    
813
  DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
814
      conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session);
815

    
816
  bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
817
  bgp_start_timer(conn->hold_timer, conn->hold_time);
818
  bgp_conn_enter_openconfirm_state(conn);
819
}
820

    
821

    
822
/*
823
 *        Next hop handling
824
 */
825

    
826
/*
 * Reporting helpers for next hop processing. All of them expect a variable
 * named s in scope, with s->proto set; WITHDRAW() also needs s->err_withdraw.
 * DISCARD() and WITHDRAW() return from the calling function, so they can be
 * used in functions returning void only.
 */

/* Log a message prefixed by the protocol name */
#define REPORT(msg, args...) \
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })

/* Log a message and leave the calling function */
#define DISCARD(msg, args...) \
  ({ REPORT(msg, ## args); return; })

/* Log a message, set the err_withdraw flag and leave the calling function */
#define WITHDRAW(msg, args...) \
  ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })

/* Messages for the helpers above */
#define BAD_AFI			"Unexpected AF <%u/%u> in UPDATE"
#define BAD_NEXT_HOP		"Invalid NEXT_HOP attribute"
#define NO_NEXT_HOP		"Missing NEXT_HOP attribute"
#define NO_LABEL_STACK		"Missing MPLS stack"
839

    
840

    
841
static void
842
bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
843
{
844
  struct bgp_proto *p = s->proto;
845
  struct bgp_channel *c = s->channel;
846

    
847
  if (c->cf->gw_mode == GW_DIRECT)
848
  {
849
    neighbor *nbr = NULL;
850

    
851
    /* GW_DIRECT -> single_hop -> p->neigh != NULL */
852
    if (ipa_nonzero(gw))
853
      nbr = neigh_find(&p->p, gw, NULL, 0);
854
    else if (ipa_nonzero(ll))
855
      nbr = neigh_find(&p->p, ll, p->neigh->iface, 0);
856

    
857
    if (!nbr || (nbr->scope == SCOPE_HOST))
858
      WITHDRAW(BAD_NEXT_HOP);
859

    
860
    a->dest = RTD_UNICAST;
861
    a->nh.gw = nbr->addr;
862
    a->nh.iface = nbr->iface;
863
  }
864
  else /* GW_RECURSIVE */
865
  {
866
    if (ipa_zero(gw))
867
      WITHDRAW(BAD_NEXT_HOP);
868

    
869
    rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6;
870
    s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table);
871

    
872
    if (!s->mpls)
873
      rta_apply_hostentry(a, s->hostentry, NULL);
874

    
875
    /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */
876
  }
877
}
878

    
879
static void
880
bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
881
{
882
  if (lnum > MPLS_MAX_LABEL_STACK)
883
  {
884
    REPORT("Too many MPLS labels ($u)", lnum);
885

    
886
    a->dest = RTD_UNREACHABLE;
887
    a->hostentry = NULL;
888
    a->nh = (struct nexthop) { };
889
    return;
890
  }
891

    
892
  /* Handle implicit NULL as empty MPLS stack */
893
  if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL))
894
    lnum = 0;
895

    
896
  if (s->channel->cf->gw_mode == GW_DIRECT)
897
  {
898
    a->nh.labels = lnum;
899
    memcpy(a->nh.label, labels, 4*lnum);
900
  }
901
  else /* GW_RECURSIVE */
902
  {
903
    mpls_label_stack ms;
904

    
905
    ms.len = lnum;
906
    memcpy(ms.stack, labels, 4*lnum);
907
    rta_apply_hostentry(a, s->hostentry, &ms);
908
  }
909
}
910

    
911

    
912
static int
913
bgp_match_src(struct bgp_export_state *s, int mode)
914
{
915
  switch (mode)
916
  {
917
  case NH_NO:                return 0;
918
  case NH_ALL:                return 1;
919
  case NH_IBGP:                return s->src && s->src->is_internal;
920
  case NH_EBGP:                return s->src && !s->src->is_internal;
921
  default:                return 0;
922
  }
923
}
924

    
925
static inline int
926
bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
927
{
928
  struct bgp_proto *p = s->proto;
929
  struct bgp_channel *c = s->channel;
930
  ip_addr *nh = (void *) a->u.ptr->data;
931

    
932
  /* Handle next hop self option */
933
  if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self))
934
    return 0;
935

    
936
  /* Handle next hop keep option */
937
  if (c->cf->next_hop_keep && bgp_match_src(s, c->cf->next_hop_keep))
938
    return 1;
939

    
940
  /* Keep it when explicitly set in export filter */
941
  if (a->type & EAF_FRESH)
942
    return 1;
943

    
944
  /* Check for non-matching AF */
945
  if ((ipa_is_ip4(*nh) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop)
946
    return 0;
947

    
948
  /* Keep it when exported to internal peers */
949
  if (p->is_interior && ipa_nonzero(*nh))
950
    return 1;
951

    
952
  /* Keep it when forwarded between single-hop BGPs on the same iface */
953
  struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL;
954
  return p->neigh && (p->neigh->iface == ifa);
955
}
956

    
957
static inline int
958
bgp_use_gateway(struct bgp_export_state *s)
959
{
960
  struct bgp_proto *p = s->proto;
961
  struct bgp_channel *c = s->channel;
962
  rta *ra = s->route->attrs;
963

    
964
  /* Handle next hop self option - also applies to gateway */
965
  if (c->cf->next_hop_self && bgp_match_src(s, c->cf->next_hop_self))
966
    return 0;
967

    
968
  /* We need one valid global gateway */
969
  if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw))
970
    return 0;
971

    
972
  /* Check for non-matching AF */
973
  if ((ipa_is_ip4(ra->nh.gw) != bgp_channel_is_ipv4(c)) && !c->ext_next_hop)
974
    return 0;
975

    
976
  /* Use it when exported to internal peers */
977
  if (p->is_interior)
978
    return 1;
979

    
980
  /* Use it when forwarded to single-hop BGP peer on on the same iface */
981
  return p->neigh && (p->neigh->iface == ra->nh.iface);
982
}
983

    
984
static void
985
bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
986
{
987
  if (!a || !bgp_use_next_hop(s, a))
988
  {
989
    if (bgp_use_gateway(s))
990
    {
991
      rta *ra = s->route->attrs;
992
      ip_addr nh[1] = { ra->nh.gw };
993
      bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
994

    
995
      if (s->mpls)
996
      {
997
        u32 implicit_null = BGP_MPLS_NULL;
998
        u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null;
999
        uint lnum = ra->nh.labels ? ra->nh.labels : 1;
1000
        bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4);
1001
      }
1002
    }
1003
    else
1004
    {
1005
      ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
1006
      bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
1007

    
1008
      /* TODO: Use local MPLS assigned label */
1009
      if (s->mpls)
1010
      {
1011
        u32 implicit_null = BGP_MPLS_NULL;
1012
        bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, &implicit_null, 4);
1013
      }
1014
    }
1015
  }
1016

    
1017
  /* Check if next hop is valid */
1018
  a = bgp_find_attr(*to, BA_NEXT_HOP);
1019
  if (!a)
1020
    WITHDRAW(NO_NEXT_HOP);
1021

    
1022
  ip_addr *nh = (void *) a->u.ptr->data;
1023
  ip_addr peer = s->proto->remote_ip;
1024
  uint len = a->u.ptr->length;
1025

    
1026
  /* Forbid zero next hop */
1027
  if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1])))
1028
    WITHDRAW(BAD_NEXT_HOP);
1029

    
1030
  /* Forbid next hop equal to neighbor IP */
1031
  if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
1032
    WITHDRAW(BAD_NEXT_HOP);
1033

    
1034
  /* Forbid next hop with non-matching AF */
1035
  if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) &&
1036
      !s->channel->ext_next_hop)
1037
    WITHDRAW(BAD_NEXT_HOP);
1038

    
1039
  /* Just check if MPLS stack */
1040
  if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK))
1041
    WITHDRAW(NO_LABEL_STACK);
1042
}
1043

    
1044
static uint
1045
bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
1046
{
1047
  /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
1048
  ip_addr *nh = (void *) a->u.ptr->data;
1049
  uint len = a->u.ptr->length;
1050

    
1051
  ASSERT((len == 16) || (len == 32));
1052

    
1053
  /*
1054
   * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1055
   * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference
1056
   * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped
1057
   * IPv6 address with IPv6 NLRI.
1058
   */
1059

    
1060
  if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1061
  {
1062
    put_ip4(buf, ipa_to_ip4(nh[0]));
1063
    return 4;
1064
  }
1065

    
1066
  put_ip6(buf, ipa_to_ip6(nh[0]));
1067

    
1068
  if (len == 32)
1069
    put_ip6(buf+16, ipa_to_ip6(nh[1]));
1070

    
1071
  return len;
1072
}
1073

    
1074
static void
1075
bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1076
{
1077
  struct bgp_channel *c = s->channel;
1078
  struct adata *ad = lp_alloc_adata(s->pool, 32);
1079
  ip_addr *nh = (void *) ad->data;
1080

    
1081
  if (len == 4)
1082
  {
1083
    nh[0] = ipa_from_ip4(get_ip4(data));
1084
    nh[1] = IPA_NONE;
1085
  }
1086
  else if (len == 16)
1087
  {
1088
    nh[0] = ipa_from_ip6(get_ip6(data));
1089
    nh[1] = IPA_NONE;
1090

    
1091
    if (ipa_is_link_local(nh[0]))
1092
    { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1093
  }
1094
  else if (len == 32)
1095
  {
1096
    nh[0] = ipa_from_ip6(get_ip6(data));
1097
    nh[1] = ipa_from_ip6(get_ip6(data+16));
1098

    
1099
    if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1100
      nh[1] = IPA_NONE;
1101
  }
1102
  else
1103
    bgp_parse_error(s, 9);
1104

    
1105
  if (ipa_zero(nh[1]))
1106
    ad->length = 16;
1107

    
1108
  if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1109
    WITHDRAW(BAD_NEXT_HOP);
1110

    
1111
  // XXXX validate next hop
1112

    
1113
  bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1114
  bgp_apply_next_hop(s, a, nh[0], nh[1]);
1115
}
1116

    
1117
static uint
1118
bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
1119
{
1120
  ip_addr *nh = (void *) a->u.ptr->data;
1121
  uint len = a->u.ptr->length;
1122

    
1123
  ASSERT((len == 16) || (len == 32));
1124

    
1125
  /*
1126
   * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
1127
   * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference
1128
   * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped
1129
   * IPv6 address with VPNv6 NLRI.
1130
   */
1131

    
1132
  if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
1133
  {
1134
    put_u64(buf, 0); /* VPN RD is 0 */
1135
    put_ip4(buf+8, ipa_to_ip4(nh[0]));
1136
    return 12;
1137
  }
1138

    
1139
  put_u64(buf, 0); /* VPN RD is 0 */
1140
  put_ip6(buf+8, ipa_to_ip6(nh[0]));
1141

    
1142
  if (len == 16)
1143
    return 24;
1144

    
1145
  put_u64(buf+24, 0); /* VPN RD is 0 */
1146
  put_ip6(buf+32, ipa_to_ip6(nh[1]));
1147

    
1148
  return 48;
1149
}
1150

    
1151
static void
1152
bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1153
{
1154
  struct bgp_channel *c = s->channel;
1155
  struct adata *ad = lp_alloc_adata(s->pool, 32);
1156
  ip_addr *nh = (void *) ad->data;
1157

    
1158
  if (len == 12)
1159
  {
1160
    nh[0] = ipa_from_ip4(get_ip4(data+8));
1161
    nh[1] = IPA_NONE;
1162
  }
1163
  else if (len == 24)
1164
  {
1165
    nh[0] = ipa_from_ip6(get_ip6(data+8));
1166
    nh[1] = IPA_NONE;
1167

    
1168
    if (ipa_is_link_local(nh[0]))
1169
    { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1170
  }
1171
  else if (len == 48)
1172
  {
1173
    nh[0] = ipa_from_ip6(get_ip6(data+8));
1174
    nh[1] = ipa_from_ip6(get_ip6(data+32));
1175

    
1176
    if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1177
      nh[1] = IPA_NONE;
1178
  }
1179
  else
1180
    bgp_parse_error(s, 9);
1181

    
1182
  if (ipa_zero(nh[1]))
1183
    ad->length = 16;
1184

    
1185
  /* XXXX which error */
1186
  if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0)))
1187
    bgp_parse_error(s, 9);
1188

    
1189
  if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1190
    WITHDRAW(BAD_NEXT_HOP);
1191

    
1192
  // XXXX validate next hop
1193

    
1194
  bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1195
  bgp_apply_next_hop(s, a, nh[0], nh[1]);
1196
}
1197

    
1198

    
1199

    
1200
static uint
1201
bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED)
1202
{
1203
  return 0;
1204
}
1205

    
1206
static void
1207
bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED)
1208
{
1209
  /*
1210
   * Although we expect no next hop and RFC 7606 7.11 states that attribute
1211
   * MP_REACH_NLRI with unexpected next hop length is considered malformed,
1212
   * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt.
1213
   */
1214

    
1215
  return;
1216
}
1217

    
1218
static void
1219
bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to)
1220
{
1221
  /* NEXT_HOP shall not pass */
1222
  if (a)
1223
    bgp_unset_attr(to, s->pool, BA_NEXT_HOP);
1224
}
1225

    
1226

    
1227
/*
1228
 *        UPDATE
1229
 */
1230

    
1231
static void
1232
bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
1233
{
1234
  if (path_id != s->last_id)
1235
  {
1236
    s->last_src = rt_get_source(&s->proto->p, path_id);
1237
    s->last_id = path_id;
1238

    
1239
    rta_free(s->cached_rta);
1240
    s->cached_rta = NULL;
1241
  }
1242

    
1243
  if (!a0)
1244
  {
1245
    /* Route withdraw */
1246
    rte_update3(&s->channel->c, n, NULL, s->last_src);
1247
    return;
1248
  }
1249

    
1250
  /* Prepare cached route attributes */
1251
  if (s->cached_rta == NULL)
1252
  {
1253
    a0->src = s->last_src;
1254

    
1255
    /* Workaround for rta_lookup() breaking eattrs */
1256
    ea_list *ea = a0->eattrs;
1257
    s->cached_rta = rta_lookup(a0);
1258
    a0->eattrs = ea;
1259
  }
1260

    
1261
  rta *a = rta_clone(s->cached_rta);
1262
  rte *e = rte_get_temp(a);
1263

    
1264
  e->pflags = 0;
1265
  e->u.bgp.suppressed = 0;
1266
  e->u.bgp.stale = -1;
1267
  rte_update3(&s->channel->c, n, e, s->last_src);
1268
}
1269

    
1270
static void
1271
bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen)
1272
{
1273
  u32 dummy = 0;
1274
  u32 *labels = mpls ? (u32 *) mpls->data : &dummy;
1275
  uint lnum = mpls ? (mpls->length / 4) : 1;
1276

    
1277
  for (uint i = 0; i < lnum; i++)
1278
  {
1279
    put_u24(*pos, labels[i] << 4);
1280
    ADVANCE(*pos, *size, 3);
1281
  }
1282

    
1283
  /* Add bottom-of-stack flag */
1284
  (*pos)[-1] |= BGP_MPLS_BOS;
1285

    
1286
  *pxlen += 24 * lnum;
1287
}
1288

    
1289
static void
1290
bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a)
1291
{
1292
  u32 labels[BGP_MPLS_MAX], label;
1293
  uint lnum = 0;
1294

    
1295
  do {
1296
    if (*pxlen < 24)
1297
      bgp_parse_error(s, 1);
1298

    
1299
    label = get_u24(*pos);
1300
    labels[lnum++] = label >> 4;
1301
    ADVANCE(*pos, *len, 3);
1302
    *pxlen -= 24;
1303

    
1304
    /* RFC 8277 2.4 - withdraw does not have variable-size MPLS stack but
1305
       fixed-size 24-bit Compatibility field, which MUST be ignored */
1306
    if (!a && !s->err_withdraw)
1307
      return;
1308
  }
1309
  while (!(label & BGP_MPLS_BOS));
1310

    
1311
  if (!a)
1312
    return;
1313

    
1314
  /* Attach MPLS attribute unless we already have one */
1315
  if (!s->mpls_labels)
1316
  {
1317
    s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX);
1318
    bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels);
1319
  }
1320

    
1321
  /* Overwrite data in the attribute */
1322
  s->mpls_labels->length = 4*lnum;
1323
  memcpy(s->mpls_labels->data, labels, 4*lnum);
1324

    
1325
  /* Update next hop entry in rta */
1326
  bgp_apply_mpls_labels(s, a, labels, lnum);
1327

    
1328
  /* Attributes were changed, invalidate cached entry */
1329
  rta_free(s->cached_rta);
1330
  s->cached_rta = NULL;
1331

    
1332
  return;
1333
}
1334

    
1335
static uint
1336
bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1337
{
1338
  byte *pos = buf;
1339

    
1340
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1341
  {
1342
    struct bgp_prefix *px = HEAD(buck->prefixes);
1343
    struct net_addr_ip4 *net = (void *) px->net;
1344

    
1345
    /* Encode path ID */
1346
    if (s->add_path)
1347
    {
1348
      put_u32(pos, px->path_id);
1349
      ADVANCE(pos, size, 4);
1350
    }
1351

    
1352
    /* Encode prefix length */
1353
    *pos = net->pxlen;
1354
    ADVANCE(pos, size, 1);
1355

    
1356
    /* Encode MPLS labels */
1357
    if (s->mpls)
1358
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1359

    
1360
    /* Encode prefix body */
1361
    ip4_addr a = ip4_hton(net->prefix);
1362
    uint b = (net->pxlen + 7) / 8;
1363
    memcpy(pos, &a, b);
1364
    ADVANCE(pos, size, b);
1365

    
1366
    bgp_free_prefix(s->channel, px);
1367
  }
1368

    
1369
  return pos - buf;
1370
}
1371

    
1372
static void
1373
bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1374
{
1375
  while (len)
1376
  {
1377
    net_addr_ip4 net;
1378
    u32 path_id = 0;
1379

    
1380
    /* Decode path ID */
1381
    if (s->add_path)
1382
    {
1383
      if (len < 5)
1384
        bgp_parse_error(s, 1);
1385

    
1386
      path_id = get_u32(pos);
1387
      ADVANCE(pos, len, 4);
1388
    }
1389

    
1390
    /* Decode prefix length */
1391
    uint l = *pos;
1392
    ADVANCE(pos, len, 1);
1393

    
1394
    if (len < ((l + 7) / 8))
1395
      bgp_parse_error(s, 1);
1396

    
1397
    /* Decode MPLS labels */
1398
    if (s->mpls)
1399
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1400

    
1401
    if (l > IP4_MAX_PREFIX_LENGTH)
1402
      bgp_parse_error(s, 10);
1403

    
1404
    /* Decode prefix body */
1405
    ip4_addr addr = IP4_NONE;
1406
    uint b = (l + 7) / 8;
1407
    memcpy(&addr, pos, b);
1408
    ADVANCE(pos, len, b);
1409

    
1410
    net = NET_ADDR_IP4(ip4_ntoh(addr), l);
1411
    net_normalize_ip4(&net);
1412

    
1413
    // XXXX validate prefix
1414

    
1415
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1416
  }
1417
}
1418

    
1419

    
1420
static uint
1421
bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1422
{
1423
  byte *pos = buf;
1424

    
1425
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1426
  {
1427
    struct bgp_prefix *px = HEAD(buck->prefixes);
1428
    struct net_addr_ip6 *net = (void *) px->net;
1429

    
1430
    /* Encode path ID */
1431
    if (s->add_path)
1432
    {
1433
      put_u32(pos, px->path_id);
1434
      ADVANCE(pos, size, 4);
1435
    }
1436

    
1437
    /* Encode prefix length */
1438
    *pos = net->pxlen;
1439
    ADVANCE(pos, size, 1);
1440

    
1441
    /* Encode MPLS labels */
1442
    if (s->mpls)
1443
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1444

    
1445
    /* Encode prefix body */
1446
    ip6_addr a = ip6_hton(net->prefix);
1447
    uint b = (net->pxlen + 7) / 8;
1448
    memcpy(pos, &a, b);
1449
    ADVANCE(pos, size, b);
1450

    
1451
    bgp_free_prefix(s->channel, px);
1452
  }
1453

    
1454
  return pos - buf;
1455
}
1456

    
1457
static void
1458
bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1459
{
1460
  while (len)
1461
  {
1462
    net_addr_ip6 net;
1463
    u32 path_id = 0;
1464

    
1465
    /* Decode path ID */
1466
    if (s->add_path)
1467
    {
1468
      if (len < 5)
1469
        bgp_parse_error(s, 1);
1470

    
1471
      path_id = get_u32(pos);
1472
      ADVANCE(pos, len, 4);
1473
    }
1474

    
1475
    /* Decode prefix length */
1476
    uint l = *pos;
1477
    ADVANCE(pos, len, 1);
1478

    
1479
    if (len < ((l + 7) / 8))
1480
      bgp_parse_error(s, 1);
1481

    
1482
    /* Decode MPLS labels */
1483
    if (s->mpls)
1484
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1485

    
1486
    if (l > IP6_MAX_PREFIX_LENGTH)
1487
      bgp_parse_error(s, 10);
1488

    
1489
    /* Decode prefix body */
1490
    ip6_addr addr = IP6_NONE;
1491
    uint b = (l + 7) / 8;
1492
    memcpy(&addr, pos, b);
1493
    ADVANCE(pos, len, b);
1494

    
1495
    net = NET_ADDR_IP6(ip6_ntoh(addr), l);
1496
    net_normalize_ip6(&net);
1497

    
1498
    // XXXX validate prefix
1499

    
1500
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1501
  }
1502
}
1503

    
1504
static uint
1505
bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1506
{
1507
  byte *pos = buf;
1508

    
1509
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1510
  {
1511
    struct bgp_prefix *px = HEAD(buck->prefixes);
1512
    struct net_addr_vpn4 *net = (void *) px->net;
1513

    
1514
    /* Encode path ID */
1515
    if (s->add_path)
1516
    {
1517
      put_u32(pos, px->path_id);
1518
      ADVANCE(pos, size, 4);
1519
    }
1520

    
1521
    /* Encode prefix length */
1522
    *pos = 64 + net->pxlen;
1523
    ADVANCE(pos, size, 1);
1524

    
1525
    /* Encode MPLS labels */
1526
    if (s->mpls)
1527
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1528

    
1529
    /* Encode route distinguisher */
1530
    put_u64(pos, net->rd);
1531
    ADVANCE(pos, size, 8);
1532

    
1533
    /* Encode prefix body */
1534
    ip4_addr a = ip4_hton(net->prefix);
1535
    uint b = (net->pxlen + 7) / 8;
1536
    memcpy(pos, &a, b);
1537
    ADVANCE(pos, size, b);
1538

    
1539
    bgp_free_prefix(s->channel, px);
1540
  }
1541

    
1542
  return pos - buf;
1543
}
1544

    
1545
static void
1546
bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1547
{
1548
  while (len)
1549
  {
1550
    net_addr_vpn4 net;
1551
    u32 path_id = 0;
1552

    
1553
    /* Decode path ID */
1554
    if (s->add_path)
1555
    {
1556
      if (len < 5)
1557
        bgp_parse_error(s, 1);
1558

    
1559
      path_id = get_u32(pos);
1560
      ADVANCE(pos, len, 4);
1561
    }
1562

    
1563
    /* Decode prefix length */
1564
    uint l = *pos;
1565
    ADVANCE(pos, len, 1);
1566

    
1567
    if (len < ((l + 7) / 8))
1568
      bgp_parse_error(s, 1);
1569

    
1570
    /* Decode MPLS labels */
1571
    if (s->mpls)
1572
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1573

    
1574
    /* Decode route distinguisher */
1575
    if (l < 64)
1576
      bgp_parse_error(s, 1);
1577

    
1578
    u64 rd = get_u64(pos);
1579
    ADVANCE(pos, len, 8);
1580
    l -= 64;
1581

    
1582
    if (l > IP4_MAX_PREFIX_LENGTH)
1583
      bgp_parse_error(s, 10);
1584

    
1585
    /* Decode prefix body */
1586
    ip4_addr addr = IP4_NONE;
1587
    uint b = (l + 7) / 8;
1588
    memcpy(&addr, pos, b);
1589
    ADVANCE(pos, len, b);
1590

    
1591
    net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd);
1592
    net_normalize_vpn4(&net);
1593

    
1594
    // XXXX validate prefix
1595

    
1596
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1597
  }
1598
}
1599

    
1600

    
1601
static uint
1602
bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1603
{
1604
  byte *pos = buf;
1605

    
1606
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1607
  {
1608
    struct bgp_prefix *px = HEAD(buck->prefixes);
1609
    struct net_addr_vpn6 *net = (void *) px->net;
1610

    
1611
    /* Encode path ID */
1612
    if (s->add_path)
1613
    {
1614
      put_u32(pos, px->path_id);
1615
      ADVANCE(pos, size, 4);
1616
    }
1617

    
1618
    /* Encode prefix length */
1619
    *pos = 64 + net->pxlen;
1620
    ADVANCE(pos, size, 1);
1621

    
1622
    /* Encode MPLS labels */
1623
    if (s->mpls)
1624
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1625

    
1626
    /* Encode route distinguisher */
1627
    put_u64(pos, net->rd);
1628
    ADVANCE(pos, size, 8);
1629

    
1630
    /* Encode prefix body */
1631
    ip6_addr a = ip6_hton(net->prefix);
1632
    uint b = (net->pxlen + 7) / 8;
1633
    memcpy(pos, &a, b);
1634
    ADVANCE(pos, size, b);
1635

    
1636
    bgp_free_prefix(s->channel, px);
1637
  }
1638

    
1639
  return pos - buf;
1640
}
1641

    
1642
static void
1643
bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1644
{
1645
  while (len)
1646
  {
1647
    net_addr_vpn6 net;
1648
    u32 path_id = 0;
1649

    
1650
    /* Decode path ID */
1651
    if (s->add_path)
1652
    {
1653
      if (len < 5)
1654
        bgp_parse_error(s, 1);
1655

    
1656
      path_id = get_u32(pos);
1657
      ADVANCE(pos, len, 4);
1658
    }
1659

    
1660
    /* Decode prefix length */
1661
    uint l = *pos;
1662
    ADVANCE(pos, len, 1);
1663

    
1664
    if (len < ((l + 7) / 8))
1665
      bgp_parse_error(s, 1);
1666

    
1667
    /* Decode MPLS labels */
1668
    if (s->mpls)
1669
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1670

    
1671
    /* Decode route distinguisher */
1672
    if (l < 64)
1673
      bgp_parse_error(s, 1);
1674

    
1675
    u64 rd = get_u64(pos);
1676
    ADVANCE(pos, len, 8);
1677
    l -= 64;
1678

    
1679
    if (l > IP6_MAX_PREFIX_LENGTH)
1680
      bgp_parse_error(s, 10);
1681

    
1682
    /* Decode prefix body */
1683
    ip6_addr addr = IP6_NONE;
1684
    uint b = (l + 7) / 8;
1685
    memcpy(&addr, pos, b);
1686
    ADVANCE(pos, len, b);
1687

    
1688
    net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd);
1689
    net_normalize_vpn6(&net);
1690

    
1691
    // XXXX validate prefix
1692

    
1693
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1694
  }
1695
}
1696

    
1697

    
1698
static uint
1699
bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1700
{
1701
  byte *pos = buf;
1702

    
1703
  while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1704
  {
1705
    struct bgp_prefix *px = HEAD(buck->prefixes);
1706
    struct net_addr_flow4 *net = (void *) px->net;
1707
    uint flen = net->length - sizeof(net_addr_flow4);
1708

    
1709
    /* Encode path ID */
1710
    if (s->add_path)
1711
    {
1712
      put_u32(pos, px->path_id);
1713
      ADVANCE(pos, size, 4);
1714
    }
1715

    
1716
    if (flen > size)
1717
      break;
1718

    
1719
    /* Copy whole flow data including length */
1720
    memcpy(pos, net->data, flen);
1721
    ADVANCE(pos, size, flen);
1722

    
1723
    bgp_free_prefix(s->channel, px);
1724
  }
1725

    
1726
  return pos - buf;
1727
}
1728

    
1729
static void
1730
bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1731
{
1732
  while (len)
1733
  {
1734
    u32 path_id = 0;
1735

    
1736
    /* Decode path ID */
1737
    if (s->add_path)
1738
    {
1739
      if (len < 4)
1740
        bgp_parse_error(s, 1);
1741

    
1742
      path_id = get_u32(pos);
1743
      ADVANCE(pos, len, 4);
1744
    }
1745

    
1746
    if (len < 2)
1747
      bgp_parse_error(s, 1);
1748

    
1749
    /* Decode flow length */
1750
    uint hlen = flow_hdr_length(pos);
1751
    uint dlen = flow_read_length(pos);
1752
    uint flen = hlen + dlen;
1753
    byte *data = pos + hlen;
1754

    
1755
    if (len < flen)
1756
      bgp_parse_error(s, 1);
1757

    
1758
    /* Validate flow data */
1759
    enum flow_validated_state r = flow4_validate(data, dlen);
1760
    if (r != FLOW_ST_VALID)
1761
    {
1762
      log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1763
      bgp_parse_error(s, 1);
1764
    }
1765

    
1766
    if (data[0] != FLOW_TYPE_DST_PREFIX)
1767
    {
1768
      log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1769
      bgp_parse_error(s, 1);
1770
    }
1771

    
1772
    /* Decode dst prefix */
1773
    ip4_addr px = IP4_NONE;
1774
    uint pxlen = data[1];
1775

    
1776
    // FIXME: Use some generic function
1777
    memcpy(&px, data+2, BYTES(pxlen));
1778
    px = ip4_and(ip4_ntoh(px), ip4_mkmask(pxlen));
1779

    
1780
    /* Prepare the flow */
1781
    net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen);
1782
    net_fill_flow4(n, px, pxlen, pos, flen);
1783
    ADVANCE(pos, len, flen);
1784

    
1785
    bgp_rte_update(s, n, path_id, a);
1786
  }
1787
}
1788

    
1789

    
1790
static uint
1791
bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1792
{
1793
  byte *pos = buf;
1794

    
1795
  while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1796
  {
1797
    struct bgp_prefix *px = HEAD(buck->prefixes);
1798
    struct net_addr_flow6 *net = (void *) px->net;
1799
    uint flen = net->length - sizeof(net_addr_flow6);
1800

    
1801
    /* Encode path ID */
1802
    if (s->add_path)
1803
    {
1804
      put_u32(pos, px->path_id);
1805
      ADVANCE(pos, size, 4);
1806
    }
1807

    
1808
    if (flen > size)
1809
      break;
1810

    
1811
    /* Copy whole flow data including length */
1812
    memcpy(pos, net->data, flen);
1813
    ADVANCE(pos, size, flen);
1814

    
1815
    bgp_free_prefix(s->channel, px);
1816
  }
1817

    
1818
  return pos - buf;
1819
}
1820

    
1821
static void
1822
bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1823
{
1824
  while (len)
1825
  {
1826
    u32 path_id = 0;
1827

    
1828
    /* Decode path ID */
1829
    if (s->add_path)
1830
    {
1831
      if (len < 4)
1832
        bgp_parse_error(s, 1);
1833

    
1834
      path_id = get_u32(pos);
1835
      ADVANCE(pos, len, 4);
1836
    }
1837

    
1838
    if (len < 2)
1839
      bgp_parse_error(s, 1);
1840

    
1841
    /* Decode flow length */
1842
    uint hlen = flow_hdr_length(pos);
1843
    uint dlen = flow_read_length(pos);
1844
    uint flen = hlen + dlen;
1845
    byte *data = pos + hlen;
1846

    
1847
    if (len < flen)
1848
      bgp_parse_error(s, 1);
1849

    
1850
    /* Validate flow data */
1851
    enum flow_validated_state r = flow6_validate(data, dlen);
1852
    if (r != FLOW_ST_VALID)
1853
    {
1854
      log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1855
      bgp_parse_error(s, 1);
1856
    }
1857

    
1858
    if (data[0] != FLOW_TYPE_DST_PREFIX)
1859
    {
1860
      log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1861
      bgp_parse_error(s, 1);
1862
    }
1863

    
1864
    /* Decode dst prefix */
1865
    ip6_addr px = IP6_NONE;
1866
    uint pxlen = data[1];
1867

    
1868
    // FIXME: Use some generic function
1869
    memcpy(&px, data+2, BYTES(pxlen));
1870
    px = ip6_and(ip6_ntoh(px), ip6_mkmask(pxlen));
1871

    
1872
    /* Prepare the flow */
1873
    net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen);
1874
    net_fill_flow6(n, px, pxlen, pos, flen);
1875
    ADVANCE(pos, len, flen);
1876

    
1877
    bgp_rte_update(s, n, path_id, a);
1878
  }
1879
}
1880

    
1881

    
1882
static const struct bgp_af_desc bgp_af_table[] = {
1883
  {
1884
    .afi = BGP_AF_IPV4,
1885
    .net = NET_IP4,
1886
    .name = "ipv4",
1887
    .encode_nlri = bgp_encode_nlri_ip4,
1888
    .decode_nlri = bgp_decode_nlri_ip4,
1889
    .encode_next_hop = bgp_encode_next_hop_ip,
1890
    .decode_next_hop = bgp_decode_next_hop_ip,
1891
    .update_next_hop = bgp_update_next_hop_ip,
1892
  },
1893
  {
1894
    .afi = BGP_AF_IPV4_MC,
1895
    .net = NET_IP4,
1896
    .name = "ipv4-mc",
1897
    .encode_nlri = bgp_encode_nlri_ip4,
1898
    .decode_nlri = bgp_decode_nlri_ip4,
1899
    .encode_next_hop = bgp_encode_next_hop_ip,
1900
    .decode_next_hop = bgp_decode_next_hop_ip,
1901
    .update_next_hop = bgp_update_next_hop_ip,
1902
  },
1903
  {
1904
    .afi = BGP_AF_IPV4_MPLS,
1905
    .net = NET_IP4,
1906
    .mpls = 1,
1907
    .name = "ipv4-mpls",
1908
    .encode_nlri = bgp_encode_nlri_ip4,
1909
    .decode_nlri = bgp_decode_nlri_ip4,
1910
    .encode_next_hop = bgp_encode_next_hop_ip,
1911
    .decode_next_hop = bgp_decode_next_hop_ip,
1912
    .update_next_hop = bgp_update_next_hop_ip,
1913
  },
1914
  {
1915
    .afi = BGP_AF_IPV6,
1916
    .net = NET_IP6,
1917
    .name = "ipv6",
1918
    .encode_nlri = bgp_encode_nlri_ip6,
1919
    .decode_nlri = bgp_decode_nlri_ip6,
1920
    .encode_next_hop = bgp_encode_next_hop_ip,
1921
    .decode_next_hop = bgp_decode_next_hop_ip,
1922
    .update_next_hop = bgp_update_next_hop_ip,
1923
  },
1924
  {
1925
    .afi = BGP_AF_IPV6_MC,
1926
    .net = NET_IP6,
1927
    .name = "ipv6-mc",
1928
    .encode_nlri = bgp_encode_nlri_ip6,
1929
    .decode_nlri = bgp_decode_nlri_ip6,
1930
    .encode_next_hop = bgp_encode_next_hop_ip,
1931
    .decode_next_hop = bgp_decode_next_hop_ip,
1932
    .update_next_hop = bgp_update_next_hop_ip,
1933
  },
1934
  {
1935
    .afi = BGP_AF_IPV6_MPLS,
1936
    .net = NET_IP6,
1937
    .mpls = 1,
1938
    .name = "ipv6-mpls",
1939
    .encode_nlri = bgp_encode_nlri_ip6,
1940
    .decode_nlri = bgp_decode_nlri_ip6,
1941
    .encode_next_hop = bgp_encode_next_hop_ip,
1942
    .decode_next_hop = bgp_decode_next_hop_ip,
1943
    .update_next_hop = bgp_update_next_hop_ip,
1944
  },
1945
  {
1946
    .afi = BGP_AF_VPN4_MPLS,
1947
    .net = NET_VPN4,
1948
    .mpls = 1,
1949
    .name = "vpn4-mpls",
1950
    .encode_nlri = bgp_encode_nlri_vpn4,
1951
    .decode_nlri = bgp_decode_nlri_vpn4,
1952
    .encode_next_hop = bgp_encode_next_hop_vpn,
1953
    .decode_next_hop = bgp_decode_next_hop_vpn,
1954
    .update_next_hop = bgp_update_next_hop_ip,
1955
  },
1956
  {
1957
    .afi = BGP_AF_VPN6_MPLS,
1958
    .net = NET_VPN6,
1959
    .mpls = 1,
1960
    .name = "vpn6-mpls",
1961
    .encode_nlri = bgp_encode_nlri_vpn6,
1962
    .decode_nlri = bgp_decode_nlri_vpn6,
1963
    .encode_next_hop = bgp_encode_next_hop_vpn,
1964
    .decode_next_hop = bgp_decode_next_hop_vpn,
1965
    .update_next_hop = bgp_update_next_hop_ip,
1966
  },
1967
  {
1968
    .afi = BGP_AF_VPN4_MC,
1969
    .net = NET_VPN4,
1970
    .name = "vpn4-mc",
1971
    .encode_nlri = bgp_encode_nlri_vpn4,
1972
    .decode_nlri = bgp_decode_nlri_vpn4,
1973
    .encode_next_hop = bgp_encode_next_hop_vpn,
1974
    .decode_next_hop = bgp_decode_next_hop_vpn,
1975
    .update_next_hop = bgp_update_next_hop_ip,
1976
  },
1977
  {
1978
    .afi = BGP_AF_VPN6_MC,
1979
    .net = NET_VPN6,
1980
    .name = "vpn6-mc",
1981
    .encode_nlri = bgp_encode_nlri_vpn6,
1982
    .decode_nlri = bgp_decode_nlri_vpn6,
1983
    .encode_next_hop = bgp_encode_next_hop_vpn,
1984
    .decode_next_hop = bgp_decode_next_hop_vpn,
1985
    .update_next_hop = bgp_update_next_hop_ip,
1986
  },
1987
  {
1988
    .afi = BGP_AF_FLOW4,
1989
    .net = NET_FLOW4,
1990
    .no_igp = 1,
1991
    .name = "flow4",
1992
    .encode_nlri = bgp_encode_nlri_flow4,
1993
    .decode_nlri = bgp_decode_nlri_flow4,
1994
    .encode_next_hop = bgp_encode_next_hop_none,
1995
    .decode_next_hop = bgp_decode_next_hop_none,
1996
    .update_next_hop = bgp_update_next_hop_none,
1997
  },
1998
  {
1999
    .afi = BGP_AF_FLOW6,
2000
    .net = NET_FLOW6,
2001
    .no_igp = 1,
2002
    .name = "flow6",
2003
    .encode_nlri = bgp_encode_nlri_flow6,
2004
    .decode_nlri = bgp_decode_nlri_flow6,
2005
    .encode_next_hop = bgp_encode_next_hop_none,
2006
    .decode_next_hop = bgp_decode_next_hop_none,
2007
    .update_next_hop = bgp_update_next_hop_none,
2008
  },
2009
};
2010

    
2011
const struct bgp_af_desc *
2012
bgp_get_af_desc(u32 afi)
2013
{
2014
  uint i;
2015
  for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++)
2016
    if (bgp_af_table[i].afi == afi)
2017
      return &bgp_af_table[i];
2018

    
2019
  return NULL;
2020
}
2021

    
2022
static inline uint
2023
bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2024
{
2025
  return s->channel->desc->encode_nlri(s, buck, buf, end - buf);
2026
}
2027

    
2028
static inline uint
2029
bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf)
2030
{
2031
  return s->channel->desc->encode_next_hop(s, nh, buf, 255);
2032
}
2033

    
2034
void
2035
bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to)
2036
{
2037
  s->channel->desc->update_next_hop(s, a, to);
2038
}
2039

    
2040
#define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
2041

    
2042
static byte *
2043
bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
2044
{
2045
  /*
2046
   *        2 B        Withdrawn Routes Length (zero)
2047
   *        ---        IPv4 Withdrawn Routes NLRI (unused)
2048
   *        2 B        Total Path Attribute Length
2049
   *        var        Path Attributes
2050
   *        var        IPv4 Network Layer Reachability Information
2051
   */
2052

    
2053
  int lr, la;
2054

    
2055
  la = bgp_encode_attrs(s, buck->eattrs, buf+4, buf + MAX_ATTRS_LENGTH);
2056
  if (la < 0)
2057
  {
2058
    /* Attribute list too long */
2059
    bgp_withdraw_bucket(s->channel, buck);
2060
    return NULL;
2061
  }
2062

    
2063
  put_u16(buf+0, 0);
2064
  put_u16(buf+2, la);
2065

    
2066
  lr = bgp_encode_nlri(s, buck, buf+4+la, end);
2067

    
2068
  return buf+4+la+lr;
2069
}
2070

    
2071
/*
 * Build the body of an UPDATE announcing prefixes of bucket @buck in an
 * MP_REACH_NLRI attribute, which is written first, followed by the other
 * attributes. Returns the end of the written data, or NULL when the
 * attributes do not fit, in which case the bucket is withdrawn.
 */
static byte *
bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   *        2 B        IPv4 Withdrawn Routes Length (zero)
   *        ---        IPv4 Withdrawn Routes NLRI (unused)
   *        2 B        Total Path Attribute Length
   *        1 B        MP_REACH_NLRI hdr - Attribute Flags
   *        1 B        MP_REACH_NLRI hdr - Attribute Type Code
   *        2 B        MP_REACH_NLRI hdr - Length of Attribute Data
   *        2 B        MP_REACH_NLRI data - Address Family Identifier
   *        1 B        MP_REACH_NLRI data - Subsequent Address Family Identifier
   *        1 B        MP_REACH_NLRI data - Length of Next Hop Network Address
   *        var        MP_REACH_NLRI data - Network Address of Next Hop
   *        1 B        MP_REACH_NLRI data - Reserved (zero)
   *        var        MP_REACH_NLRI data - Network Layer Reachability Information
   *        var        Rest of Path Attributes
   *        ---        IPv4 Network Layer Reachability Information (unused)
   */

  /* Begin of MP_REACH_NLRI attribute */
  buf[4] = BAF_OPTIONAL | BAF_EXT_LEN;
  buf[5] = BA_MP_REACH_NLRI;
  put_u16(buf+6, 0);                /* Will be fixed later */
  put_af3(buf+8, s->channel->afi);
  byte *wp = buf+11;

  /* Encode attributes to temporary buffer, they go behind the NLRI */
  byte *tmp = alloca(MAX_ATTRS_LENGTH);
  int attrs_len = bgp_encode_attrs(s, buck->eattrs, tmp, tmp + MAX_ATTRS_LENGTH);
  if (attrs_len < 0)
  {
    /* Attribute list too long */
    bgp_withdraw_bucket(s->channel, buck);
    return NULL;
  }

  /* Encode the next hop, preceded by its length byte */
  int nh_len = bgp_encode_next_hop(s, s->mp_next_hop, wp+1);
  *wp = nh_len;
  wp += 1+nh_len;

  /* Reserved field */
  *wp++ = 0;

  /* Encode the NLRI, keeping room for the attributes at the end */
  int nlri_len = bgp_encode_nlri(s, buck, wp, end - attrs_len);
  wp += nlri_len;

  /* End of MP_REACH_NLRI attribute, update data length */
  put_u16(buf+6, wp-buf-8);

  /* Copy remaining attributes */
  memcpy(wp, tmp, attrs_len);
  wp += attrs_len;

  /* Initial UPDATE fields */
  put_u16(buf+0, 0);
  put_u16(buf+2, wp-buf-4);

  return wp;
}

#undef MAX_ATTRS_LENGTH
2137

    
2138
/*
 * Build the body of a plain IPv4 withdraw UPDATE into @buf. The NLRI of @buck
 * is encoded by bgp_encode_nlri() with @end as the limit. Returns the position
 * just past the written body.
 */
static byte *
bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   * Body layout:
   *
   *   2 B   Withdrawn Routes Length
   *   var   IPv4 Withdrawn Routes NLRI
   *   2 B   Total Path Attribute Length (zero)
   *   ---   Path Attributes (unused)
   *   ---   IPv4 Network Layer Reachability Information (unused)
   */

  byte *nlri = buf + 2;
  uint nlri_len = bgp_encode_nlri(s, buck, nlri, end);
  byte *tail = nlri + nlri_len;

  put_u16(buf, nlri_len);       /* Withdrawn Routes Length */
  put_u16(tail, 0);             /* Total Path Attribute Length */

  return tail + 2;
}
2156

    
2157
/*
 * Build the body of an UPDATE that withdraws the prefixes of @buck through a
 * single MP_UNREACH_NLRI attribute. Returns the position just past the
 * written body.
 */
static byte *
bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /*
   * Body layout:
   *
   *   2 B   Withdrawn Routes Length (zero)
   *   ---   IPv4 Withdrawn Routes NLRI (unused)
   *   2 B   Total Path Attribute Length
   *   1 B   MP_UNREACH_NLRI hdr - Attribute Flags
   *   1 B   MP_UNREACH_NLRI hdr - Attribute Type Code
   *   2 B   MP_UNREACH_NLRI hdr - Length of Attribute Data
   *   2 B   MP_UNREACH_NLRI data - Address Family Identifier
   *   1 B   MP_UNREACH_NLRI data - Subsequent Address Family Identifier
   *   var   MP_UNREACH_NLRI data - Network Layer Reachability Information
   *   ---   IPv4 Network Layer Reachability Information (unused)
   */

  byte *attr = buf + 4;         /* Start of the MP_UNREACH_NLRI attribute */
  byte *nlri = buf + 11;        /* NLRI follows the 3-byte AFI/SAFI */

  /* The NLRI is encoded first, as all the length fields depend on its size */
  uint nlri_len = bgp_encode_nlri(s, buck, nlri, end);

  /* Initial UPDATE fields */
  put_u16(buf+0, 0);
  put_u16(buf+2, 7+nlri_len);

  /* MP_UNREACH_NLRI attribute header and address family */
  attr[0] = BAF_OPTIONAL | BAF_EXT_LEN;
  attr[1] = BA_MP_UNREACH_NLRI;
  put_u16(attr+2, 3+nlri_len);
  put_af3(attr+4, s->channel->afi);

  return nlri + nlri_len;
}
2186

    
2187
/*
 * Build the body of the next UPDATE for channel @c into @buf.
 *
 * A non-empty withdraw bucket is served first, then the head of the bucket
 * queue. Empty buckets found at the head of the queue are freed and skipped.
 * When a reach builder returns NULL, the next bucket is tried. Returns the end
 * of the written body, or NULL when there is nothing left to send.
 */
static byte *
bgp_create_update(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH);
  struct bgp_write_state s;
  byte *res = NULL;

  for (;;)
  {
    /* (Re)initialize write state for every attempt */
    s = (struct bgp_write_state) {
      .proto = p,
      .channel = c,
      .pool = bgp_linpool,
      .mp_reach = (c->afi != BGP_AF_IPV4) || c->ext_next_hop,
      .as4_session = p->as4_session,
      .add_path = c->add_path_tx,
      .mpls = c->desc->mpls,
    };

    /* Withdrawals go first */
    struct bgp_bucket *buck = c->withdraw_bucket;
    if (buck && !EMPTY_LIST(buck->prefixes))
    {
      if ((c->afi == BGP_AF_IPV4) && !c->ext_next_hop)
        res = bgp_create_ip_unreach(&s, buck, buf, end);
      else
        res = bgp_create_mp_unreach(&s, buck, buf, end);

      break;
    }

    /* No more prefixes to send */
    if (EMPTY_LIST(c->bucket_queue))
      return NULL;

    buck = HEAD(c->bucket_queue);

    /* Drop buckets that became empty while waiting in the queue */
    if (EMPTY_LIST(buck->prefixes))
    {
      bgp_free_bucket(c, buck);
      continue;
    }

    if (s.mp_reach)
      res = bgp_create_mp_reach(&s, buck, buf, end);
    else
      res = bgp_create_ip_reach(&s, buck, buf, end);

    /* A fully sent bucket is released, otherwise it is deferred */
    if (EMPTY_LIST(buck->prefixes))
      bgp_free_bucket(c, buck);
    else
      bgp_defer_bucket(c, buck);

    /* NULL means nothing was produced from this bucket; try the next one */
    if (res)
      break;
  }

  BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
  lp_flush(s.pool);

  return res;
}
2254

    
2255
static byte *
2256
bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf)
2257
{
2258
  /* Empty update packet */
2259
  put_u32(buf, 0);
2260

    
2261
  return buf+4;
2262
}
2263

    
2264
/*
 * Multiprotocol End-of-RIB marker: an UPDATE body whose only path attribute is
 * an MP_UNREACH_NLRI carrying just the AFI/SAFI of channel @c.
 */
static byte *
bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf)
{
  byte *attr = buf + 4;

  /* No withdrawn routes, 6 bytes of path attributes (offsets 4--9) */
  put_u16(buf+0, 0);
  put_u16(buf+2, 6);

  /* Empty MP_UNREACH_NLRI attribute, 3 bytes of data (offsets 7--9) */
  attr[0] = BAF_OPTIONAL;
  attr[1] = BA_MP_UNREACH_NLRI;
  attr[2] = 3;
  put_af3(attr+3, c->afi);

  return buf+10;
}
2278

    
2279
/*
 * Write the End-of-RIB marker for channel @c into @buf, using the plain form
 * for the IPv4 address family and the multiprotocol form for all others.
 */
static byte *
bgp_create_end_mark(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");

  if (c->afi == BGP_AF_IPV4)
    return bgp_create_ip_end_mark(c, buf);

  return bgp_create_mp_end_mark(c, buf);
}
2290

    
2291
static inline void
2292
bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi)
2293
{
2294
  struct bgp_proto *p = s->proto;
2295
  struct bgp_channel *c = bgp_get_channel(p, afi);
2296

    
2297
  BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
2298

    
2299
  if (!c)
2300
    DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2301

    
2302
  if (c->load_state == BFS_LOADING)
2303
    c->load_state = BFS_NONE;
2304

    
2305
  if (p->p.gr_recovery)
2306
    channel_graceful_restart_unlock(&c->c);
2307

    
2308
  if (c->gr_active)
2309
    bgp_graceful_restart_done(c);
2310
}
2311

    
2312
static inline void
2313
bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len)
2314
{
2315
  struct bgp_channel *c = bgp_get_channel(s->proto, afi);
2316
  rta *a = NULL;
2317

    
2318
  if (!c)
2319
    DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2320

    
2321
  s->channel = c;
2322
  s->add_path = c->add_path_rx;
2323
  s->mpls = c->desc->mpls;
2324

    
2325
  s->last_id = 0;
2326
  s->last_src = s->proto->p.main_source;
2327

    
2328
  /*
2329
   * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
2330
   * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
2331
   * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
2332
   * decode_next_hop hooks) by restoring a->eattrs afterwards.
2333
   */
2334

    
2335
  if (ea)
2336
  {
2337
    a = allocz(RTA_MAX_SIZE);
2338

    
2339
    a->source = RTS_BGP;
2340
    a->scope = SCOPE_UNIVERSE;
2341
    a->from = s->proto->remote_ip;
2342
    a->eattrs = ea;
2343

    
2344
    c->desc->decode_next_hop(s, nh, nh_len, a);
2345

    
2346
    /* Handle withdraw during next hop decoding */
2347
    if (s->err_withdraw)
2348
      a = NULL;
2349
  }
2350

    
2351
  c->desc->decode_nlri(s, nlri, len, a);
2352

    
2353
  rta_free(s->cached_rta);
2354
  s->cached_rta = NULL;
2355
}
2356

    
2357
static void
2358
bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
2359
{
2360
  struct bgp_proto *p = conn->bgp;
2361
  ea_list *ea = NULL;
2362

    
2363
  BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
2364

    
2365
  /* Workaround for some BGP implementations that skip initial KEEPALIVE */
2366
  if (conn->state == BS_OPENCONFIRM)
2367
    bgp_conn_enter_established_state(conn);
2368

    
2369
  if (conn->state != BS_ESTABLISHED)
2370
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2371

    
2372
  bgp_start_timer(conn->hold_timer, conn->hold_time);
2373

    
2374
  /* Initialize parse state */
2375
  struct bgp_parse_state s = {
2376
    .proto = p,
2377
    .pool = bgp_linpool,
2378
    .as4_session = p->as4_session,
2379
  };
2380

    
2381
  /* Parse error handler */
2382
  if (setjmp(s.err_jmpbuf))
2383
  {
2384
    bgp_error(conn, 3, s.err_subcode, NULL, 0);
2385
    goto done;
2386
  }
2387

    
2388
  /* Check minimal length */
2389
  if (len < 23)
2390
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2391

    
2392
  /* Skip fixed header */
2393
  uint pos = 19;
2394

    
2395
  /*
2396
   *        UPDATE message format
2397
   *
2398
   *        2 B        IPv4 Withdrawn Routes Length
2399
   *        var        IPv4 Withdrawn Routes NLRI
2400
   *        2 B        Total Path Attribute Length
2401
   *        var        Path Attributes
2402
   *        var        IPv4 Reachable Routes NLRI
2403
   */
2404

    
2405
  s.ip_unreach_len = get_u16(pkt + pos);
2406
  s.ip_unreach_nlri = pkt + pos + 2;
2407
  pos += 2 + s.ip_unreach_len;
2408

    
2409
  if (pos + 2 > len)
2410
    bgp_parse_error(&s, 1);
2411

    
2412
  s.attr_len = get_u16(pkt + pos);
2413
  s.attrs = pkt + pos + 2;
2414
  pos += 2 + s.attr_len;
2415

    
2416
  if (pos > len)
2417
    bgp_parse_error(&s, 1);
2418

    
2419
  s.ip_reach_len = len - pos;
2420
  s.ip_reach_nlri = pkt + pos;
2421

    
2422

    
2423
  if (s.attr_len)
2424
    ea = bgp_decode_attrs(&s, s.attrs, s.attr_len);
2425
  else
2426
    ea = NULL;
2427

    
2428
  /* Check for End-of-RIB marker */
2429
  if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len)
2430
  { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; }
2431

    
2432
  /* Check for MP End-of-RIB marker */
2433
  if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len &&
2434
      !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af)
2435
  { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; }
2436

    
2437
  if (s.ip_unreach_len)
2438
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0);
2439

    
2440
  if (s.mp_unreach_len)
2441
    bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0);
2442

    
2443
  if (s.ip_reach_len)
2444
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len,
2445
                    ea, s.ip_next_hop_data, s.ip_next_hop_len);
2446

    
2447
  if (s.mp_reach_len)
2448
    bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len,
2449
                    ea, s.mp_next_hop_data, s.mp_next_hop_len);
2450

    
2451
done:
2452
  rta_free(s.cached_rta);
2453
  lp_flush(s.pool);
2454
  return;
2455
}
2456

    
2457
static uint
2458
bgp_find_update_afi(byte *pos, uint len)
2459
{
2460
  /*
2461
   * This is stripped-down version of bgp_rx_update(), bgp_decode_attrs() and
2462
   * bgp_decode_mp_[un]reach_nlri() used by MRT code in order to find out which
2463
   * AFI/SAFI is associated with incoming UPDATE. Returns 0 for framing errors.
2464
   */
2465
  if (len < 23)
2466
    return 0;
2467

    
2468
  /* Assume there is no withrawn NLRI, read lengths and move to attribute list */
2469
  uint wlen = get_u16(pos + 19);
2470
  uint alen = get_u16(pos + 21);
2471
  ADVANCE(pos, len, 23);
2472

    
2473
  /* Either non-zero withdrawn NLRI, non-zero reachable NLRI, or IPv4 End-of-RIB */
2474
  if ((wlen != 0) || (alen < len) || !alen)
2475
    return BGP_AF_IPV4;
2476

    
2477
  if (alen > len)
2478
    return 0;
2479

    
2480
  /* Process attribute list (alen == len) */
2481
  while (len)
2482
  {
2483
    if (len < 2)
2484
      return 0;
2485

    
2486
    uint flags = pos[0];
2487
    uint code = pos[1];
2488
    ADVANCE(pos, len, 2);
2489

    
2490
    uint ll = !(flags & BAF_EXT_LEN) ? 1 : 2;
2491
    if (len < ll)
2492
      return 0;
2493

    
2494
    /* Read attribute length and move to attribute body */
2495
    alen = (ll == 1) ? get_u8(pos) : get_u16(pos);
2496
    ADVANCE(pos, len, ll);
2497

    
2498
    if (len < alen)
2499
      return 0;
2500

    
2501
    /* Found MP NLRI */
2502
    if ((code == BA_MP_REACH_NLRI) || (code == BA_MP_UNREACH_NLRI))
2503
    {
2504
      if (alen < 3)
2505
        return 0;
2506

    
2507
      return BGP_AF(get_u16(pos), pos[2]);
2508
    }
2509

    
2510
    /* Move to the next attribute */
2511
    ADVANCE(pos, len, alen);
2512
  }
2513

    
2514
  /* No basic or MP NLRI, but there are some attributes -> error */
2515
  return 0;
2516
}
2517

    
2518

    
2519
/*
2520
 *        ROUTE-REFRESH
2521
 */
2522

    
2523
/*
 * Write the 4-byte body of a ROUTE-REFRESH request for channel @c into @buf
 * and return the position just past it.
 */
static inline byte *
bgp_create_route_refresh(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *body_end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");

  /* Original route refresh request, RFC 2918; byte 2 holds the subtype */
  put_af4(buf, c->afi);
  buf[2] = BGP_RR_REQUEST;

  return body_end;
}
2536

    
2537
/*
 * Write the 4-byte body of a Begin-of-Route-Refresh message for channel @c
 * into @buf and return the position just past it.
 */
static inline byte *
bgp_create_begin_refresh(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *body_end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");

  /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
  put_af4(buf, c->afi);
  buf[2] = BGP_RR_BEGIN;

  return body_end;
}
2550

    
2551
/*
 * Write the 4-byte body of an End-of-Route-Refresh message for channel @c
 * into @buf and return the position just past it.
 */
static inline byte *
bgp_create_end_refresh(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *body_end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RR");

  /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
  put_af4(buf, c->afi);
  buf[2] = BGP_RR_END;

  return body_end;
}
2564

    
2565
static void
2566
bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
2567
{
2568
  struct bgp_proto *p = conn->bgp;
2569

    
2570
  if (conn->state != BS_ESTABLISHED)
2571
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2572

    
2573
  if (!conn->local_caps->route_refresh)
2574
  { bgp_error(conn, 1, 3, pkt+18, 1); return; }
2575

    
2576
  if (len < (BGP_HEADER_LENGTH + 4))
2577
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2578

    
2579
  if (len > (BGP_HEADER_LENGTH + 4))
2580
  { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
2581

    
2582
  struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19));
2583
  if (!c)
2584
  {
2585
    log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
2586
        p->p.name, pkt[21], get_u16(pkt+19), pkt[22]);
2587
    return;
2588
  }
2589

    
2590
  /* RFC 7313 redefined reserved field as RR message subtype */
2591
  uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST;
2592

    
2593
  switch (subtype)
2594
  {
2595
  case BGP_RR_REQUEST:
2596
    BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
2597
    channel_request_feeding(&c->c);
2598
    break;
2599

    
2600
  case BGP_RR_BEGIN:
2601
    BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
2602
    bgp_refresh_begin(c);
2603
    break;
2604

    
2605
  case BGP_RR_END:
2606
    BGP_TRACE(D_PACKETS, "Got END-OF-RR");
2607
    bgp_refresh_end(c);
2608
    break;
2609

    
2610
  default:
2611
    log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
2612
        p->p.name, subtype);
2613
    break;
2614
  }
2615
}
2616

    
2617
static inline struct bgp_channel *
2618
bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn)
2619
{
2620
  uint i = conn->last_channel;
2621

    
2622
  /* Try the last channel, but at most several times */
2623
  if ((conn->channels_to_send & (1 << i)) &&
2624
      (conn->last_channel_count < 16))
2625
    goto found;
2626

    
2627
  /* Find channel with non-zero channels_to_send */
2628
  do
2629
  {
2630
    i++;
2631
    if (i >= p->channel_count)
2632
      i = 0;
2633
  }
2634
  while (! (conn->channels_to_send & (1 << i)));
2635

    
2636
  /* Use that channel */
2637
  conn->last_channel = i;
2638
  conn->last_channel_count = 0;
2639

    
2640
found:
2641
  conn->last_channel_count++;
2642
  return p->channel_map[i];
2643
}
2644

    
2645
static inline int
2646
bgp_send(struct bgp_conn *conn, uint type, uint len)
2647
{
2648
  sock *sk = conn->sk;
2649
  byte *buf = sk->tbuf;
2650

    
2651
  memset(buf, 0xff, 16);                /* Marker */
2652
  put_u16(buf+16, len);
2653
  buf[18] = type;
2654

    
2655
  return sk_send(sk, len);
2656
}
2657

    
2658
/**
2659
 * bgp_fire_tx - transmit packets
2660
 * @conn: connection
2661
 *
2662
 * Whenever the transmit buffers of the underlying TCP connection
2663
 * are free and we have any packets queued for sending, the socket functions
2664
 * call bgp_fire_tx() which takes care of selecting the highest priority packet
2665
 * queued (Notification > Keepalive > Open > Update), assembling its header
2666
 * and body and sending it to the connection.
2667
 */
2668
static int
2669
bgp_fire_tx(struct bgp_conn *conn)
2670
{
2671
  struct bgp_proto *p = conn->bgp;
2672
  struct bgp_channel *c;
2673
  byte *buf, *pkt, *end;
2674
  uint s;
2675

    
2676
  if (!conn->sk)
2677
    return 0;
2678

    
2679
  buf = conn->sk->tbuf;
2680
  pkt = buf + BGP_HEADER_LENGTH;
2681
  s = conn->packets_to_send;
2682

    
2683
  if (s & (1 << PKT_SCHEDULE_CLOSE))
2684
  {
2685
    /* We can finally close connection and enter idle state */
2686
    bgp_conn_enter_idle_state(conn);
2687
    return 0;
2688
  }
2689
  if (s & (1 << PKT_NOTIFICATION))
2690
  {
2691
    conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE;
2692
    end = bgp_create_notification(conn, pkt);
2693
    return bgp_send(conn, PKT_NOTIFICATION, end - buf);
2694
  }
2695
  else if (s & (1 << PKT_OPEN))
2696
  {
2697
    conn->packets_to_send &= ~(1 << PKT_OPEN);
2698
    end = bgp_create_open(conn, pkt);
2699
    return bgp_send(conn, PKT_OPEN, end - buf);
2700
  }
2701
  else if (s & (1 << PKT_KEEPALIVE))
2702
  {
2703
    conn->packets_to_send &= ~(1 << PKT_KEEPALIVE);
2704
    BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
2705
    bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
2706
    return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH);
2707
  }
2708
  else while (conn->channels_to_send)
2709
  {
2710
    c = bgp_get_channel_to_send(p, conn);
2711
    s = c->packets_to_send;
2712

    
2713
    if (s & (1 << PKT_ROUTE_REFRESH))
2714
    {
2715
      c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH);
2716
      end = bgp_create_route_refresh(c, pkt);
2717
      return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2718
    }
2719
    else if (s & (1 << PKT_BEGIN_REFRESH))
2720
    {
2721
      /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
2722
      c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH);
2723
      end = bgp_create_begin_refresh(c, pkt);
2724
      return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2725
    }
2726
    else if (s & (1 << PKT_UPDATE))
2727
    {
2728
      end = bgp_create_update(c, pkt);
2729
      if (end)
2730
        return bgp_send(conn, PKT_UPDATE, end - buf);
2731

    
2732
      /* No update to send, perhaps we need to send End-of-RIB or EoRR */
2733
      c->packets_to_send = 0;
2734
      conn->channels_to_send &= ~(1 << c->index);
2735

    
2736
      if (c->feed_state == BFS_LOADED)
2737
      {
2738
        c->feed_state = BFS_NONE;
2739
        end = bgp_create_end_mark(c, pkt);
2740
        return bgp_send(conn, PKT_UPDATE, end - buf);
2741
      }
2742

    
2743
      else if (c->feed_state == BFS_REFRESHED)
2744
      {
2745
        c->feed_state = BFS_NONE;
2746
        end = bgp_create_end_refresh(c, pkt);
2747
        return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2748
      }
2749
    }
2750
    else if (s)
2751
      bug("Channel packets_to_send: %x", s);
2752

    
2753
    c->packets_to_send = 0;
2754
    conn->channels_to_send &= ~(1 << c->index);
2755
  }
2756

    
2757
  return 0;
2758
}
2759

    
2760
/**
2761
 * bgp_schedule_packet - schedule a packet for transmission
2762
 * @conn: connection
2763
 * @c: channel
2764
 * @type: packet type
2765
 *
2766
 * Schedule a packet of type @type to be sent as soon as possible.
2767
 */
2768
void
2769
bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
2770
{
2771
  ASSERT(conn->sk);
2772

    
2773
  DBG("BGP: Scheduling packet type %d\n", type);
2774

    
2775
  if (c)
2776
  {
2777
    if (! conn->channels_to_send)
2778
    {
2779
      conn->last_channel = c->index;
2780
      conn->last_channel_count = 0;
2781
    }
2782

    
2783
    c->packets_to_send |= 1 << type;
2784
    conn->channels_to_send |= 1 << c->index;
2785
  }
2786
  else
2787
    conn->packets_to_send |= 1 << type;
2788

    
2789
  if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev))
2790
    ev_schedule(conn->tx_ev);
2791
}
2792

    
2793
/*
 * Event hook: call bgp_fire_tx() on the connection passed as @vconn until it
 * stops returning a positive value.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;

  DBG("BGP: kicking TX\n");

  for (;;)
    if (bgp_fire_tx(conn) <= 0)
      break;
}
2802

    
2803
void
2804
bgp_tx(sock *sk)
2805
{
2806
  struct bgp_conn *conn = sk->data;
2807

    
2808
  DBG("BGP: TX hook\n");
2809
  while (bgp_fire_tx(conn) > 0)
2810
    ;
2811
}
2812

    
2813

    
2814
static struct {
2815
  byte major, minor;
2816
  byte *msg;
2817
} bgp_msg_table[] = {
2818
  { 1, 0, "Invalid message header" },
2819
  { 1, 1, "Connection not synchronized" },
2820
  { 1, 2, "Bad message length" },
2821
  { 1, 3, "Bad message type" },
2822
  { 2, 0, "Invalid OPEN message" },
2823
  { 2, 1, "Unsupported version number" },
2824
  { 2, 2, "Bad peer AS" },
2825
  { 2, 3, "Bad BGP identifier" },
2826
  { 2, 4, "Unsupported optional parameter" },
2827
  { 2, 5, "Authentication failure" },
2828
  { 2, 6, "Unacceptable hold time" },
2829
  { 2, 7, "Required capability missing" }, /* [RFC5492] */
2830
  { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
2831
  { 3, 0, "Invalid UPDATE message" },
2832
  { 3, 1, "Malformed attribute list" },
2833
  { 3, 2, "Unrecognized well-known attribute" },
2834
  { 3, 3, "Missing mandatory attribute" },
2835
  { 3, 4, "Invalid attribute flags" },
2836
  { 3, 5, "Invalid attribute length" },
2837
  { 3, 6, "Invalid ORIGIN attribute" },
2838
  { 3, 7, "AS routing loop" },                /* Deprecated */
2839
  { 3, 8, "Invalid NEXT_HOP attribute" },
2840
  { 3, 9, "Optional attribute error" },
2841
  { 3, 10, "Invalid network field" },
2842
  { 3, 11, "Malformed AS_PATH" },
2843
  { 4, 0, "Hold timer expired" },
2844
  { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
2845
  { 5, 1, "Unexpected message in OpenSent state" },
2846
  { 5, 2, "Unexpected message in OpenConfirm state" },
2847
  { 5, 3, "Unexpected message in Established state" },
2848
  { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
2849
  { 6, 1, "Maximum number of prefixes reached" },
2850
  { 6, 2, "Administrative shutdown" },
2851
  { 6, 3, "Peer de-configured" },
2852
  { 6, 4, "Administrative reset" },
2853
  { 6, 5, "Connection rejected" },
2854
  { 6, 6, "Other configuration change" },
2855
  { 6, 7, "Connection collision resolution" },
2856
  { 6, 8, "Out of Resources" },
2857
  { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
2858
  { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
2859
};
2860

    
2861
/**
2862
 * bgp_error_dsc - return BGP error description
2863
 * @code: BGP error code
2864
 * @subcode: BGP error subcode
2865
 *
2866
 * bgp_error_dsc() returns error description for BGP errors
2867
 * which might be static string or given temporary buffer.
2868
 */
2869
const char *
2870
bgp_error_dsc(uint code, uint subcode)
2871
{
2872
  static char buff[32];
2873
  uint i;
2874

    
2875
  for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
2876
    if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
2877
      return bgp_msg_table[i].msg;
2878

    
2879
  bsprintf(buff, "Unknown error %u.%u", code, subcode);
2880
  return buff;
2881
}
2882

    
2883
/* RFC 8203 - shutdown communication message */
2884
static int
2885
bgp_handle_message(struct bgp_proto *p, byte *data, uint len, byte **bp)
2886
{
2887
  byte *msg = data + 1;
2888
  uint msg_len = data[0];
2889
  uint i;
2890

    
2891
  /* Handle zero length message */
2892
  if (msg_len == 0)
2893
    return 1;
2894

    
2895
  /* Handle proper message */
2896
  if ((msg_len > 255) && (msg_len + 1 > len))
2897
    return 0;
2898

    
2899
  /* Some elementary cleanup */
2900
  for (i = 0; i < msg_len; i++)
2901
    if (msg[i] < ' ')
2902
      msg[i] = ' ';
2903

    
2904
  proto_set_message(&p->p, msg, msg_len);
2905
  *bp += bsprintf(*bp, ": \"%s\"", p->p.message);
2906
  return 1;
2907
}
2908

    
2909
void
2910
bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len)
2911
{
2912
  byte argbuf[256], *t = argbuf;
2913
  uint i;
2914

    
2915
  /* Don't report Cease messages generated by myself */
2916
  if (code == 6 && class == BE_BGP_TX)
2917
    return;
2918

    
2919
  /* Reset shutdown message */
2920
  if ((code == 6) && ((subcode == 2) || (subcode == 4)))
2921
    proto_set_message(&p->p, NULL, 0);
2922

    
2923
  if (len)
2924
    {
2925
      /* Bad peer AS - we would like to print the AS */
2926
      if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
2927
        {
2928
          t += bsprintf(t, ": %u", (len == 2) ? get_u16(data) : get_u32(data));
2929
          goto done;
2930
        }
2931

    
2932
      /* RFC 8203 - shutdown communication */
2933
      if (((code == 6) && ((subcode == 2) || (subcode == 4))))
2934
        if (bgp_handle_message(p, data, len, &t))
2935
          goto done;
2936

    
2937
      *t++ = ':';
2938
      *t++ = ' ';
2939
      if (len > 16)
2940
        len = 16;
2941
      for (i=0; i<len; i++)
2942
        t += bsprintf(t, "%02x", data[i]);
2943
    }
2944

    
2945
done:
2946
  *t = 0;
2947
  const byte *dsc = bgp_error_dsc(code, subcode);
2948
  log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, dsc, argbuf);
2949
}
2950

    
2951
static void
2952
bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
2953
{
2954
  struct bgp_proto *p = conn->bgp;
2955

    
2956
  if (len < 21)
2957
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2958

    
2959
  uint code = pkt[19];
2960
  uint subcode = pkt[20];
2961
  int err = (code != 6);
2962

    
2963
  bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
2964
  bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
2965

    
2966
  bgp_conn_enter_close_state(conn);
2967
  bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
2968

    
2969
  if (err)
2970
  {
2971
    bgp_update_startup_delay(p);
2972
    bgp_stop(p, 0, NULL, 0);
2973
  }
2974
  else
2975
  {
2976
    uint subcode_bit = 1 << ((subcode <= 8) ? subcode : 0);
2977
    if (p->cf->disable_after_cease & subcode_bit)
2978
    {
2979
      log(L_INFO "%s: Disabled after Cease notification", p->p.name);
2980
      p->startup_delay = 0;
2981
      p->p.disabled = 1;
2982
    }
2983
  }
2984
}
2985

    
2986
static void
2987
bgp_rx_keepalive(struct bgp_conn *conn)
2988
{
2989
  struct bgp_proto *p = conn->bgp;
2990

    
2991
  BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
2992
  bgp_start_timer(conn->hold_timer, conn->hold_time);
2993

    
2994
  if (conn->state == BS_OPENCONFIRM)
2995
  { bgp_conn_enter_established_state(conn); return; }
2996

    
2997
  if (conn->state != BS_ESTABLISHED)
2998
    bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
2999
}
3000

    
3001

    
3002
/**
3003
 * bgp_rx_packet - handle a received packet
3004
 * @conn: BGP connection
3005
 * @pkt: start of the packet
3006
 * @len: packet size
3007
 *
3008
 * bgp_rx_packet() takes a newly received packet and calls the corresponding
3009
 * packet handler according to the packet type.
3010
 */
3011
static void
3012
bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len)
3013
{
3014
  byte type = pkt[18];
3015

    
3016
  DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
3017

    
3018
  if (conn->bgp->p.mrtdump & MD_MESSAGES)
3019
    bgp_dump_message(conn, pkt, len);
3020

    
3021
  switch (type)
3022
  {
3023
  case PKT_OPEN:                return bgp_rx_open(conn, pkt, len);
3024
  case PKT_UPDATE:                return bgp_rx_update(conn, pkt, len);
3025
  case PKT_NOTIFICATION:        return bgp_rx_notification(conn, pkt, len);
3026
  case PKT_KEEPALIVE:                return bgp_rx_keepalive(conn);
3027
  case PKT_ROUTE_REFRESH:        return bgp_rx_route_refresh(conn, pkt, len);
3028
  default:                        bgp_error(conn, 1, 3, pkt+18, 1);
3029
  }
3030
}
3031

    
3032
/**
3033
 * bgp_rx - handle received data
3034
 * @sk: socket
3035
 * @size: amount of data received
3036
 *
3037
 * bgp_rx() is called by the socket layer whenever new data arrive from
3038
 * the underlying TCP connection. It assembles the data fragments to packets,
3039
 * checks their headers and framing and passes complete packets to
3040
 * bgp_rx_packet().
3041
 */
3042
int
3043
bgp_rx(sock *sk, uint size)
3044
{
3045
  struct bgp_conn *conn = sk->data;
3046
  byte *pkt_start = sk->rbuf;
3047
  byte *end = pkt_start + size;
3048
  uint i, len;
3049

    
3050
  DBG("BGP: RX hook: Got %d bytes\n", size);
3051
  while (end >= pkt_start + BGP_HEADER_LENGTH)
3052
    {
3053
      if ((conn->state == BS_CLOSE) || (conn->sk != sk))
3054
        return 0;
3055
      for(i=0; i<16; i++)
3056
        if (pkt_start[i] != 0xff)
3057
          {
3058
            bgp_error(conn, 1, 1, NULL, 0);
3059
            break;
3060
          }
3061
      len = get_u16(pkt_start+16);
3062
      if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn)))
3063
        {
3064
          bgp_error(conn, 1, 2, pkt_start+16, 2);
3065
          break;
3066
        }
3067
      if (end < pkt_start + len)
3068
        break;
3069
      bgp_rx_packet(conn, pkt_start, len);
3070
      pkt_start += len;
3071
    }
3072
  if (pkt_start != sk->rbuf)
3073
    {
3074
      memmove(sk->rbuf, pkt_start, end - pkt_start);
3075
      sk->rpos = sk->rbuf + (end - pkt_start);
3076
    }
3077
  return 0;
3078
}