Statistics
| Branch: | Revision:

iof-bird-daemon / proto / bgp / packets.c @ c49e4a65

History | View | Annotate | Download (66.6 KB)

1
/*
2
 *        BIRD -- BGP Packet Processing
3
 *
4
 *        (c) 2000 Martin Mares <mj@ucw.cz>
5
 *        (c) 2008--2016 Ondrej Zajicek <santiago@crfreenet.org>
6
 *        (c) 2008--2016 CZ.NIC z.s.p.o.
7
 *
8
 *        Can be freely distributed and used under the terms of the GNU GPL.
9
 */
10

    
11
#undef LOCAL_DEBUG
12

    
13
#include <stdlib.h>
14

    
15
#include "nest/bird.h"
16
#include "nest/iface.h"
17
#include "nest/protocol.h"
18
#include "nest/route.h"
19
#include "nest/attrs.h"
20
#include "nest/mrtdump.h"
21
#include "conf/conf.h"
22
#include "lib/unaligned.h"
23
#include "lib/flowspec.h"
24
#include "lib/socket.h"
25

    
26
#include "nest/cli.h"
27

    
28
#include "bgp.h"
29

    
30

    
31
/* BGP_RR_* subtype codes (NOTE(review): presumably route refresh message subtypes -- confirm) */
#define BGP_RR_REQUEST		0
#define BGP_RR_BEGIN		1
#define BGP_RR_END		2

/* Upper bound used for a single encoded NLRI (4 + 1 + 32 bytes) */
#define BGP_NLRI_MAX		(4 + 1 + 32)

/* MPLS label handling */
#define BGP_MPLS_BOS		1	/* Bottom-of-stack bit */
#define BGP_MPLS_MAX		10	/* Max number of labels n such that 24*n <= 255 */
#define BGP_MPLS_NULL		3	/* Implicit NULL label */
#define BGP_MPLS_MAGIC		0x800000 /* Magic withdraw label value, RFC 3107 3 */
41

    
42

    
43
static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
44
static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
45

    
46
/* Table for state -> RFC 6608 FSM error subcodes */
47
static byte fsm_err_subcode[BS_MAX] = {
48
  [BS_OPENSENT] = 1,
49
  [BS_OPENCONFIRM] = 2,
50
  [BS_ESTABLISHED] = 3
51
};
52

    
53

    
54
static struct bgp_channel *
55
bgp_get_channel(struct bgp_proto *p, u32 afi)
56
{
57
  uint i;
58

    
59
  for (i = 0; i < p->channel_count; i++)
60
    if (p->afi_map[i] == afi)
61
      return p->channel_map[i];
62

    
63
  return NULL;
64
}
65

    
66
static inline void
67
put_af3(byte *buf, u32 id)
68
{
69
  put_u16(buf, id >> 16);
70
  buf[2] = id & 0xff;
71
}
72

    
73
static inline void
74
put_af4(byte *buf, u32 id)
75
{
76
  put_u16(buf, id >> 16);
77
  buf[2] = 0;
78
  buf[3] = id & 0xff;
79
}
80

    
81
static inline u32
82
get_af3(byte *buf)
83
{
84
  return (get_u16(buf) << 16) | buf[2];
85
}
86

    
87
static inline u32
88
get_af4(byte *buf)
89
{
90
  return (get_u16(buf) << 16) | buf[3];
91
}
92

    
93
/*
 * MRT Dump format is not semantically specified.
 * We will use these values in appropriate fields:
 *
 * Local AS, Remote AS - configured AS numbers for given BGP instance.
 * Local IP, Remote IP - IP addresses of the TCP connection (0 if no connection)
 *
 * We dump two kinds of MRT messages: STATE_CHANGE (for BGP state
 * changes) and MESSAGE (for received BGP messages).
 *
 * STATE_CHANGE always uses the AS4 variant, but MESSAGE uses the AS4 variant
 * only when an AS4 session is established, and even in that case MESSAGE
 * does not use the AS4 variant for the initial OPEN message. This strange
 * behavior is here for compatibility with Quagga and Bgpdump.
 */
108

    
109
/*
 * Write the per-peer part of a BGP4MP record into @buf and return the
 * position just behind it.
 *
 * Layout written: remote AS, public (local) AS -- 4 bytes each when @as4 is
 * set, otherwise 2 bytes each with AS_TRANS substituted for values above
 * 0xFFFF -- then the neighbor interface index (0 if unknown), the address
 * family, and the socket's daddr followed by saddr (all-zero addresses when
 * the connection has no socket). The address family is chosen by the
 * configured remote IP.
 */
static byte *
mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4)
{
  struct bgp_proto *p = conn->bgp;
  uint v4 = ipa_is_ip4(p->cf->remote_ip);
  uint ifindex = (p->neigh && p->neigh->iface) ? p->neigh->iface->index : 0;

  if (!as4)
  {
    put_u16(buf+0, (p->remote_as <= 0xFFFF) ? p->remote_as : AS_TRANS);
    put_u16(buf+2, (p->public_as <= 0xFFFF) ? p->public_as : AS_TRANS);
    buf += 4;
  }
  else
  {
    put_u32(buf+0, p->remote_as);
    put_u32(buf+4, p->public_as);
    buf += 8;
  }

  put_u16(buf+0, ifindex);
  put_u16(buf+2, v4 ? BGP_AFI_IPV4 : BGP_AFI_IPV6);
  buf += 4;

  if (!v4)
  {
    buf = put_ip6(buf, conn->sk ? ipa_to_ip6(conn->sk->daddr) : IP6_NONE);
    buf = put_ip6(buf, conn->sk ? ipa_to_ip6(conn->sk->saddr) : IP6_NONE);
    return buf;
  }

  buf = put_ip4(buf, conn->sk ? ipa_to_ip4(conn->sk->daddr) : IP4_NONE);
  buf = put_ip4(buf, conn->sk ? ipa_to_ip4(conn->sk->saddr) : IP4_NONE);
  return buf;
}
145

    
146
static void
147
mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, uint len)
148
{
149
  byte *buf = alloca(128+len);        /* 128 is enough for MRT headers */
150
  byte *bp = buf + MRTDUMP_HDR_LENGTH;
151
  int as4 = conn->bgp->as4_session;
152

    
153
  bp = mrt_put_bgp4_hdr(bp, conn, as4);
154
  memcpy(bp, pkt, len);
155
  bp += len;
156
  mrt_dump_message(&conn->bgp->p, BGP4MP, as4 ? BGP4MP_MESSAGE_AS4 : BGP4MP_MESSAGE,
157
                   buf, bp-buf);
158
}
159

    
160
static inline u16
161
convert_state(uint state)
162
{
163
  /* Convert state from our BS_* values to values used in MRTDump */
164
  return (state == BS_CLOSE) ? 1 : state + 1;
165
}
166

    
167
void
168
mrt_dump_bgp_state_change(struct bgp_conn *conn, uint old, uint new)
169
{
170
  byte buf[128];
171
  byte *bp = buf + MRTDUMP_HDR_LENGTH;
172

    
173
  bp = mrt_put_bgp4_hdr(bp, conn, 1);
174
  put_u16(bp+0, convert_state(old));
175
  put_u16(bp+2, convert_state(new));
176
  bp += 4;
177
  mrt_dump_message(&conn->bgp->p, BGP4MP, BGP4MP_STATE_CHANGE_AS4, buf, bp-buf);
178
}
179

    
180
/*
 * Fill @buf with the body of a NOTIFICATION message: the error code and
 * subcode stored in @conn, followed by conn->notify_size bytes copied from
 * conn->notify_data. Returns the position just behind the written data.
 */
static byte *
bgp_create_notification(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  byte *data = buf + 2;

  BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);

  buf[0] = conn->notify_code;
  buf[1] = conn->notify_subcode;
  memcpy(data, conn->notify_data, conn->notify_size);

  return data + conn->notify_size;
}
191

    
192

    
193
/* Capability negotiation as per RFC 5492 */
194

    
195
const struct bgp_af_caps *
196
bgp_find_af_caps(struct bgp_caps *caps, u32 afi)
197
{
198
  struct bgp_af_caps *ac;
199

    
200
  WALK_AF_CAPS(caps, ac)
201
    if (ac->afi == afi)
202
      return ac;
203

    
204
  return NULL;
205
}
206

    
207
static struct bgp_af_caps *
208
bgp_get_af_caps(struct bgp_caps *caps, u32 afi)
209
{
210
  struct bgp_af_caps *ac;
211

    
212
  WALK_AF_CAPS(caps, ac)
213
    if (ac->afi == afi)
214
      return ac;
215

    
216
  ac = &caps->af_data[caps->af_count++];
217
  memset(ac, 0, sizeof(struct bgp_af_caps));
218
  ac->afi = afi;
219

    
220
  return ac;
221
}
222

    
223
static int
224
bgp_af_caps_cmp(const void *X, const void *Y)
225
{
226
  const struct bgp_af_caps *x = X, *y = Y;
227
  return (x->afi < y->afi) ? -1 : (x->afi > y->afi) ? 1 : 0;
228
}
229

    
230

    
231
/*
 * Build the local capability set of @conn and serialize it to @buf.
 *
 * A struct bgp_caps with one per-AF record for every channel of the
 * protocol is allocated from the protocol pool, filled from the
 * configuration and stored in conn->local_caps. The per-AF records are
 * sorted by AFI/SAFI and the capabilities are then written to @buf as
 * (code, length, data) triples.
 *
 * Returns the position just behind the last written capability.
 */
static byte *
bgp_write_capabilities(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  struct bgp_channel *c;
  struct bgp_af_caps *ac;
  uint any_ext_next_hop = 0;
  uint any_add_path = 0;
  byte *data;

  /* Prepare bgp_caps structure */

  int n = list_length(&p->p.channels);
  struct bgp_caps *caps =
    mb_allocz(p->p.pool, sizeof(struct bgp_caps) + n * sizeof(struct bgp_af_caps));
  conn->local_caps = caps;

  caps->as4_support = p->cf->enable_as4;
  if (caps->as4_support)
    caps->as4_number = p->public_as;

  caps->ext_messages = p->cf->enable_extended_messages;

  /* Both refresh flavours follow the same configuration switch */
  caps->route_refresh = p->cf->enable_refresh;
  caps->enhanced_refresh = p->cf->enable_refresh;

  if (p->cf->gr_mode)
  {
    caps->gr_aware = 1;
    caps->gr_time = p->cf->gr_time;
    caps->gr_flags = p->p.gr_recovery ? BGP_GRF_RESTART : 0;
  }

  /* Allocate and fill per-AF fields */
  WALK_LIST(c, p->p.channels)
  {
    ac = caps->af_data + caps->af_count;
    caps->af_count++;

    ac->afi = c->afi;
    ac->ready = 1;

    ac->ext_next_hop = bgp_channel_is_ipv4(c) && c->cf->ext_next_hop;
    any_ext_next_hop |= ac->ext_next_hop;

    ac->add_path = c->cf->add_path;
    any_add_path |= ac->add_path;

    if (c->cf->gr_able)
    {
      ac->gr_able = 1;

      if (p->p.gr_recovery)
	ac->gr_af_flags |= BGP_GRF_FORWARDING;
    }
  }

  /* Sort capability fields by AFI/SAFI */
  qsort(caps->af_data, caps->af_count, sizeof(struct bgp_af_caps), bgp_af_caps_cmp);


  /* Create capability list in buffer */

  /*
   * Note that max length is ~ 20+14*af_count. With max 10 channels that is
   * 160. Option limit is 253 and buffer size is 4096, so we cannot overflow
   * unless we add new capabilities or more AFs.
   */

  /* Capability 1: Multiprotocol extensions, one instance per AF */
  WALK_AF_CAPS(caps, ac)
  {
    if (ac->ready)
    {
      buf[0] = 1;
      buf[1] = 4;		/* Capability data length */
      put_af4(buf + 2, ac->afi);
      buf += 6;
    }
  }

  /* Capability 2: Support for route refresh */
  if (caps->route_refresh)
  {
    buf[0] = 2;
    buf[1] = 0;			/* Capability data length */
    buf += 2;
  }

  /* Capability 5: Support for extended next hop */
  if (any_ext_next_hop)
  {
    buf[0] = 5;
    buf[1] = 0;			/* Capability data length, will be fixed later */
    buf += 2;
    data = buf;

    WALK_AF_CAPS(caps, ac)
    {
      if (ac->ext_next_hop)
      {
	put_af4(buf, ac->afi);
	put_u16(buf+4, BGP_AFI_IPV6);
	buf += 6;
      }
    }

    /* Patch the length byte just in front of the data */
    data[-1] = buf - data;
  }

  /* Capability 6: Support for extended messages */
  if (caps->ext_messages)
  {
    buf[0] = 6;
    buf[1] = 0;			/* Capability data length */
    buf += 2;
  }

  /* Capability 64: Support for graceful restart */
  if (caps->gr_aware)
  {
    buf[0] = 64;
    buf[1] = 0;			/* Capability data length, will be fixed later */
    buf += 2;
    data = buf;

    /* Restart time, with the restart flags OR-ed into the first byte */
    put_u16(buf, caps->gr_time);
    buf[0] |= caps->gr_flags;
    buf += 2;

    WALK_AF_CAPS(caps, ac)
    {
      if (ac->gr_able)
      {
	put_af3(buf, ac->afi);
	buf[3] = ac->gr_af_flags;
	buf += 4;
      }
    }

    data[-1] = buf - data;
  }

  /* Capability 65: Support for 4-octet AS number */
  if (caps->as4_support)
  {
    buf[0] = 65;
    buf[1] = 4;			/* Capability data length */
    put_u32(buf + 2, p->public_as);
    buf += 6;
  }

  /* Capability 69: Support for ADD-PATH */
  if (any_add_path)
  {
    buf[0] = 69;
    buf[1] = 0;			/* Capability data length, will be fixed later */
    buf += 2;
    data = buf;

    WALK_AF_CAPS(caps, ac)
    {
      if (ac->add_path)
      {
	put_af3(buf, ac->afi);
	buf[3] = ac->add_path;
	buf += 4;
      }
    }

    data[-1] = buf - data;
  }

  /* Capability 70: Support for enhanced route refresh */
  if (caps->enhanced_refresh)
  {
    buf[0] = 70;
    buf[1] = 0;			/* Capability data length */
    buf += 2;
  }

  return buf;
}
389

    
390
static void
391
bgp_read_capabilities(struct bgp_conn *conn, struct bgp_caps *caps, byte *pos, int len)
392
{
393
  struct bgp_proto *p = conn->bgp;
394
  struct bgp_af_caps *ac;
395
  int i, cl;
396
  u32 af;
397

    
398
  while (len > 0)
399
  {
400
    if (len < 2 || len < (2 + pos[1]))
401
      goto err;
402

    
403
    /* Capability length */
404
    cl = pos[1];
405

    
406
    /* Capability type */
407
    switch (pos[0])
408
    {
409
    case  1: /* Multiprotocol capability, RFC 4760 */
410
      if (cl != 4)
411
        goto err;
412

    
413
      af = get_af4(pos+2);
414
      ac = bgp_get_af_caps(caps, af);
415
      ac->ready = 1;
416
      break;
417

    
418
    case  2: /* Route refresh capability, RFC 2918 */
419
      if (cl != 0)
420
        goto err;
421

    
422
      caps->route_refresh = 1;
423
      break;
424

    
425
    case  5: /* Extended next hop encoding capability, RFC 5549 */
426
      if (cl % 6)
427
        goto err;
428

    
429
      for (i = 0; i < cl; i += 6)
430
      {
431
        /* Specified only for IPv4 prefixes with IPv6 next hops */
432
        if ((get_u16(pos+2+i+0) != BGP_AFI_IPV4) ||
433
            (get_u16(pos+2+i+4) != BGP_AFI_IPV6))
434
          continue;
435

    
436
        af = get_af4(pos+2+i);
437
        ac = bgp_get_af_caps(caps, af);
438
        ac->ext_next_hop = 1;
439
      }
440
      break;
441

    
442
    case  6: /* Extended message length capability, RFC draft */
443
      if (cl != 0)
444
        goto err;
445

    
446
      caps->ext_messages = 1;
447
      break;
448

    
449
    case 64: /* Graceful restart capability, RFC 4724 */
450
      if (cl % 4 != 2)
451
        goto err;
452

    
453
      /* Only the last instance is valid */
454
      WALK_AF_CAPS(caps, ac)
455
      {
456
        ac->gr_able = 0;
457
        ac->gr_af_flags = 0;
458
      }
459

    
460
      caps->gr_aware = 1;
461
      caps->gr_flags = pos[2] & 0xf0;
462
      caps->gr_time = get_u16(pos + 2) & 0x0fff;
463

    
464
      for (i = 2; i < cl; i += 4)
465
      {
466
        af = get_af3(pos+2+i);
467
        ac = bgp_get_af_caps(caps, af);
468
        ac->gr_able = 1;
469
        ac->gr_af_flags = pos[2+i+3];
470
      }
471
      break;
472

    
473
    case 65: /* AS4 capability, RFC 6793 */
474
      if (cl != 4)
475
        goto err;
476

    
477
      caps->as4_support = 1;
478
      caps->as4_number = get_u32(pos + 2);
479
      break;
480

    
481
    case 69: /* ADD-PATH capability, RFC 7911 */
482
      if (cl % 4)
483
        goto err;
484

    
485
      for (i = 0; i < cl; i += 4)
486
      {
487
        byte val = pos[2+i+3];
488
        if (!val || (val > BGP_ADD_PATH_FULL))
489
        {
490
          log(L_WARN "%s: Got ADD-PATH capability with unknown value %u, ignoring",
491
              p->p.name, val);
492
          break;
493
        }
494
      }
495

    
496
      for (i = 0; i < cl; i += 4)
497
      {
498
        af = get_af3(pos+2+i);
499
        ac = bgp_get_af_caps(caps, af);
500
        ac->add_path = pos[2+i+3];
501
      }
502
      break;
503

    
504
    case 70: /* Enhanced route refresh capability, RFC 7313 */
505
      if (cl != 0)
506
        goto err;
507

    
508
      caps->enhanced_refresh = 1;
509
      break;
510

    
511
      /* We can safely ignore all other capabilities */
512
    }
513

    
514
    ADVANCE(pos, len, 2 + cl);
515
  }
516
  return;
517

    
518
err:
519
  bgp_error(conn, 2, 0, NULL, 0);
520
  return;
521
}
522

    
523
static int
524
bgp_read_options(struct bgp_conn *conn, byte *pos, int len)
525
{
526
  struct bgp_proto *p = conn->bgp;
527
  struct bgp_caps *caps;
528
  int ol;
529

    
530
  /* Max number of announced AFIs is limited by max option length (255) */
531
  caps = alloca(sizeof(struct bgp_caps) + 64 * sizeof(struct bgp_af_caps));
532
  memset(caps, 0, sizeof(struct bgp_caps));
533

    
534
  while (len > 0)
535
  {
536
    if ((len < 2) || (len < (2 + pos[1])))
537
    { bgp_error(conn, 2, 0, NULL, 0); return -1; }
538

    
539
    ol = pos[1];
540
    if (pos[0] == 2)
541
    {
542
      /* BGP capabilities, RFC 5492 */
543
      if (p->cf->capabilities)
544
        bgp_read_capabilities(conn, caps, pos + 2, ol);
545
    }
546
    else
547
    {
548
      /* Unknown option */
549
      bgp_error(conn, 2, 4, pos, ol); /* FIXME: ol or ol+2 ? */
550
      return -1;
551
    }
552

    
553
    ADVANCE(pos, len, 2 + ol);
554
  }
555

    
556
  uint n = sizeof(struct bgp_caps) + caps->af_count * sizeof(struct bgp_af_caps);
557
  conn->remote_caps = mb_allocz(p->p.pool, n);
558
  memcpy(conn->remote_caps, caps, n);
559

    
560
  return 0;
561
}
562

    
563
/*
 * Fill @buf with the body of an OPEN message and return the position just
 * behind it.
 *
 * Fixed part: version, 2-byte AS (AS_TRANS when the public AS does not
 * pass the 16-bit check), hold time and local router ID. When capabilities
 * are enabled, a single option 2 (capability list) produced by
 * bgp_write_capabilities() follows; otherwise an empty local_caps
 * structure is allocated and no optional parameters are sent.
 *
 * NOTE(review): the AS check here is '< 0xFFFF' while mrt_put_bgp4_hdr()
 * uses '<= 0xFFFF'; the two differ only for AS 65535 -- confirm which one
 * is intended.
 */
static byte *
bgp_create_open(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
	    BGP_VERSION, p->public_as, p->cf->hold_time, p->local_id);

  buf[0] = BGP_VERSION;
  put_u16(buf+1, (p->public_as < 0xFFFF) ? p->public_as : AS_TRANS);
  put_u16(buf+3, p->cf->hold_time);
  put_u32(buf+5, p->local_id);

  if (!p->cf->capabilities)
  {
    /* Prepare empty local_caps */
    conn->local_caps = mb_allocz(p->p.pool, sizeof(struct bgp_caps));

    buf[9] = 0;			/* No optional parameters */
    return buf + 10;
  }

  /* Prepare local_caps and write capabilities to buffer */
  byte *caps_start = buf + 12;
  byte *caps_end = bgp_write_capabilities(conn, caps_start);
  uint caps_len = caps_end - caps_start;

  buf[9] = caps_len + 2;	/* Optional parameters length */
  buf[10] = 2;			/* Option 2: Capability list */
  buf[11] = caps_len;		/* Option data length */

  return caps_end;
}
599

    
600
static void
601
bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
602
{
603
  struct bgp_proto *p = conn->bgp;
604
  struct bgp_conn *other;
605
  u32 asn, hold, id;
606

    
607
  /* Check state */
608
  if (conn->state != BS_OPENSENT)
609
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
610

    
611
  /* Check message contents */
612
  if (len < 29 || len != 29 + (uint) pkt[28])
613
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
614

    
615
  if (pkt[19] != BGP_VERSION)
616
  { u16 val = BGP_VERSION; bgp_error(conn, 2, 1, (byte *) &val, 2); return; }
617

    
618
  asn = get_u16(pkt+20);
619
  hold = get_u16(pkt+22);
620
  id = get_u32(pkt+24);
621
  BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%R)", asn, hold, id);
622

    
623
  if (bgp_read_options(conn, pkt+29, pkt[28]) < 0)
624
    return;
625

    
626
  if (hold > 0 && hold < 3)
627
  { bgp_error(conn, 2, 6, pkt+22, 2); return; }
628

    
629
  /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
630
  if (!id || (p->is_internal && id == p->local_id))
631
  { bgp_error(conn, 2, 3, pkt+24, -4); return; }
632

    
633
  struct bgp_caps *caps = conn->remote_caps;
634

    
635
  if (caps->as4_support)
636
  {
637
    u32 as4 = caps->as4_number;
638

    
639
    if ((as4 != asn) && (asn != AS_TRANS))
640
      log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
641

    
642
    if (as4 != p->remote_as)
643
    { as4 = htonl(as4); bgp_error(conn, 2, 2, (byte *) &as4, 4); return; }
644
  }
645
  else
646
  {
647
    if (asn != p->remote_as)
648
    { bgp_error(conn, 2, 2, pkt+20, 2); return; }
649
  }
650

    
651
  /* Check the other connection */
652
  other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
653
  switch (other->state)
654
  {
655
  case BS_CONNECT:
656
  case BS_ACTIVE:
657
    /* Stop outgoing connection attempts */
658
    bgp_conn_enter_idle_state(other);
659
    break;
660

    
661
  case BS_IDLE:
662
  case BS_OPENSENT:
663
  case BS_CLOSE:
664
    break;
665

    
666
  case BS_OPENCONFIRM:
667
    /*
668
     * Description of collision detection rules in RFC 4271 is confusing and
669
     * contradictory, but it is essentially:
670
     *
671
     * 1. Router with higher ID is dominant
672
     * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
673
     * 3. When both connections are in OpenConfirm state, one initiated by
674
     *    the dominant router is kept.
675
     *
676
     * The first line in the expression below evaluates whether the neighbor
677
     * is dominant, the second line whether the new connection was initiated
678
     * by the neighbor. If both are true (or both are false), we keep the new
679
     * connection, otherwise we keep the old one.
680
     */
681
    if (((p->local_id < id) || ((p->local_id == id) && (p->public_as < p->remote_as)))
682
        == (conn == &p->incoming_conn))
683
    {
684
      /* Should close the other connection */
685
      BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
686
      bgp_error(other, 6, 7, NULL, 0);
687
      break;
688
    }
689
    /* Fall thru */
690
  case BS_ESTABLISHED:
691
    /* Should close this connection */
692
    BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
693
    bgp_error(conn, 6, 7, NULL, 0);
694
    return;
695

    
696
  default:
697
    bug("bgp_rx_open: Unknown state");
698
  }
699

    
700
  /* Update our local variables */
701
  conn->hold_time = MIN(hold, p->cf->hold_time);
702
  conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
703
  conn->as4_session = conn->local_caps->as4_support && caps->as4_support;
704
  conn->ext_messages = conn->local_caps->ext_messages && caps->ext_messages;
705
  p->remote_id = id;
706

    
707
  DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n",
708
      conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, conn->as4_session);
709

    
710
  bgp_schedule_packet(conn, NULL, PKT_KEEPALIVE);
711
  bgp_start_timer(conn->hold_timer, conn->hold_time);
712
  bgp_conn_enter_openconfirm_state(conn);
713
}
714

    
715

    
716
/*
717
 *        Next hop handling
718
 */
719

    
720
/*
 * Error reporting helpers for the parse/export code below. All of them
 * expect a variable named 's' in scope with s->proto set; DISCARD() and
 * WITHDRAW() also return from the (void) calling function.
 */

/* Log a message prefixed with the protocol name */
#define REPORT(msg, args...) \
  ({ log(L_REMOTE "%s: " msg, s->proto->p.name, ## args); })

/* Log a message and return from the caller */
#define DISCARD(msg, args...) \
  ({ REPORT(msg, ## args); return; })

/* Log a message, set s->err_withdraw and return from the caller */
#define WITHDRAW(msg, args...) \
  ({ REPORT(msg, ## args); s->err_withdraw = 1; return; })

/* Message texts used with the macros above */
#define BAD_AFI		"Unexpected AF <%u/%u> in UPDATE"
#define BAD_NEXT_HOP	"Invalid NEXT_HOP attribute"
#define NO_NEXT_HOP	"Missing NEXT_HOP attribute"
#define NO_LABEL_STACK	"Missing MPLS stack"
733

    
734

    
735
static void
736
bgp_apply_next_hop(struct bgp_parse_state *s, rta *a, ip_addr gw, ip_addr ll)
737
{
738
  struct bgp_proto *p = s->proto;
739
  struct bgp_channel *c = s->channel;
740

    
741
  if (c->cf->gw_mode == GW_DIRECT)
742
  {
743
    neighbor *nbr = NULL;
744

    
745
    /* GW_DIRECT -> single_hop -> p->neigh != NULL */
746
    if (ipa_nonzero(gw))
747
      nbr = neigh_find2(&p->p, &gw, NULL, 0);
748
    else if (ipa_nonzero(ll))
749
      nbr = neigh_find2(&p->p, &ll, p->neigh->iface, 0);
750

    
751
    if (!nbr || (nbr->scope == SCOPE_HOST))
752
      WITHDRAW(BAD_NEXT_HOP);
753

    
754
    a->dest = RTD_UNICAST;
755
    a->nh.gw = nbr->addr;
756
    a->nh.iface = nbr->iface;
757
  }
758
  else /* GW_RECURSIVE */
759
  {
760
    if (ipa_zero(gw))
761
      WITHDRAW(BAD_NEXT_HOP);
762

    
763
    rtable *tab = ipa_is_ip4(gw) ? c->igp_table_ip4 : c->igp_table_ip6;
764
    s->hostentry = rt_get_hostentry(tab, gw, ll, c->c.table);
765

    
766
    if (!s->mpls)
767
      rta_apply_hostentry(a, s->hostentry, NULL);
768

    
769
    /* With MPLS, hostentry is applied later in bgp_apply_mpls_labels() */
770
  }
771
}
772

    
773
static void
774
bgp_apply_mpls_labels(struct bgp_parse_state *s, rta *a, u32 *labels, uint lnum)
775
{
776
  if (lnum > MPLS_MAX_LABEL_STACK)
777
  {
778
    REPORT("Too many MPLS labels ($u)", lnum);
779

    
780
    a->dest = RTD_UNREACHABLE;
781
    a->hostentry = NULL;
782
    a->nh = (struct nexthop) { };
783
    return;
784
  }
785

    
786
  /* Handle implicit NULL as empty MPLS stack */
787
  if ((lnum == 1) && (labels[0] == BGP_MPLS_NULL))
788
    lnum = 0;
789

    
790
  if (s->channel->cf->gw_mode == GW_DIRECT)
791
  {
792
    a->nh.labels = lnum;
793
    memcpy(a->nh.label, labels, 4*lnum);
794
  }
795
  else /* GW_RECURSIVE */
796
  {
797
    mpls_label_stack ms;
798

    
799
    ms.len = lnum;
800
    memcpy(ms.stack, labels, 4*lnum);
801
    rta_apply_hostentry(a, s->hostentry, &ms);
802
  }
803
}
804

    
805

    
806
static inline int
807
bgp_use_next_hop(struct bgp_export_state *s, eattr *a)
808
{
809
  struct bgp_proto *p = s->proto;
810
  ip_addr *nh = (void *) a->u.ptr->data;
811

    
812
  if (s->channel->cf->next_hop_self)
813
    return 0;
814

    
815
  if (s->channel->cf->next_hop_keep)
816
    return 1;
817

    
818
  /* Keep it when explicitly set in export filter */
819
  if (a->type & EAF_FRESH)
820
    return 1;
821

    
822
  /* Keep it when exported to internal peers */
823
  if (p->is_interior && ipa_nonzero(*nh))
824
    return 1;
825

    
826
  /* Keep it when forwarded between single-hop BGPs on the same iface */
827
  struct iface *ifa = (s->src && s->src->neigh) ? s->src->neigh->iface : NULL;
828
  return p->neigh && (p->neigh->iface == ifa);
829
}
830

    
831
static inline int
832
bgp_use_gateway(struct bgp_export_state *s)
833
{
834
  struct bgp_proto *p = s->proto;
835
  rta *ra = s->route->attrs;
836

    
837
  if (s->channel->cf->next_hop_self)
838
    return 0;
839

    
840
  /* We need one valid global gateway */
841
  if ((ra->dest != RTD_UNICAST) || ra->nh.next || ipa_zero(ra->nh.gw) || ipa_is_link_local(ra->nh.gw))
842
    return 0;
843

    
844
  /* Use it when exported to internal peers */
845
  if (p->is_interior)
846
    return 1;
847

    
848
  /* Use it when forwarded to single-hop BGP peer on on the same iface */
849
  return p->neigh && (p->neigh->iface == ra->nh.iface);
850
}
851

    
852
static void
853
bgp_update_next_hop_ip(struct bgp_export_state *s, eattr *a, ea_list **to)
854
{
855
  if (!a || !bgp_use_next_hop(s, a))
856
  {
857
    if (bgp_use_gateway(s))
858
    {
859
      rta *ra = s->route->attrs;
860
      ip_addr nh[1] = { ra->nh.gw };
861
      bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, 16);
862

    
863
      if (s->mpls)
864
      {
865
        u32 implicit_null = BGP_MPLS_NULL;
866
        u32 *labels = ra->nh.labels ? ra->nh.label : &implicit_null;
867
        uint lnum = ra->nh.labels ? ra->nh.labels : 1;
868
        bgp_set_attr_data(to, s->pool, BA_MPLS_LABEL_STACK, 0, labels, lnum * 4);
869
      }
870
    }
871
    else
872
    {
873
      ip_addr nh[2] = { s->channel->next_hop_addr, s->channel->link_addr };
874
      bgp_set_attr_data(to, s->pool, BA_NEXT_HOP, 0, nh, ipa_nonzero(nh[1]) ? 32 : 16);
875

    
876
      /* TODO: Use local MPLS assigned label */
877
      if (s->mpls)
878
        bgp_unset_attr(to, s->pool, BA_MPLS_LABEL_STACK);
879
    }
880
  }
881

    
882
  /* Check if next hop is valid */
883
  a = bgp_find_attr(*to, BA_NEXT_HOP);
884
  if (!a)
885
    WITHDRAW(NO_NEXT_HOP);
886

    
887
  ip_addr *nh = (void *) a->u.ptr->data;
888
  ip_addr peer = s->proto->cf->remote_ip;
889
  uint len = a->u.ptr->length;
890

    
891
  /* Forbid zero next hop */
892
  if (ipa_zero(nh[0]) && ((len != 32) || ipa_zero(nh[1])))
893
    WITHDRAW(BAD_NEXT_HOP);
894

    
895
  /* Forbid next hop equal to neighbor IP */
896
  if (ipa_equal(peer, nh[0]) || ((len == 32) && ipa_equal(peer, nh[1])))
897
    WITHDRAW(BAD_NEXT_HOP);
898

    
899
  /* Forbid next hop with non-matching AF */
900
  if ((ipa_is_ip4(nh[0]) != bgp_channel_is_ipv4(s->channel)) &&
901
      !s->channel->ext_next_hop)
902
    WITHDRAW(BAD_NEXT_HOP);
903

    
904
  /* Just check if MPLS stack */
905
  if (s->mpls && !bgp_find_attr(*to, BA_MPLS_LABEL_STACK))
906
    WITHDRAW(NO_LABEL_STACK);
907
}
908

    
909
static uint
910
bgp_encode_next_hop_ip(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
911
{
912
  /* This function is used only for MP-BGP, see bgp_encode_next_hop() for IPv4 BGP */
913
  ip_addr *nh = (void *) a->u.ptr->data;
914
  uint len = a->u.ptr->length;
915

    
916
  ASSERT((len == 16) || (len == 32));
917

    
918
  /*
919
   * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
920
   * is specified in RFC 5549 for IPv4 and in RFC 4798 for IPv6. The difference
921
   * is that IPv4 address is directly encoded with IPv4 NLRI, but as IPv4-mapped
922
   * IPv6 address with IPv6 NLRI.
923
   */
924

    
925
  if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
926
  {
927
    put_ip4(buf, ipa_to_ip4(nh[0]));
928
    return 4;
929
  }
930

    
931
  put_ip6(buf, ipa_to_ip6(nh[0]));
932

    
933
  if (len == 32)
934
    put_ip6(buf+16, ipa_to_ip6(nh[1]));
935

    
936
  return len;
937
}
938

    
939
static void
940
bgp_decode_next_hop_ip(struct bgp_parse_state *s, byte *data, uint len, rta *a)
941
{
942
  struct bgp_channel *c = s->channel;
943
  struct adata *ad = lp_alloc_adata(s->pool, 32);
944
  ip_addr *nh = (void *) ad->data;
945

    
946
  if (len == 4)
947
  {
948
    nh[0] = ipa_from_ip4(get_ip4(data));
949
    nh[1] = IPA_NONE;
950
  }
951
  else if (len == 16)
952
  {
953
    nh[0] = ipa_from_ip6(get_ip6(data));
954
    nh[1] = IPA_NONE;
955

    
956
    if (ipa_is_link_local(nh[0]))
957
    { nh[1] = nh[0]; nh[0] = IPA_NONE; }
958
  }
959
  else if (len == 32)
960
  {
961
    nh[0] = ipa_from_ip6(get_ip6(data));
962
    nh[1] = ipa_from_ip6(get_ip6(data+16));
963

    
964
    if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
965
      nh[1] = IPA_NONE;
966
  }
967
  else
968
    bgp_parse_error(s, 9);
969

    
970
  if (ipa_zero(nh[1]))
971
    ad->length = 16;
972

    
973
  if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
974
    WITHDRAW(BAD_NEXT_HOP);
975

    
976
  // XXXX validate next hop
977

    
978
  bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
979
  bgp_apply_next_hop(s, a, nh[0], nh[1]);
980
}
981

    
982
static uint
983
bgp_encode_next_hop_vpn(struct bgp_write_state *s, eattr *a, byte *buf, uint size UNUSED)
984
{
985
  ip_addr *nh = (void *) a->u.ptr->data;
986
  uint len = a->u.ptr->length;
987

    
988
  ASSERT((len == 16) || (len == 32));
989

    
990
  /*
991
   * Both IPv4 and IPv6 next hops can be used (with ext_next_hop enabled). This
992
   * is specified in RFC 5549 for VPNv4 and in RFC 4659 for VPNv6. The difference
993
   * is that IPv4 address is directly encoded with VPNv4 NLRI, but as IPv4-mapped
994
   * IPv6 address with VPNv6 NLRI.
995
   */
996

    
997
  if (bgp_channel_is_ipv4(s->channel) && ipa_is_ip4(nh[0]))
998
  {
999
    put_u64(buf, 0); /* VPN RD is 0 */
1000
    put_ip4(buf+8, ipa_to_ip4(nh[0]));
1001
    return 12;
1002
  }
1003

    
1004
  put_u64(buf, 0); /* VPN RD is 0 */
1005
  put_ip6(buf+8, ipa_to_ip6(nh[0]));
1006

    
1007
  if (len == 16)
1008
    return 24;
1009

    
1010
  put_u64(buf+24, 0); /* VPN RD is 0 */
1011
  put_ip6(buf+32, ipa_to_ip6(nh[1]));
1012

    
1013
  return 48;
1014
}
1015

    
1016
static void
1017
bgp_decode_next_hop_vpn(struct bgp_parse_state *s, byte *data, uint len, rta *a)
1018
{
1019
  struct bgp_channel *c = s->channel;
1020
  struct adata *ad = lp_alloc_adata(s->pool, 32);
1021
  ip_addr *nh = (void *) ad->data;
1022

    
1023
  if (len == 12)
1024
  {
1025
    nh[0] = ipa_from_ip4(get_ip4(data+8));
1026
    nh[1] = IPA_NONE;
1027
  }
1028
  else if (len == 24)
1029
  {
1030
    nh[0] = ipa_from_ip6(get_ip6(data+8));
1031
    nh[1] = IPA_NONE;
1032

    
1033
    if (ipa_is_link_local(nh[0]))
1034
    { nh[1] = nh[0]; nh[0] = IPA_NONE; }
1035
  }
1036
  else if (len == 48)
1037
  {
1038
    nh[0] = ipa_from_ip6(get_ip6(data+8));
1039
    nh[1] = ipa_from_ip6(get_ip6(data+32));
1040

    
1041
    if (ipa_is_ip4(nh[0]) || !ip6_is_link_local(nh[1]))
1042
      nh[1] = IPA_NONE;
1043
  }
1044
  else
1045
    bgp_parse_error(s, 9);
1046

    
1047
  if (ipa_zero(nh[1]))
1048
    ad->length = 16;
1049

    
1050
  /* XXXX which error */
1051
  if ((get_u64(data) != 0) || ((len == 48) && (get_u64(data+24) != 0)))
1052
    bgp_parse_error(s, 9);
1053

    
1054
  if ((bgp_channel_is_ipv4(c) != ipa_is_ip4(nh[0])) && !c->ext_next_hop)
1055
    WITHDRAW(BAD_NEXT_HOP);
1056

    
1057
  // XXXX validate next hop
1058

    
1059
  bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_NEXT_HOP, 0, ad);
1060
  bgp_apply_next_hop(s, a, nh[0], nh[1]);
1061
}
1062

    
1063

    
1064

    
1065
static uint
1066
bgp_encode_next_hop_none(struct bgp_write_state *s UNUSED, eattr *a UNUSED, byte *buf UNUSED, uint size UNUSED)
1067
{
1068
  return 0;
1069
}
1070

    
1071
static void
1072
bgp_decode_next_hop_none(struct bgp_parse_state *s UNUSED, byte *data UNUSED, uint len UNUSED, rta *a UNUSED)
1073
{
1074
  /*
1075
   * Although we expect no next hop and RFC 7606 7.11 states that attribute
1076
   * MP_REACH_NLRI with unexpected next hop length is considered malformed,
1077
   * FlowSpec RFC 5575 4 states that next hop shall be ignored on receipt.
1078
   */
1079

    
1080
  return;
1081
}
1082

    
1083
static void
1084
bgp_update_next_hop_none(struct bgp_export_state *s, eattr *a, ea_list **to)
1085
{
1086
  /* NEXT_HOP shall not pass */
1087
  if (a)
1088
    bgp_unset_attr(to, s->pool, BA_NEXT_HOP);
1089
}
1090

    
1091

    
1092
/*
1093
 *        UPDATE
1094
 */
1095

    
1096
static void
1097
bgp_rte_update(struct bgp_parse_state *s, net_addr *n, u32 path_id, rta *a0)
1098
{
1099
  if (path_id != s->last_id)
1100
  {
1101
    s->last_src = rt_get_source(&s->proto->p, path_id);
1102
    s->last_id = path_id;
1103

    
1104
    rta_free(s->cached_rta);
1105
    s->cached_rta = NULL;
1106
  }
1107

    
1108
  if (!a0)
1109
  {
1110
    /* Route withdraw */
1111
    rte_update2(&s->channel->c, n, NULL, s->last_src);
1112
    return;
1113
  }
1114

    
1115
  /* Prepare cached route attributes */
1116
  if (s->cached_rta == NULL)
1117
  {
1118
    a0->src = s->last_src;
1119

    
1120
    /* Workaround for rta_lookup() breaking eattrs */
1121
    ea_list *ea = a0->eattrs;
1122
    s->cached_rta = rta_lookup(a0);
1123
    a0->eattrs = ea;
1124
  }
1125

    
1126
  rta *a = rta_clone(s->cached_rta);
1127
  rte *e = rte_get_temp(a);
1128

    
1129
  e->pflags = 0;
1130
  e->u.bgp.suppressed = 0;
1131
  rte_update2(&s->channel->c, n, e, s->last_src);
1132
}
1133

    
1134
static void
1135
bgp_encode_mpls_labels(struct bgp_write_state *s UNUSED, adata *mpls, byte **pos, uint *size, byte *pxlen)
1136
{
1137
  u32 dummy = 0;
1138
  u32 *labels = mpls ? (u32 *) mpls->data : &dummy;
1139
  uint lnum = mpls ? (mpls->length / 4) : 1;
1140

    
1141
  for (uint i = 0; i < lnum; i++)
1142
  {
1143
    put_u24(*pos, labels[i] << 4);
1144
    ADVANCE(*pos, *size, 3);
1145
  }
1146

    
1147
  /* Add bottom-of-stack flag */
1148
  (*pos)[-1] |= BGP_MPLS_BOS;
1149

    
1150
  *pxlen += 24 * lnum;
1151
}
1152

    
1153
static void
1154
bgp_decode_mpls_labels(struct bgp_parse_state *s, byte **pos, uint *len, uint *pxlen, rta *a)
1155
{
1156
  u32 labels[BGP_MPLS_MAX], label;
1157
  uint lnum = 0;
1158

    
1159
  do {
1160
    if (*pxlen < 24)
1161
      bgp_parse_error(s, 1);
1162

    
1163
    label = get_u24(*pos);
1164
    labels[lnum++] = label >> 4;
1165
    ADVANCE(*pos, *len, 3);
1166
    *pxlen -= 24;
1167

    
1168
    /* Withdraw: Magic label stack value 0x800000 according to RFC 3107, section 3, last paragraph */
1169
    if (!a && !s->err_withdraw && (lnum == 1) && (label == BGP_MPLS_MAGIC))
1170
      break;
1171
  }
1172
  while (!(label & BGP_MPLS_BOS));
1173

    
1174
  if (!a)
1175
    return;
1176

    
1177
  /* Attach MPLS attribute unless we already have one */
1178
  if (!s->mpls_labels)
1179
  {
1180
    s->mpls_labels = lp_alloc_adata(s->pool, 4*BGP_MPLS_MAX);
1181
    bgp_set_attr_ptr(&(a->eattrs), s->pool, BA_MPLS_LABEL_STACK, 0, s->mpls_labels);
1182
  }
1183

    
1184
  /* Overwrite data in the attribute */
1185
  s->mpls_labels->length = 4*lnum;
1186
  memcpy(s->mpls_labels->data, labels, 4*lnum);
1187

    
1188
  /* Update next hop entry in rta */
1189
  bgp_apply_mpls_labels(s, a, labels, lnum);
1190

    
1191
  /* Attributes were changed, invalidate cached entry */
1192
  rta_free(s->cached_rta);
1193
  s->cached_rta = NULL;
1194

    
1195
  return;
1196
}
1197

    
1198
static uint
1199
bgp_encode_nlri_ip4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1200
{
1201
  byte *pos = buf;
1202

    
1203
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1204
  {
1205
    struct bgp_prefix *px = HEAD(buck->prefixes);
1206
    struct net_addr_ip4 *net = (void *) px->net;
1207

    
1208
    /* Encode path ID */
1209
    if (s->add_path)
1210
    {
1211
      put_u32(pos, px->path_id);
1212
      ADVANCE(pos, size, 4);
1213
    }
1214

    
1215
    /* Encode prefix length */
1216
    *pos = net->pxlen;
1217
    ADVANCE(pos, size, 1);
1218

    
1219
    /* Encode MPLS labels */
1220
    if (s->mpls)
1221
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1222

    
1223
    /* Encode prefix body */
1224
    ip4_addr a = ip4_hton(net->prefix);
1225
    uint b = (net->pxlen + 7) / 8;
1226
    memcpy(pos, &a, b);
1227
    ADVANCE(pos, size, b);
1228

    
1229
    bgp_free_prefix(s->channel, px);
1230
  }
1231

    
1232
  return pos - buf;
1233
}
1234

    
1235
static void
1236
bgp_decode_nlri_ip4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1237
{
1238
  while (len)
1239
  {
1240
    net_addr_ip4 net;
1241
    u32 path_id = 0;
1242

    
1243
    /* Decode path ID */
1244
    if (s->add_path)
1245
    {
1246
      if (len < 5)
1247
        bgp_parse_error(s, 1);
1248

    
1249
      path_id = get_u32(pos);
1250
      ADVANCE(pos, len, 4);
1251
    }
1252

    
1253
    /* Decode prefix length */
1254
    uint l = *pos;
1255
    ADVANCE(pos, len, 1);
1256

    
1257
    if (len < ((l + 7) / 8))
1258
      bgp_parse_error(s, 1);
1259

    
1260
    /* Decode MPLS labels */
1261
    if (s->mpls)
1262
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1263

    
1264
    if (l > IP4_MAX_PREFIX_LENGTH)
1265
      bgp_parse_error(s, 10);
1266

    
1267
    /* Decode prefix body */
1268
    ip4_addr addr = IP4_NONE;
1269
    uint b = (l + 7) / 8;
1270
    memcpy(&addr, pos, b);
1271
    ADVANCE(pos, len, b);
1272

    
1273
    net = NET_ADDR_IP4(ip4_ntoh(addr), l);
1274
    net_normalize_ip4(&net);
1275

    
1276
    // XXXX validate prefix
1277

    
1278
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1279
  }
1280
}
1281

    
1282

    
1283
static uint
1284
bgp_encode_nlri_ip6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1285
{
1286
  byte *pos = buf;
1287

    
1288
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1289
  {
1290
    struct bgp_prefix *px = HEAD(buck->prefixes);
1291
    struct net_addr_ip6 *net = (void *) px->net;
1292

    
1293
    /* Encode path ID */
1294
    if (s->add_path)
1295
    {
1296
      put_u32(pos, px->path_id);
1297
      ADVANCE(pos, size, 4);
1298
    }
1299

    
1300
    /* Encode prefix length */
1301
    *pos = net->pxlen;
1302
    ADVANCE(pos, size, 1);
1303

    
1304
    /* Encode MPLS labels */
1305
    if (s->mpls)
1306
      bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1307

    
1308
    /* Encode prefix body */
1309
    ip6_addr a = ip6_hton(net->prefix);
1310
    uint b = (net->pxlen + 7) / 8;
1311
    memcpy(pos, &a, b);
1312
    ADVANCE(pos, size, b);
1313

    
1314
    bgp_free_prefix(s->channel, px);
1315
  }
1316

    
1317
  return pos - buf;
1318
}
1319

    
1320
static void
1321
bgp_decode_nlri_ip6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1322
{
1323
  while (len)
1324
  {
1325
    net_addr_ip6 net;
1326
    u32 path_id = 0;
1327

    
1328
    /* Decode path ID */
1329
    if (s->add_path)
1330
    {
1331
      if (len < 5)
1332
        bgp_parse_error(s, 1);
1333

    
1334
      path_id = get_u32(pos);
1335
      ADVANCE(pos, len, 4);
1336
    }
1337

    
1338
    /* Decode prefix length */
1339
    uint l = *pos;
1340
    ADVANCE(pos, len, 1);
1341

    
1342
    if (len < ((l + 7) / 8))
1343
      bgp_parse_error(s, 1);
1344

    
1345
    /* Decode MPLS labels */
1346
    if (s->mpls)
1347
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1348

    
1349
    if (l > IP6_MAX_PREFIX_LENGTH)
1350
      bgp_parse_error(s, 10);
1351

    
1352
    /* Decode prefix body */
1353
    ip6_addr addr = IP6_NONE;
1354
    uint b = (l + 7) / 8;
1355
    memcpy(&addr, pos, b);
1356
    ADVANCE(pos, len, b);
1357

    
1358
    net = NET_ADDR_IP6(ip6_ntoh(addr), l);
1359
    net_normalize_ip6(&net);
1360

    
1361
    // XXXX validate prefix
1362

    
1363
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1364
  }
1365
}
1366

    
1367
static uint
1368
bgp_encode_nlri_vpn4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1369
{
1370
  byte *pos = buf;
1371

    
1372
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1373
  {
1374
    struct bgp_prefix *px = HEAD(buck->prefixes);
1375
    struct net_addr_vpn4 *net = (void *) px->net;
1376

    
1377
    /* Encode path ID */
1378
    if (s->add_path)
1379
    {
1380
      put_u32(pos, px->path_id);
1381
      ADVANCE(pos, size, 4);
1382
    }
1383

    
1384
    /* Encode prefix length */
1385
    *pos = 64 + net->pxlen;
1386
    ADVANCE(pos, size, 1);
1387

    
1388
    /* Encode MPLS labels */
1389
    bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1390

    
1391
    /* Encode route distinguisher */
1392
    put_u64(pos, net->rd);
1393
    ADVANCE(pos, size, 8);
1394

    
1395
    /* Encode prefix body */
1396
    ip4_addr a = ip4_hton(net->prefix);
1397
    uint b = (net->pxlen + 7) / 8;
1398
    memcpy(pos, &a, b);
1399
    ADVANCE(pos, size, b);
1400

    
1401
    bgp_free_prefix(s->channel, px);
1402
  }
1403

    
1404
  return pos - buf;
1405
}
1406

    
1407
static void
1408
bgp_decode_nlri_vpn4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1409
{
1410
  while (len)
1411
  {
1412
    net_addr_vpn4 net;
1413
    u32 path_id = 0;
1414

    
1415
    /* Decode path ID */
1416
    if (s->add_path)
1417
    {
1418
      if (len < 5)
1419
        bgp_parse_error(s, 1);
1420

    
1421
      path_id = get_u32(pos);
1422
      ADVANCE(pos, len, 4);
1423
    }
1424

    
1425
    /* Decode prefix length */
1426
    uint l = *pos;
1427
    ADVANCE(pos, len, 1);
1428

    
1429
    if (len < ((l + 7) / 8))
1430
      bgp_parse_error(s, 1);
1431

    
1432
    /* Decode MPLS labels */
1433
    bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1434

    
1435
    /* Decode route distinguisher */
1436
    if (l < 64)
1437
      bgp_parse_error(s, 1);
1438

    
1439
    u64 rd = get_u64(pos);
1440
    ADVANCE(pos, len, 8);
1441
    l -= 64;
1442

    
1443
    if (l > IP4_MAX_PREFIX_LENGTH)
1444
      bgp_parse_error(s, 10);
1445

    
1446
    /* Decode prefix body */
1447
    ip4_addr addr = IP4_NONE;
1448
    uint b = (l + 7) / 8;
1449
    memcpy(&addr, pos, b);
1450
    ADVANCE(pos, len, b);
1451

    
1452
    net = NET_ADDR_VPN4(ip4_ntoh(addr), l, rd);
1453
    net_normalize_vpn4(&net);
1454

    
1455
    // XXXX validate prefix
1456

    
1457
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1458
  }
1459
}
1460

    
1461

    
1462
static uint
1463
bgp_encode_nlri_vpn6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1464
{
1465
  byte *pos = buf;
1466

    
1467
  while (!EMPTY_LIST(buck->prefixes) && (size >= BGP_NLRI_MAX))
1468
  {
1469
    struct bgp_prefix *px = HEAD(buck->prefixes);
1470
    struct net_addr_vpn6 *net = (void *) px->net;
1471

    
1472
    /* Encode path ID */
1473
    if (s->add_path)
1474
    {
1475
      put_u32(pos, px->path_id);
1476
      ADVANCE(pos, size, 4);
1477
    }
1478

    
1479
    /* Encode prefix length */
1480
    *pos = 64 + net->pxlen;
1481
    ADVANCE(pos, size, 1);
1482

    
1483
    /* Encode MPLS labels */
1484
    bgp_encode_mpls_labels(s, s->mpls_labels, &pos, &size, pos - 1);
1485

    
1486
    /* Encode route distinguisher */
1487
    put_u64(pos, net->rd);
1488
    ADVANCE(pos, size, 8);
1489

    
1490
    /* Encode prefix body */
1491
    ip6_addr a = ip6_hton(net->prefix);
1492
    uint b = (net->pxlen + 7) / 8;
1493
    memcpy(pos, &a, b);
1494
    ADVANCE(pos, size, b);
1495

    
1496
    bgp_free_prefix(s->channel, px);
1497
  }
1498

    
1499
  return pos - buf;
1500
}
1501

    
1502
static void
1503
bgp_decode_nlri_vpn6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1504
{
1505
  while (len)
1506
  {
1507
    net_addr_vpn6 net;
1508
    u32 path_id = 0;
1509

    
1510
    /* Decode path ID */
1511
    if (s->add_path)
1512
    {
1513
      if (len < 5)
1514
        bgp_parse_error(s, 1);
1515

    
1516
      path_id = get_u32(pos);
1517
      ADVANCE(pos, len, 4);
1518
    }
1519

    
1520
    /* Decode prefix length */
1521
    uint l = *pos;
1522
    ADVANCE(pos, len, 1);
1523

    
1524
    if (len < ((l + 7) / 8))
1525
      bgp_parse_error(s, 1);
1526

    
1527
    /* Decode MPLS labels */
1528
    if (s->mpls)
1529
      bgp_decode_mpls_labels(s, &pos, &len, &l, a);
1530

    
1531
    /* Decode route distinguisher */
1532
    if (l < 64)
1533
      bgp_parse_error(s, 1);
1534

    
1535
    u64 rd = get_u64(pos);
1536
    ADVANCE(pos, len, 8);
1537
    l -= 64;
1538

    
1539
    if (l > IP6_MAX_PREFIX_LENGTH)
1540
      bgp_parse_error(s, 10);
1541

    
1542
    /* Decode prefix body */
1543
    ip6_addr addr = IP6_NONE;
1544
    uint b = (l + 7) / 8;
1545
    memcpy(&addr, pos, b);
1546
    ADVANCE(pos, len, b);
1547

    
1548
    net = NET_ADDR_VPN6(ip6_ntoh(addr), l, rd);
1549
    net_normalize_vpn6(&net);
1550

    
1551
    // XXXX validate prefix
1552

    
1553
    bgp_rte_update(s, (net_addr *) &net, path_id, a);
1554
  }
1555
}
1556

    
1557

    
1558
static uint
1559
bgp_encode_nlri_flow4(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1560
{
1561
  byte *pos = buf;
1562

    
1563
  while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1564
  {
1565
    struct bgp_prefix *px = HEAD(buck->prefixes);
1566
    struct net_addr_flow4 *net = (void *) px->net;
1567
    uint flen = net->length - sizeof(net_addr_flow4);
1568

    
1569
    /* Encode path ID */
1570
    if (s->add_path)
1571
    {
1572
      put_u32(pos, px->path_id);
1573
      ADVANCE(pos, size, 4);
1574
    }
1575

    
1576
    if (flen > size)
1577
      break;
1578

    
1579
    /* Copy whole flow data including length */
1580
    memcpy(pos, net->data, flen);
1581
    ADVANCE(pos, size, flen);
1582

    
1583
    bgp_free_prefix(s->channel, px);
1584
  }
1585

    
1586
  return pos - buf;
1587
}
1588

    
1589
static void
1590
bgp_decode_nlri_flow4(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1591
{
1592
  while (len)
1593
  {
1594
    u32 path_id = 0;
1595

    
1596
    /* Decode path ID */
1597
    if (s->add_path)
1598
    {
1599
      if (len < 4)
1600
        bgp_parse_error(s, 1);
1601

    
1602
      path_id = get_u32(pos);
1603
      ADVANCE(pos, len, 4);
1604
    }
1605

    
1606
    if (len < 2)
1607
      bgp_parse_error(s, 1);
1608

    
1609
    /* Decode flow length */
1610
    uint hlen = flow_hdr_length(pos);
1611
    uint dlen = flow_read_length(pos);
1612
    uint flen = hlen + dlen;
1613
    byte *data = pos + hlen;
1614

    
1615
    if (len < flen)
1616
      bgp_parse_error(s, 1);
1617

    
1618
    /* Validate flow data */
1619
    enum flow_validated_state r = flow4_validate(data, dlen);
1620
    if (r != FLOW_ST_VALID)
1621
    {
1622
      log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1623
      bgp_parse_error(s, 1);
1624
    }
1625

    
1626
    if (data[0] != FLOW_TYPE_DST_PREFIX)
1627
    {
1628
      log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1629
      bgp_parse_error(s, 1);
1630
    }
1631

    
1632
    /* Decode dst prefix */
1633
    ip4_addr px = IP4_NONE;
1634
    uint pxlen = data[1];
1635

    
1636
    // FIXME: Use some generic function
1637
    memcpy(&px, data, BYTES(pxlen));
1638
    px = ip4_and(px, ip4_mkmask(pxlen));
1639

    
1640
    /* Prepare the flow */
1641
    net_addr *n = alloca(sizeof(struct net_addr_flow4) + flen);
1642
    net_fill_flow4(n, px, pxlen, pos, flen);
1643
    ADVANCE(pos, len, flen);
1644

    
1645
    bgp_rte_update(s, n, path_id, a);
1646
  }
1647
}
1648

    
1649

    
1650
static uint
1651
bgp_encode_nlri_flow6(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, uint size)
1652
{
1653
  byte *pos = buf;
1654

    
1655
  while (!EMPTY_LIST(buck->prefixes) && (size >= 4))
1656
  {
1657
    struct bgp_prefix *px = HEAD(buck->prefixes);
1658
    struct net_addr_flow6 *net = (void *) px->net;
1659
    uint flen = net->length - sizeof(net_addr_flow6);
1660

    
1661
    /* Encode path ID */
1662
    if (s->add_path)
1663
    {
1664
      put_u32(pos, px->path_id);
1665
      ADVANCE(pos, size, 4);
1666
    }
1667

    
1668
    if (flen > size)
1669
      break;
1670

    
1671
    /* Copy whole flow data including length */
1672
    memcpy(pos, net->data, flen);
1673
    ADVANCE(pos, size, flen);
1674

    
1675
    bgp_free_prefix(s->channel, px);
1676
  }
1677

    
1678
  return pos - buf;
1679
}
1680

    
1681
static void
1682
bgp_decode_nlri_flow6(struct bgp_parse_state *s, byte *pos, uint len, rta *a)
1683
{
1684
  while (len)
1685
  {
1686
    u32 path_id = 0;
1687

    
1688
    /* Decode path ID */
1689
    if (s->add_path)
1690
    {
1691
      if (len < 4)
1692
        bgp_parse_error(s, 1);
1693

    
1694
      path_id = get_u32(pos);
1695
      ADVANCE(pos, len, 4);
1696
    }
1697

    
1698
    if (len < 2)
1699
      bgp_parse_error(s, 1);
1700

    
1701
    /* Decode flow length */
1702
    uint hlen = flow_hdr_length(pos);
1703
    uint dlen = flow_read_length(pos);
1704
    uint flen = hlen + dlen;
1705
    byte *data = pos + hlen;
1706

    
1707
    if (len < flen)
1708
      bgp_parse_error(s, 1);
1709

    
1710
    /* Validate flow data */
1711
    enum flow_validated_state r = flow6_validate(data, dlen);
1712
    if (r != FLOW_ST_VALID)
1713
    {
1714
      log(L_REMOTE "%s: Invalid flow route: %s", s->proto->p.name, flow_validated_state_str(r));
1715
      bgp_parse_error(s, 1);
1716
    }
1717

    
1718
    if (data[0] != FLOW_TYPE_DST_PREFIX)
1719
    {
1720
      log(L_REMOTE "%s: No dst prefix at first pos", s->proto->p.name);
1721
      bgp_parse_error(s, 1);
1722
    }
1723

    
1724
    /* Decode dst prefix */
1725
    ip6_addr px = IP6_NONE;
1726
    uint pxlen = data[1];
1727

    
1728
    // FIXME: Use some generic function
1729
    memcpy(&px, data, BYTES(pxlen));
1730
    px = ip6_and(px, ip6_mkmask(pxlen));
1731

    
1732
    /* Prepare the flow */
1733
    net_addr *n = alloca(sizeof(struct net_addr_flow6) + flen);
1734
    net_fill_flow6(n, px, pxlen, pos, flen);
1735
    ADVANCE(pos, len, flen);
1736

    
1737
    bgp_rte_update(s, n, path_id, a);
1738
  }
1739
}
1740

    
1741

    
1742
static const struct bgp_af_desc bgp_af_table[] = {
1743
  {
1744
    .afi = BGP_AF_IPV4,
1745
    .net = NET_IP4,
1746
    .name = "ipv4",
1747
    .encode_nlri = bgp_encode_nlri_ip4,
1748
    .decode_nlri = bgp_decode_nlri_ip4,
1749
    .encode_next_hop = bgp_encode_next_hop_ip,
1750
    .decode_next_hop = bgp_decode_next_hop_ip,
1751
    .update_next_hop = bgp_update_next_hop_ip,
1752
  },
1753
  {
1754
    .afi = BGP_AF_IPV4_MC,
1755
    .net = NET_IP4,
1756
    .name = "ipv4-mc",
1757
    .encode_nlri = bgp_encode_nlri_ip4,
1758
    .decode_nlri = bgp_decode_nlri_ip4,
1759
    .encode_next_hop = bgp_encode_next_hop_ip,
1760
    .decode_next_hop = bgp_decode_next_hop_ip,
1761
    .update_next_hop = bgp_update_next_hop_ip,
1762
  },
1763
  {
1764
    .afi = BGP_AF_IPV4_MPLS,
1765
    .net = NET_IP4,
1766
    .mpls = 1,
1767
    .name = "ipv4-mpls",
1768
    .encode_nlri = bgp_encode_nlri_ip4,
1769
    .decode_nlri = bgp_decode_nlri_ip4,
1770
    .encode_next_hop = bgp_encode_next_hop_ip,
1771
    .decode_next_hop = bgp_decode_next_hop_ip,
1772
    .update_next_hop = bgp_update_next_hop_ip,
1773
  },
1774
  {
1775
    .afi = BGP_AF_IPV6,
1776
    .net = NET_IP6,
1777
    .name = "ipv6",
1778
    .encode_nlri = bgp_encode_nlri_ip6,
1779
    .decode_nlri = bgp_decode_nlri_ip6,
1780
    .encode_next_hop = bgp_encode_next_hop_ip,
1781
    .decode_next_hop = bgp_decode_next_hop_ip,
1782
    .update_next_hop = bgp_update_next_hop_ip,
1783
  },
1784
  {
1785
    .afi = BGP_AF_IPV6_MC,
1786
    .net = NET_IP6,
1787
    .name = "ipv6-mc",
1788
    .encode_nlri = bgp_encode_nlri_ip6,
1789
    .decode_nlri = bgp_decode_nlri_ip6,
1790
    .encode_next_hop = bgp_encode_next_hop_ip,
1791
    .decode_next_hop = bgp_decode_next_hop_ip,
1792
    .update_next_hop = bgp_update_next_hop_ip,
1793
  },
1794
  {
1795
    .afi = BGP_AF_IPV6_MPLS,
1796
    .net = NET_IP6,
1797
    .mpls = 1,
1798
    .name = "ipv6-mpls",
1799
    .encode_nlri = bgp_encode_nlri_ip6,
1800
    .decode_nlri = bgp_decode_nlri_ip6,
1801
    .encode_next_hop = bgp_encode_next_hop_ip,
1802
    .decode_next_hop = bgp_decode_next_hop_ip,
1803
    .update_next_hop = bgp_update_next_hop_ip,
1804
  },
1805
  {
1806
    .afi = BGP_AF_VPN4_MPLS,
1807
    .net = NET_VPN4,
1808
    .mpls = 1,
1809
    .name = "vpn4-mpls",
1810
    .encode_nlri = bgp_encode_nlri_vpn4,
1811
    .decode_nlri = bgp_decode_nlri_vpn4,
1812
    .encode_next_hop = bgp_encode_next_hop_vpn,
1813
    .decode_next_hop = bgp_decode_next_hop_vpn,
1814
    .update_next_hop = bgp_update_next_hop_ip,
1815
  },
1816
  {
1817
    .afi = BGP_AF_VPN6_MPLS,
1818
    .net = NET_VPN6,
1819
    .mpls = 1,
1820
    .name = "vpn6-mpls",
1821
    .encode_nlri = bgp_encode_nlri_vpn6,
1822
    .decode_nlri = bgp_decode_nlri_vpn6,
1823
    .encode_next_hop = bgp_encode_next_hop_vpn,
1824
    .decode_next_hop = bgp_decode_next_hop_vpn,
1825
    .update_next_hop = bgp_update_next_hop_ip,
1826
  },
1827
  {
1828
    .afi = BGP_AF_FLOW4,
1829
    .net = NET_FLOW4,
1830
    .no_igp = 1,
1831
    .name = "flow4",
1832
    .encode_nlri = bgp_encode_nlri_flow4,
1833
    .decode_nlri = bgp_decode_nlri_flow4,
1834
    .encode_next_hop = bgp_encode_next_hop_none,
1835
    .decode_next_hop = bgp_decode_next_hop_none,
1836
    .update_next_hop = bgp_update_next_hop_none,
1837
  },
1838
  {
1839
    .afi = BGP_AF_FLOW6,
1840
    .net = NET_FLOW6,
1841
    .no_igp = 1,
1842
    .name = "flow6",
1843
    .encode_nlri = bgp_encode_nlri_flow6,
1844
    .decode_nlri = bgp_decode_nlri_flow6,
1845
    .encode_next_hop = bgp_encode_next_hop_none,
1846
    .decode_next_hop = bgp_decode_next_hop_none,
1847
    .update_next_hop = bgp_update_next_hop_none,
1848
  },
1849
};
1850

    
1851
const struct bgp_af_desc *
1852
bgp_get_af_desc(u32 afi)
1853
{
1854
  uint i;
1855
  for (i = 0; i < ARRAY_SIZE(bgp_af_table); i++)
1856
    if (bgp_af_table[i].afi == afi)
1857
      return &bgp_af_table[i];
1858

    
1859
  return NULL;
1860
}
1861

    
1862
static inline uint
1863
bgp_encode_nlri(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
1864
{
1865
  return s->channel->desc->encode_nlri(s, buck, buf, end - buf);
1866
}
1867

    
1868
static inline uint
1869
bgp_encode_next_hop(struct bgp_write_state *s, eattr *nh, byte *buf)
1870
{
1871
  return s->channel->desc->encode_next_hop(s, nh, buf, 255);
1872
}
1873

    
1874
void
1875
bgp_update_next_hop(struct bgp_export_state *s, eattr *a, ea_list **to)
1876
{
1877
  s->channel->desc->update_next_hop(s, a, to);
1878
}
1879

    
1880
/*
 * Limit for encoded path attributes. The expression refers to the local
 * `buf` and `end` pointers of the function where it is expanded. The 1024
 * bytes subtracted are presumably room kept for NLRI -- TODO confirm.
 */
#define MAX_ATTRS_LENGTH (end-buf+BGP_HEADER_LENGTH - 1024)
1881

    
1882
/*
 * Build the body of a plain IPv4 UPDATE announcing prefixes of @buck.
 *
 *        2 B        Withdrawn Routes Length (zero)
 *        ---        IPv4 Withdrawn Routes NLRI (unused)
 *        2 B        Total Path Attribute Length
 *        var        Path Attributes
 *        var        IPv4 Network Layer Reachability Information
 *
 * Returns a pointer past the written data, or NULL when the attributes do
 * not fit; in that case the bucket is handed to bgp_withdraw_bucket().
 */
static byte *
bgp_create_ip_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  byte *attrs = buf + 4;

  int la = bgp_encode_attrs(s, buck->eattrs, attrs, buf + MAX_ATTRS_LENGTH);
  if (la < 0)
  {
    /* Attribute list too long */
    bgp_withdraw_bucket(s->channel, buck);
    return NULL;
  }

  /* Both length fields are known now */
  put_u16(buf+0, 0);
  put_u16(buf+2, la);

  byte *nlri = attrs + la;
  int lr = bgp_encode_nlri(s, buck, nlri, end);

  return nlri + lr;
}
1910

    
1911
/*
 * Build the body of an UPDATE announcing prefixes of @buck in MP_REACH_NLRI.
 *
 *        2 B        IPv4 Withdrawn Routes Length (zero)
 *        ---        IPv4 Withdrawn Routes NLRI (unused)
 *        2 B        Total Path Attribute Length
 *        1 B        MP_REACH_NLRI hdr - Attribute Flags
 *        1 B        MP_REACH_NLRI hdr - Attribute Type Code
 *        2 B        MP_REACH_NLRI hdr - Length of Attribute Data
 *        2 B        MP_REACH_NLRI data - Address Family Identifier
 *        1 B        MP_REACH_NLRI data - Subsequent Address Family Identifier
 *        1 B        MP_REACH_NLRI data - Length of Next Hop Network Address
 *        var        MP_REACH_NLRI data - Network Address of Next Hop
 *        1 B        MP_REACH_NLRI data - Reserved (zero)
 *        var        MP_REACH_NLRI data - Network Layer Reachability Information
 *        var        Rest of Path Attributes
 *        ---        IPv4 Network Layer Reachability Information (unused)
 *
 * Returns a pointer past the written data, or NULL when the attributes do
 * not fit; in that case the bucket is handed to bgp_withdraw_bucket().
 */
static byte *
bgp_create_mp_reach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  byte *mp_hdr = buf + 4;

  /* Start of the MP_REACH_NLRI attribute; its length is filled in later */
  mp_hdr[0] = BAF_OPTIONAL | BAF_EXT_LEN;
  mp_hdr[1] = BA_MP_REACH_NLRI;
  put_u16(mp_hdr+2, 0);
  put_af3(mp_hdr+4, s->channel->afi);
  byte *pos = buf+11;

  /* The other attributes are encoded aside, they go after MP_REACH_NLRI */
  byte *abuf = alloca(MAX_ATTRS_LENGTH);
  int la = bgp_encode_attrs(s, buck->eattrs, abuf, abuf + MAX_ATTRS_LENGTH);
  if (la < 0)
  {
    /* Attribute list too long */
    bgp_withdraw_bucket(s->channel, buck);
    return NULL;
  }

  /* Next hop, preceded by its length byte */
  int lh = bgp_encode_next_hop(s, s->mp_next_hop, pos+1);
  pos[0] = lh;
  pos += 1+lh;

  /* Reserved field */
  *pos = 0;
  pos++;

  /* NLRI, leaving @la bytes at the end for the remaining attributes */
  int lr = bgp_encode_nlri(s, buck, pos, end - la);
  pos += lr;

  /* MP_REACH_NLRI attribute is complete, store its data length */
  put_u16(buf+6, pos-buf-8);

  /* Append the remaining attributes */
  memcpy(pos, abuf, la);
  pos += la;

  /* Initial UPDATE fields */
  put_u16(buf+0, 0);
  put_u16(buf+2, pos-buf-4);

  return pos;
}
1975

    
1976
/* The limit macro is not used past this point */
#undef MAX_ATTRS_LENGTH
1977

    
1978
/*
 * Build the body of a plain IPv4 UPDATE withdrawing prefixes of @buck.
 *
 *        2 B        Withdrawn Routes Length
 *        var        IPv4 Withdrawn Routes NLRI
 *        2 B        Total Path Attribute Length (zero)
 *        ---        Path Attributes (unused)
 *        ---        IPv4 Network Layer Reachability Information (unused)
 *
 * Returns a pointer past the written data.
 */
static byte *
bgp_create_ip_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  byte *nlri = buf + 2;
  uint wlen = bgp_encode_nlri(s, buck, nlri, end);
  byte *tail = nlri + wlen;

  put_u16(buf+0, wlen);

  /* Empty path attribute section right after the withdrawn routes */
  put_u16(tail, 0);

  return tail + 2;
}
1996

    
1997
/*
 * Build the body of an UPDATE withdrawing prefixes of @buck in MP_UNREACH_NLRI.
 *
 *        2 B        Withdrawn Routes Length (zero)
 *        ---        IPv4 Withdrawn Routes NLRI (unused)
 *        2 B        Total Path Attribute Length
 *        1 B        MP_UNREACH_NLRI hdr - Attribute Flags
 *        1 B        MP_UNREACH_NLRI hdr - Attribute Type Code
 *        2 B        MP_UNREACH_NLRI hdr - Length of Attribute Data
 *        2 B        MP_UNREACH_NLRI data - Address Family Identifier
 *        1 B        MP_UNREACH_NLRI data - Subsequent Address Family Identifier
 *        var        MP_UNREACH_NLRI data - Network Layer Reachability Information
 *        ---        IPv4 Network Layer Reachability Information (unused)
 *
 * Returns a pointer past the written data.
 */
static byte *
bgp_create_mp_unreach(struct bgp_write_state *s, struct bgp_bucket *buck, byte *buf, byte *end)
{
  /* The NLRI goes first, all lengths depend on it */
  byte *nlri = buf + 11;
  uint wlen = bgp_encode_nlri(s, buck, nlri, end);

  /* UPDATE length fields: 4 B attribute header + 3 B AFI/SAFI + NLRI */
  put_u16(buf+0, 0);
  put_u16(buf+2, 7+wlen);

  /* MP_UNREACH_NLRI attribute header */
  byte *mp_hdr = buf + 4;
  mp_hdr[0] = BAF_OPTIONAL | BAF_EXT_LEN;
  mp_hdr[1] = BA_MP_UNREACH_NLRI;
  put_u16(mp_hdr+2, 3+wlen);
  put_af3(mp_hdr+4, s->channel->afi);

  return nlri + wlen;
}
2026

    
2027
/*
 * Create one UPDATE packet body for channel @c in @buf.
 *
 * The withdraw bucket is served first, then the head of the bucket queue.
 * Empty buckets are freed and buckets whose attributes could not be encoded
 * are skipped, in both cases the next one is tried. Returns a pointer past
 * the written data, or NULL when there is nothing more to send.
 */
static byte *
bgp_create_update(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;
  byte *end = buf + (bgp_max_packet_length(p->conn) - BGP_HEADER_LENGTH);
  struct bgp_write_state s;
  struct bgp_bucket *buck;
  byte *res = NULL;

  for (;;)
  {
    /* Every attempt starts with a fresh write state */
    s = (struct bgp_write_state) {
      .proto = p,
      .channel = c,
      .pool = bgp_linpool,
      .as4_session = p->as4_session,
      .add_path = c->add_path_tx,
      .mpls = c->desc->mpls,
    };

    /* Try unreachable bucket */
    buck = c->withdraw_bucket;
    if (buck && !EMPTY_LIST(buck->prefixes))
    {
      if ((c->afi == BGP_AF_IPV4) && !c->ext_next_hop)
        res = bgp_create_ip_unreach(&s, buck, buf, end);
      else
        res = bgp_create_mp_unreach(&s, buck, buf, end);

      break;
    }

    /* No more prefixes to send */
    if (EMPTY_LIST(c->bucket_queue))
      return NULL;

    /* Try reachable buckets */
    buck = HEAD(c->bucket_queue);

    /* Cleanup empty buckets */
    if (EMPTY_LIST(buck->prefixes))
    {
      bgp_free_bucket(c, buck);
      continue;
    }

    if ((c->afi == BGP_AF_IPV4) && !c->ext_next_hop)
      res = bgp_create_ip_reach(&s, buck, buf, end);
    else
      res = bgp_create_mp_reach(&s, buck, buf, end);

    /* Fully sent buckets are freed, the rest is deferred */
    if (EMPTY_LIST(buck->prefixes))
      bgp_free_bucket(c, buck);
    else
      bgp_defer_bucket(c, buck);

    /* NULL means nothing was produced for this bucket, try the next one */
    if (res)
      break;
  }

  BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
  lp_flush(s.pool);

  return res;
}
2093

    
2094
static byte *
2095
bgp_create_ip_end_mark(struct bgp_channel *c UNUSED, byte *buf)
2096
{
2097
  /* Empty update packet */
2098
  put_u32(buf, 0);
2099

    
2100
  return buf+4;
2101
}
2102

    
2103
/*
 * Write an UPDATE body that carries only an empty MP_UNREACH_NLRI attribute
 * for the channel's AFI/SAFI and return a pointer past it.
 */
static byte *
bgp_create_mp_end_mark(struct bgp_channel *c, byte *buf)
{
  byte *attr = buf + 4;

  put_u16(buf+0, 0);
  put_u16(buf+2, 6);                /* length 4--9 */

  /* Empty MP_UNREACH_NLRI attribute */
  attr[0] = BAF_OPTIONAL;
  attr[1] = BA_MP_UNREACH_NLRI;
  attr[2] = 3;                        /* Length 7--9 */
  put_af3(attr+3, c->afi);

  return attr+6;
}
2117

    
2118
/* Create an END-OF-RIB marker for channel @c: the plain form for IPv4, the MP form otherwise. */
static byte *
bgp_create_end_mark(struct bgp_channel *c, byte *buf)
{
  struct bgp_proto *p = (void *) c->c.proto;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");

  if (c->afi == BGP_AF_IPV4)
    return bgp_create_ip_end_mark(c, buf);

  return bgp_create_mp_end_mark(c, buf);
}
2129

    
2130
static inline void
2131
bgp_rx_end_mark(struct bgp_parse_state *s, u32 afi)
2132
{
2133
  struct bgp_proto *p = s->proto;
2134
  struct bgp_channel *c = bgp_get_channel(p, afi);
2135

    
2136
  BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
2137

    
2138
  if (!c)
2139
    DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2140

    
2141
  if (c->load_state == BFS_LOADING)
2142
    c->load_state = BFS_NONE;
2143

    
2144
  if (p->p.gr_recovery)
2145
    channel_graceful_restart_unlock(&c->c);
2146

    
2147
  if (c->gr_active)
2148
    bgp_graceful_restart_done(c);
2149
}
2150

    
2151
static inline void
2152
bgp_decode_nlri(struct bgp_parse_state *s, u32 afi, byte *nlri, uint len, ea_list *ea, byte *nh, uint nh_len)
2153
{
2154
  struct bgp_channel *c = bgp_get_channel(s->proto, afi);
2155
  rta *a = NULL;
2156

    
2157
  if (!c)
2158
    DISCARD(BAD_AFI, BGP_AFI(afi), BGP_SAFI(afi));
2159

    
2160
  s->channel = c;
2161
  s->add_path = c->add_path_rx;
2162
  s->mpls = c->desc->mpls;
2163

    
2164
  s->last_id = 0;
2165
  s->last_src = s->proto->p.main_source;
2166

    
2167
  /*
2168
   * IPv4 BGP and MP-BGP may be used together in one update, therefore we do not
2169
   * add BA_NEXT_HOP in bgp_decode_attrs(), but we add it here independently for
2170
   * IPv4 BGP and MP-BGP. We undo the attribute (and possibly others attached by
2171
   * decode_next_hop hooks) by restoring a->eattrs afterwards.
2172
   */
2173

    
2174
  if (ea)
2175
  {
2176
    a = allocz(RTA_MAX_SIZE);
2177

    
2178
    a->source = RTS_BGP;
2179
    a->scope = SCOPE_UNIVERSE;
2180
    a->from = s->proto->cf->remote_ip;
2181
    a->eattrs = ea;
2182

    
2183
    c->desc->decode_next_hop(s, nh, nh_len, a);
2184

    
2185
    /* Handle withdraw during next hop decoding */
2186
    if (s->err_withdraw)
2187
      a = NULL;
2188
  }
2189

    
2190
  c->desc->decode_nlri(s, nlri, len, a);
2191

    
2192
  rta_free(s->cached_rta);
2193
  s->cached_rta = NULL;
2194
}
2195

    
2196
static void
2197
bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
2198
{
2199
  struct bgp_proto *p = conn->bgp;
2200
  ea_list *ea = NULL;
2201

    
2202
  BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
2203

    
2204
  /* Workaround for some BGP implementations that skip initial KEEPALIVE */
2205
  if (conn->state == BS_OPENCONFIRM)
2206
    bgp_conn_enter_established_state(conn);
2207

    
2208
  if (conn->state != BS_ESTABLISHED)
2209
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2210

    
2211
  bgp_start_timer(conn->hold_timer, conn->hold_time);
2212

    
2213
  /* Initialize parse state */
2214
  struct bgp_parse_state s = {
2215
    .proto = p,
2216
    .pool = bgp_linpool,
2217
    .as4_session = p->as4_session,
2218
  };
2219

    
2220
  /* Parse error handler */
2221
  if (setjmp(s.err_jmpbuf))
2222
  {
2223
    bgp_error(conn, 3, s.err_subcode, NULL, 0);
2224
    goto done;
2225
  }
2226

    
2227
  /* Check minimal length */
2228
  if (len < 23)
2229
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2230

    
2231
  /* Skip fixed header */
2232
  uint pos = 19;
2233

    
2234
  /*
2235
   *        UPDATE message format
2236
   *
2237
   *        2 B        IPv4 Withdrawn Routes Length
2238
   *        var        IPv4 Withdrawn Routes NLRI
2239
   *        2 B        Total Path Attribute Length
2240
   *        var        Path Attributes
2241
   *        var        IPv4 Reachable Routes NLRI
2242
   */
2243

    
2244
  s.ip_unreach_len = get_u16(pkt + pos);
2245
  s.ip_unreach_nlri = pkt + pos + 2;
2246
  pos += 2 + s.ip_unreach_len;
2247

    
2248
  if (pos + 2 > len)
2249
    bgp_parse_error(&s, 1);
2250

    
2251
  s.attr_len = get_u16(pkt + pos);
2252
  s.attrs = pkt + pos + 2;
2253
  pos += 2 + s.attr_len;
2254

    
2255
  if (pos > len)
2256
    bgp_parse_error(&s, 1);
2257

    
2258
  s.ip_reach_len = len - pos;
2259
  s.ip_reach_nlri = pkt + pos;
2260

    
2261

    
2262
  if (s.attr_len)
2263
    ea = bgp_decode_attrs(&s, s.attrs, s.attr_len);
2264

    
2265
  /* Check for End-of-RIB marker */
2266
  if (!s.attr_len && !s.ip_unreach_len && !s.ip_reach_len)
2267
  { bgp_rx_end_mark(&s, BGP_AF_IPV4); goto done; }
2268

    
2269
  /* Check for MP End-of-RIB marker */
2270
  if ((s.attr_len < 8) && !s.ip_unreach_len && !s.ip_reach_len &&
2271
      !s.mp_reach_len && !s.mp_unreach_len && s.mp_unreach_af)
2272
  { bgp_rx_end_mark(&s, s.mp_unreach_af); goto done; }
2273

    
2274
  if (s.ip_unreach_len)
2275
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_unreach_nlri, s.ip_unreach_len, NULL, NULL, 0);
2276

    
2277
  if (s.mp_unreach_len)
2278
    bgp_decode_nlri(&s, s.mp_unreach_af, s.mp_unreach_nlri, s.mp_unreach_len, NULL, NULL, 0);
2279

    
2280
  if (s.ip_reach_len)
2281
    bgp_decode_nlri(&s, BGP_AF_IPV4, s.ip_reach_nlri, s.ip_reach_len,
2282
                    ea, s.ip_next_hop_data, s.ip_next_hop_len);
2283

    
2284
  if (s.mp_reach_len)
2285
    bgp_decode_nlri(&s, s.mp_reach_af, s.mp_reach_nlri, s.mp_reach_len,
2286
                    ea, s.mp_next_hop_data, s.mp_next_hop_len);
2287

    
2288
done:
2289
  rta_free(s.cached_rta);
2290
  lp_flush(s.pool);
2291
  return;
2292
}
2293

    
2294

    
2295
/*
2296
 *        ROUTE-REFRESH
2297
 */
2298

    
2299
/*
 * Build the body of a ROUTE-REFRESH request (RFC 2918) for channel @c.
 * Returns the position behind the four bytes written.
 */
static inline byte *
bgp_create_route_refresh(struct bgp_channel *c, byte *buf)
{
  /* Not referenced directly below; presumably consumed by BGP_TRACE() */
  struct bgp_proto *p = (void *) c->c.proto;

  BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");

  /* Address family first, then the message subtype in the third byte */
  put_af4(buf, c->afi);
  buf[2] = BGP_RR_REQUEST;

  return buf + 4;
}
2312

    
2313
/*
 * Build the body of a Begin-of-Route-Refresh (BoRR, RFC 7313) message for
 * channel @c. Returns the position behind the four bytes written.
 */
static inline byte *
bgp_create_begin_refresh(struct bgp_channel *c, byte *buf)
{
  /* Not referenced directly below; presumably consumed by BGP_TRACE() */
  struct bgp_proto *p = (void *) c->c.proto;

  BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");

  /* Address family first, then the message subtype in the third byte */
  put_af4(buf, c->afi);
  buf[2] = BGP_RR_BEGIN;

  return buf + 4;
}
2326

    
2327
/*
 * Build the body of an End-of-Route-Refresh (EoRR, RFC 7313) message for
 * channel @c. Returns the position behind the four bytes written.
 */
static inline byte *
bgp_create_end_refresh(struct bgp_channel *c, byte *buf)
{
  /* Not referenced directly below; presumably consumed by BGP_TRACE() */
  struct bgp_proto *p = (void *) c->c.proto;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RR");

  /* Address family first, then the message subtype in the third byte */
  put_af4(buf, c->afi);
  buf[2] = BGP_RR_END;

  return buf + 4;
}
2340

    
2341
static void
2342
bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
2343
{
2344
  struct bgp_proto *p = conn->bgp;
2345

    
2346
  if (conn->state != BS_ESTABLISHED)
2347
  { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
2348

    
2349
  if (!conn->local_caps->route_refresh)
2350
  { bgp_error(conn, 1, 3, pkt+18, 1); return; }
2351

    
2352
  if (len < (BGP_HEADER_LENGTH + 4))
2353
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2354

    
2355
  if (len > (BGP_HEADER_LENGTH + 4))
2356
  { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
2357

    
2358
  struct bgp_channel *c = bgp_get_channel(p, get_af4(pkt+19));
2359
  if (!c)
2360
  {
2361
    log(L_WARN "%s: Got ROUTE-REFRESH subtype %u for AF %u.%u, ignoring",
2362
        p->p.name, pkt[21], get_u16(pkt+19), pkt[22]);
2363
    return;
2364
  }
2365

    
2366
  /* RFC 7313 redefined reserved field as RR message subtype */
2367
  uint subtype = p->enhanced_refresh ? pkt[21] : BGP_RR_REQUEST;
2368

    
2369
  switch (subtype)
2370
  {
2371
  case BGP_RR_REQUEST:
2372
    BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
2373
    channel_request_feeding(&c->c);
2374
    break;
2375

    
2376
  case BGP_RR_BEGIN:
2377
    BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
2378
    bgp_refresh_begin(c);
2379
    break;
2380

    
2381
  case BGP_RR_END:
2382
    BGP_TRACE(D_PACKETS, "Got END-OF-RR");
2383
    bgp_refresh_end(c);
2384
    break;
2385

    
2386
  default:
2387
    log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
2388
        p->p.name, subtype);
2389
    break;
2390
  }
2391
}
2392

    
2393
static inline struct bgp_channel *
2394
bgp_get_channel_to_send(struct bgp_proto *p, struct bgp_conn *conn)
2395
{
2396
  uint i = conn->last_channel;
2397

    
2398
  /* Try the last channel, but at most several times */
2399
  if ((conn->channels_to_send & (1 << i)) &&
2400
      (conn->last_channel_count < 16))
2401
    goto found;
2402

    
2403
  /* Find channel with non-zero channels_to_send */
2404
  do
2405
  {
2406
    i++;
2407
    if (i >= p->channel_count)
2408
      i = 0;
2409
  }
2410
  while (! (conn->channels_to_send & (1 << i)));
2411

    
2412
  /* Use that channel */
2413
  conn->last_channel = i;
2414
  conn->last_channel_count = 0;
2415

    
2416
found:
2417
  conn->last_channel_count++;
2418
  return p->channel_map[i];
2419
}
2420

    
2421
static inline int
2422
bgp_send(struct bgp_conn *conn, uint type, uint len)
2423
{
2424
  sock *sk = conn->sk;
2425
  byte *buf = sk->tbuf;
2426

    
2427
  memset(buf, 0xff, 16);                /* Marker */
2428
  put_u16(buf+16, len);
2429
  buf[18] = type;
2430

    
2431
  return sk_send(sk, len);
2432
}
2433

    
2434
/**
2435
 * bgp_fire_tx - transmit packets
2436
 * @conn: connection
2437
 *
2438
 * Whenever the transmit buffers of the underlying TCP connection
2439
 * are free and we have any packets queued for sending, the socket functions
2440
 * call bgp_fire_tx() which takes care of selecting the highest priority packet
2441
 * queued (Notification > Keepalive > Open > Update), assembling its header
2442
 * and body and sending it to the connection.
2443
 */
2444
static int
2445
bgp_fire_tx(struct bgp_conn *conn)
2446
{
2447
  struct bgp_proto *p = conn->bgp;
2448
  struct bgp_channel *c;
2449
  byte *buf, *pkt, *end;
2450
  uint s;
2451

    
2452
  if (!conn->sk)
2453
    return 0;
2454

    
2455
  buf = conn->sk->tbuf;
2456
  pkt = buf + BGP_HEADER_LENGTH;
2457
  s = conn->packets_to_send;
2458

    
2459
  if (s & (1 << PKT_SCHEDULE_CLOSE))
2460
  {
2461
    /* We can finally close connection and enter idle state */
2462
    bgp_conn_enter_idle_state(conn);
2463
    return 0;
2464
  }
2465
  if (s & (1 << PKT_NOTIFICATION))
2466
  {
2467
    conn->packets_to_send = 1 << PKT_SCHEDULE_CLOSE;
2468
    end = bgp_create_notification(conn, pkt);
2469
    return bgp_send(conn, PKT_NOTIFICATION, end - buf);
2470
  }
2471
  else if (s & (1 << PKT_KEEPALIVE))
2472
  {
2473
    conn->packets_to_send &= ~(1 << PKT_KEEPALIVE);
2474
    BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
2475
    bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
2476
    return bgp_send(conn, PKT_KEEPALIVE, BGP_HEADER_LENGTH);
2477
  }
2478
  else if (s & (1 << PKT_OPEN))
2479
  {
2480
    conn->packets_to_send &= ~(1 << PKT_OPEN);
2481
    end = bgp_create_open(conn, pkt);
2482
    return bgp_send(conn, PKT_OPEN, end - buf);
2483
  }
2484
  else while (conn->channels_to_send)
2485
  {
2486
    c = bgp_get_channel_to_send(p, conn);
2487
    s = c->packets_to_send;
2488

    
2489
    if (s & (1 << PKT_ROUTE_REFRESH))
2490
    {
2491
      c->packets_to_send &= ~(1 << PKT_ROUTE_REFRESH);
2492
      end = bgp_create_route_refresh(c, pkt);
2493
      return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2494
    }
2495
    else if (s & (1 << PKT_BEGIN_REFRESH))
2496
    {
2497
      /* BoRR is a subtype of RR, but uses separate bit in packets_to_send */
2498
      c->packets_to_send &= ~(1 << PKT_BEGIN_REFRESH);
2499
      end = bgp_create_begin_refresh(c, pkt);
2500
      return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2501
    }
2502
    else if (s & (1 << PKT_UPDATE))
2503
    {
2504
      end = bgp_create_update(c, pkt);
2505
      if (end)
2506
        return bgp_send(conn, PKT_UPDATE, end - buf);
2507

    
2508
      /* No update to send, perhaps we need to send End-of-RIB or EoRR */
2509
      c->packets_to_send = 0;
2510
      conn->channels_to_send &= ~(1 << c->index);
2511

    
2512
      if (c->feed_state == BFS_LOADED)
2513
      {
2514
        c->feed_state = BFS_NONE;
2515
        end = bgp_create_end_mark(c, pkt);
2516
        return bgp_send(conn, PKT_UPDATE, end - buf);
2517
      }
2518

    
2519
      else if (c->feed_state == BFS_REFRESHED)
2520
      {
2521
        c->feed_state = BFS_NONE;
2522
        end = bgp_create_end_refresh(c, pkt);
2523
        return bgp_send(conn, PKT_ROUTE_REFRESH, end - buf);
2524
      }
2525
    }
2526
    else if (s)
2527
      bug("Channel packets_to_send: %x", s);
2528

    
2529
    c->packets_to_send = 0;
2530
    conn->channels_to_send &= ~(1 << c->index);
2531
  }
2532

    
2533
  return 0;
2534
}
2535

    
2536
/**
2537
 * bgp_schedule_packet - schedule a packet for transmission
2538
 * @conn: connection
2539
 * @c: channel
2540
 * @type: packet type
2541
 *
2542
 * Schedule a packet of type @type to be sent as soon as possible.
2543
 */
2544
void
2545
bgp_schedule_packet(struct bgp_conn *conn, struct bgp_channel *c, int type)
2546
{
2547
  ASSERT(conn->sk);
2548

    
2549
  DBG("BGP: Scheduling packet type %d\n", type);
2550

    
2551
  if (c)
2552
  {
2553
    if (! conn->channels_to_send)
2554
    {
2555
      conn->last_channel = c->index;
2556
      conn->last_channel_count = 0;
2557
    }
2558

    
2559
    c->packets_to_send |= 1 << type;
2560
    conn->channels_to_send |= 1 << c->index;
2561
  }
2562
  else
2563
    conn->packets_to_send |= 1 << type;
2564

    
2565
  if ((conn->sk->tpos == conn->sk->tbuf) && !ev_active(conn->tx_ev))
2566
    ev_schedule(conn->tx_ev);
2567
}
2568

    
2569
/*
 * Hook taking the connection as an untyped pointer: keep sending packets
 * for as long as bgp_fire_tx() returns a positive value.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;
  int rv;

  DBG("BGP: kicking TX\n");

  do
    rv = bgp_fire_tx(conn);
  while (rv > 0);
}
2578

    
2579
void
2580
bgp_tx(sock *sk)
2581
{
2582
  struct bgp_conn *conn = sk->data;
2583

    
2584
  DBG("BGP: TX hook\n");
2585
  while (bgp_fire_tx(conn) > 0)
2586
    ;
2587
}
2588

    
2589

    
2590
static struct {
2591
  byte major, minor;
2592
  byte *msg;
2593
} bgp_msg_table[] = {
2594
  { 1, 0, "Invalid message header" },
2595
  { 1, 1, "Connection not synchronized" },
2596
  { 1, 2, "Bad message length" },
2597
  { 1, 3, "Bad message type" },
2598
  { 2, 0, "Invalid OPEN message" },
2599
  { 2, 1, "Unsupported version number" },
2600
  { 2, 2, "Bad peer AS" },
2601
  { 2, 3, "Bad BGP identifier" },
2602
  { 2, 4, "Unsupported optional parameter" },
2603
  { 2, 5, "Authentication failure" },
2604
  { 2, 6, "Unacceptable hold time" },
2605
  { 2, 7, "Required capability missing" }, /* [RFC5492] */
2606
  { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
2607
  { 3, 0, "Invalid UPDATE message" },
2608
  { 3, 1, "Malformed attribute list" },
2609
  { 3, 2, "Unrecognized well-known attribute" },
2610
  { 3, 3, "Missing mandatory attribute" },
2611
  { 3, 4, "Invalid attribute flags" },
2612
  { 3, 5, "Invalid attribute length" },
2613
  { 3, 6, "Invalid ORIGIN attribute" },
2614
  { 3, 7, "AS routing loop" },                /* Deprecated */
2615
  { 3, 8, "Invalid NEXT_HOP attribute" },
2616
  { 3, 9, "Optional attribute error" },
2617
  { 3, 10, "Invalid network field" },
2618
  { 3, 11, "Malformed AS_PATH" },
2619
  { 4, 0, "Hold timer expired" },
2620
  { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
2621
  { 5, 1, "Unexpected message in OpenSent state" },
2622
  { 5, 2, "Unexpected message in OpenConfirm state" },
2623
  { 5, 3, "Unexpected message in Established state" },
2624
  { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
2625
  { 6, 1, "Maximum number of prefixes reached" },
2626
  { 6, 2, "Administrative shutdown" },
2627
  { 6, 3, "Peer de-configured" },
2628
  { 6, 4, "Administrative reset" },
2629
  { 6, 5, "Connection rejected" },
2630
  { 6, 6, "Other configuration change" },
2631
  { 6, 7, "Connection collision resolution" },
2632
  { 6, 8, "Out of Resources" },
2633
  { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
2634
  { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
2635
};
2636

    
2637
/**
2638
 * bgp_error_dsc - return BGP error description
2639
 * @code: BGP error code
2640
 * @subcode: BGP error subcode
2641
 *
2642
 * bgp_error_dsc() returns error description for BGP errors
2643
 * which might be static string or given temporary buffer.
2644
 */
2645
const char *
2646
bgp_error_dsc(uint code, uint subcode)
2647
{
2648
  static char buff[32];
2649
  uint i;
2650

    
2651
  for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
2652
    if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
2653
      return bgp_msg_table[i].msg;
2654

    
2655
  bsprintf(buff, "Unknown error %u.%u", code, subcode);
2656
  return buff;
2657
}
2658

    
2659
void
2660
bgp_log_error(struct bgp_proto *p, u8 class, char *msg, uint code, uint subcode, byte *data, uint len)
2661
{
2662
  const byte *name;
2663
  byte *t, argbuf[36];
2664
  uint i;
2665

    
2666
  /* Don't report Cease messages generated by myself */
2667
  if (code == 6 && class == BE_BGP_TX)
2668
    return;
2669

    
2670
  name = bgp_error_dsc(code, subcode);
2671
  t = argbuf;
2672
  if (len)
2673
    {
2674
      *t++ = ':';
2675
      *t++ = ' ';
2676

    
2677
      if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
2678
        {
2679
          /* Bad peer AS - we would like to print the AS */
2680
          t += bsprintf(t, "%u", (len == 2) ? get_u16(data) : get_u32(data));
2681
          goto done;
2682
        }
2683
      if (len > 16)
2684
        len = 16;
2685
      for (i=0; i<len; i++)
2686
        t += bsprintf(t, "%02x", data[i]);
2687
    }
2688
 done:
2689
  *t = 0;
2690
  log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, name, argbuf);
2691
}
2692

    
2693
static void
2694
bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
2695
{
2696
  struct bgp_proto *p = conn->bgp;
2697

    
2698
  if (len < 21)
2699
  { bgp_error(conn, 1, 2, pkt+16, 2); return; }
2700

    
2701
  uint code = pkt[19];
2702
  uint subcode = pkt[20];
2703
  int err = (code != 6);
2704

    
2705
  bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
2706
  bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
2707

    
2708
  bgp_conn_enter_close_state(conn);
2709
  bgp_schedule_packet(conn, NULL, PKT_SCHEDULE_CLOSE);
2710

    
2711
  if (err)
2712
  {
2713
    bgp_update_startup_delay(p);
2714
    bgp_stop(p, 0);
2715
  }
2716
}
2717

    
2718
static void
2719
bgp_rx_keepalive(struct bgp_conn *conn)
2720
{
2721
  struct bgp_proto *p = conn->bgp;
2722

    
2723
  BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
2724
  bgp_start_timer(conn->hold_timer, conn->hold_time);
2725

    
2726
  if (conn->state == BS_OPENCONFIRM)
2727
  { bgp_conn_enter_established_state(conn); return; }
2728

    
2729
  if (conn->state != BS_ESTABLISHED)
2730
    bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
2731
}
2732

    
2733

    
2734
/**
2735
 * bgp_rx_packet - handle a received packet
2736
 * @conn: BGP connection
2737
 * @pkt: start of the packet
2738
 * @len: packet size
2739
 *
2740
 * bgp_rx_packet() takes a newly received packet and calls the corresponding
2741
 * packet handler according to the packet type.
2742
 */
2743
static void
2744
bgp_rx_packet(struct bgp_conn *conn, byte *pkt, uint len)
2745
{
2746
  byte type = pkt[18];
2747

    
2748
  DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
2749

    
2750
  if (conn->bgp->p.mrtdump & MD_MESSAGES)
2751
    mrt_dump_bgp_packet(conn, pkt, len);
2752

    
2753
  switch (type)
2754
  {
2755
  case PKT_OPEN:                return bgp_rx_open(conn, pkt, len);
2756
  case PKT_UPDATE:                return bgp_rx_update(conn, pkt, len);
2757
  case PKT_NOTIFICATION:        return bgp_rx_notification(conn, pkt, len);
2758
  case PKT_KEEPALIVE:                return bgp_rx_keepalive(conn);
2759
  case PKT_ROUTE_REFRESH:        return bgp_rx_route_refresh(conn, pkt, len);
2760
  default:                        bgp_error(conn, 1, 3, pkt+18, 1);
2761
  }
2762
}
2763

    
2764
/**
2765
 * bgp_rx - handle received data
2766
 * @sk: socket
2767
 * @size: amount of data received
2768
 *
2769
 * bgp_rx() is called by the socket layer whenever new data arrive from
2770
 * the underlying TCP connection. It assembles the data fragments to packets,
2771
 * checks their headers and framing and passes complete packets to
2772
 * bgp_rx_packet().
2773
 */
2774
int
2775
bgp_rx(sock *sk, uint size)
2776
{
2777
  struct bgp_conn *conn = sk->data;
2778
  byte *pkt_start = sk->rbuf;
2779
  byte *end = pkt_start + size;
2780
  uint i, len;
2781

    
2782
  DBG("BGP: RX hook: Got %d bytes\n", size);
2783
  while (end >= pkt_start + BGP_HEADER_LENGTH)
2784
    {
2785
      if ((conn->state == BS_CLOSE) || (conn->sk != sk))
2786
        return 0;
2787
      for(i=0; i<16; i++)
2788
        if (pkt_start[i] != 0xff)
2789
          {
2790
            bgp_error(conn, 1, 1, NULL, 0);
2791
            break;
2792
          }
2793
      len = get_u16(pkt_start+16);
2794
      if ((len < BGP_HEADER_LENGTH) || (len > bgp_max_packet_length(conn)))
2795
        {
2796
          bgp_error(conn, 1, 2, pkt_start+16, 2);
2797
          break;
2798
        }
2799
      if (end < pkt_start + len)
2800
        break;
2801
      bgp_rx_packet(conn, pkt_start, len);
2802
      pkt_start += len;
2803
    }
2804
  if (pkt_start != sk->rbuf)
2805
    {
2806
      memmove(sk->rbuf, pkt_start, end - pkt_start);
2807
      sk->rpos = sk->rbuf + (end - pkt_start);
2808
    }
2809
  return 0;
2810
}