Statistics
| Branch: | Revision:

iof-bird-daemon / proto / bgp / packets.c @ ae80a2de

History | View | Annotate | Download (42 KB)

1
/*
2
 *        BIRD -- BGP Packet Processing
3
 *
4
 *        (c) 2000 Martin Mares <mj@ucw.cz>
5
 *
6
 *        Can be freely distributed and used under the terms of the GNU GPL.
7
 */
8

    
9
#undef LOCAL_DEBUG
10

    
11
#include "nest/bird.h"
12
#include "nest/iface.h"
13
#include "nest/protocol.h"
14
#include "nest/route.h"
15
#include "nest/attrs.h"
16
#include "nest/mrtdump.h"
17
#include "conf/conf.h"
18
#include "lib/unaligned.h"
19
#include "lib/socket.h"
20

    
21
#include "nest/cli.h"
22

    
23
#include "bgp.h"
24

    
25

    
26
#define BGP_RR_REQUEST                0
27
#define BGP_RR_BEGIN                1
28
#define BGP_RR_END                2
29

    
30

    
31
static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
32
static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
33

    
34
/* Table for state -> RFC 6608 FSM error subcodes */
35
static byte fsm_err_subcode[BS_MAX] = {
36
  [BS_OPENSENT] = 1,
37
  [BS_OPENCONFIRM] = 2,
38
  [BS_ESTABLISHED] = 3
39
};
40

    
41
/*
42
 * MRT Dump format is not semantically specified.
43
 * We will use these values in appropriate fields:
44
 *
45
 * Local AS, Remote AS - configured AS numbers for given BGP instance.
46
 * Local IP, Remote IP - IP addresses of the TCP connection (0 if no connection)
47
 *
48
 * We dump two kinds of MRT messages: STATE_CHANGE (for BGP state
49
 * changes) and MESSAGE (for received BGP messages).
50
 *
51
 * STATE_CHANGE uses always AS4 variant, but MESSAGE uses AS4 variant
52
 * only when AS4 session is established and even in that case MESSAGE
53
 * does not use AS4 variant for initial OPEN message. This strange
54
 * behavior is here for compatibility with Quagga and Bgpdump.
55
 */
56

    
57
/*
 * Store the common BGP4MP header at @buf and return the position just
 * behind it. The header holds the remote and local AS numbers, the
 * interface index, the address family and the destination and source
 * addresses of the connection socket (IPA_NONE when there is no socket).
 *
 * With @as4 set the AS numbers take 32 bits each; otherwise they take
 * 16 bits each and AS_TRANS is stored for numbers above 0xFFFF.
 */
static byte *
mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4)
{
  struct bgp_proto *p = conn->bgp;
  byte *pos = buf;

  if (!as4)
    {
      put_u16(pos, (p->remote_as > 0xFFFF) ? AS_TRANS : p->remote_as);
      put_u16(pos + 2, (p->local_as > 0xFFFF) ? AS_TRANS : p->local_as);
      pos += 4;
    }
  else
    {
      put_u32(pos, p->remote_as);
      put_u32(pos + 4, p->local_as);
      pos += 8;
    }

  /* Interface index, zero when the neighbor or its interface is unknown */
  if (p->neigh && p->neigh->iface)
    put_u16(pos, p->neigh->iface->index);
  else
    put_u16(pos, 0);

  put_u16(pos + 2, BGP_AF);
  pos += 4;

  if (conn->sk)
    {
      pos = put_ipa(pos, conn->sk->daddr);
      pos = put_ipa(pos, conn->sk->saddr);
    }
  else
    {
      pos = put_ipa(pos, IPA_NONE);
      pos = put_ipa(pos, IPA_NONE);
    }

  return pos;
}
83

    
84
static void
85
mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
86
{
87
  byte buf[BGP_MAX_PACKET_LENGTH + 128];
88
  byte *bp = buf + MRTDUMP_HDR_LENGTH;
89
  int as4 = conn->bgp->as4_session;
90

    
91
  bp = mrt_put_bgp4_hdr(bp, conn, as4);
92
  memcpy(bp, pkt, len);
93
  bp += len;
94
  mrt_dump_message(&conn->bgp->p, BGP4MP, as4 ? BGP4MP_MESSAGE_AS4 : BGP4MP_MESSAGE,
95
                   buf, bp-buf);
96
}
97

    
98
static inline u16
99
convert_state(unsigned state)
100
{
101
  /* Convert state from our BS_* values to values used in MRTDump */
102
  return (state == BS_CLOSE) ? 1 : state + 1;
103
}
104

    
105
void
106
mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new)
107
{
108
  byte buf[128];
109
  byte *bp = buf + MRTDUMP_HDR_LENGTH;
110

    
111
  bp = mrt_put_bgp4_hdr(bp, conn, 1);
112
  put_u16(bp+0, convert_state(old));
113
  put_u16(bp+2, convert_state(new));
114
  bp += 4;
115
  mrt_dump_message(&conn->bgp->p, BGP4MP, BGP4MP_STATE_CHANGE_AS4, buf, bp-buf);
116
}
117

    
118
/*
 * Fill @buf with the body of a NOTIFICATION message: error code,
 * subcode and the notify_size bytes of notify_data stored in @conn.
 * Returns the position just behind the body.
 */
static byte *
bgp_create_notification(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;	/* presumably referenced by BGP_TRACE */
  byte *data = buf + 2;

  BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);

  buf[0] = conn->notify_code;
  buf[1] = conn->notify_subcode;
  memcpy(data, conn->notify_data, conn->notify_size);

  return data + conn->notify_size;
}
129

    
130
#ifdef IPV6
131
static byte *
132
bgp_put_cap_ipv6(struct bgp_proto *p UNUSED, byte *buf)
133
{
134
  *buf++ = 1;                /* Capability 1: Multiprotocol extensions */
135
  *buf++ = 4;                /* Capability data length */
136
  *buf++ = 0;                /* We support AF IPv6 */
137
  *buf++ = BGP_AF_IPV6;
138
  *buf++ = 0;                /* RFU */
139
  *buf++ = 1;                /* and SAFI 1 */
140
  return buf;
141
}
142

    
143
#else
144

    
145
static byte *
146
bgp_put_cap_ipv4(struct bgp_proto *p UNUSED, byte *buf)
147
{
148
  *buf++ = 1;                /* Capability 1: Multiprotocol extensions */
149
  *buf++ = 4;                /* Capability data length */
150
  *buf++ = 0;                /* We support AF IPv4 */
151
  *buf++ = BGP_AF_IPV4;
152
  *buf++ = 0;                /* RFU */
153
  *buf++ = 1;                /* and SAFI 1 */
154
  return buf;
155
}
156
#endif
157

    
158
static byte *
159
bgp_put_cap_rr(struct bgp_proto *p UNUSED, byte *buf)
160
{
161
  *buf++ = 2;                /* Capability 2: Support for route refresh */
162
  *buf++ = 0;                /* Capability data length */
163
  return buf;
164
}
165

    
166
/*
 * Append the graceful restart capability including one AFI/SAFI entry
 * (BGP_AF, SAFI 1). The restart time comes from the configuration; the
 * restart and forwarding flags are set while the protocol is in
 * graceful restart recovery. Returns the new end of the buffer.
 */
static byte *
bgp_put_cap_gr1(struct bgp_proto *p, byte *buf)
{
  buf[0] = 64;			/* Capability 64: Support for graceful restart */
  buf[1] = 6;			/* Capability data length */

  /* Restart time; the flag bit is OR-ed into the first octet of the field */
  put_u16(buf + 2, p->cf->gr_time);
  if (p->p.gr_recovery)
    buf[2] |= BGP_GRF_RESTART;

  buf[4] = 0;			/* Appropriate AF */
  buf[5] = BGP_AF;
  buf[6] = 1;			/* and SAFI 1 */
  buf[7] = p->p.gr_recovery ? BGP_GRF_FORWARDING : 0;

  return buf + 8;
}
184

    
185
/*
 * Append the graceful restart capability without any AFI/SAFI entry
 * and with a zeroed flags/restart-time field. Returns the new end.
 */
static byte *
bgp_put_cap_gr2(struct bgp_proto *p, byte *buf)
{
  buf[0] = 64;			/* Capability 64: Support for graceful restart */
  buf[1] = 2;			/* Capability data length */
  put_u16(buf + 2, 0);

  return buf + 4;
}
193

    
194
/* Append the 4-octet AS capability carrying our local AS; returns new end */
static byte *
bgp_put_cap_as4(struct bgp_proto *p, byte *buf)
{
  buf[0] = 65;			/* Capability 65: Support for 4-octet AS number */
  buf[1] = 4;			/* Capability data length */
  put_u32(buf + 2, p->local_as);

  return buf + 6;
}
202

    
203
/*
 * Append the ADD-PATH capability for BGP_AF / SAFI 1 with the configured
 * add_path value as its last octet. Returns the new end of the buffer.
 */
static byte *
bgp_put_cap_add_path(struct bgp_proto *p, byte *buf)
{
  buf[0] = 69;			/* Capability 69: Support for ADD-PATH */
  buf[1] = 4;			/* Capability data length */

  buf[2] = 0;			/* Appropriate AF */
  buf[3] = BGP_AF;
  buf[4] = 1;			/* SAFI 1 */

  buf[5] = p->cf->add_path;

  return buf + 6;
}
217

    
218
static byte *
219
bgp_put_cap_err(struct bgp_proto *p UNUSED, byte *buf)
220
{
221
  *buf++ = 70;                /* Capability 70: Support for enhanced route refresh */
222
  *buf++ = 0;                /* Capability data length */
223
  return buf;
224
}
225

    
226

    
227
/*
 * Fill @buf with the body of an OPEN message: version, 16-bit AS
 * (AS_TRANS when the local AS does not fit), hold time, router ID and,
 * unless the connection was started in BSS_CONNECT_NOCAP mode, one
 * optional parameter holding the list of configured capabilities.
 * Returns the position just behind the body.
 */
static byte *
bgp_create_open(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  /* Capabilities start after the 1 B params length and 2 B option header */
  byte *caps = buf + 12;
  byte *cap = caps;
  int cap_len;

  BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
            BGP_VERSION, p->local_as, p->cf->hold_time, p->local_id);

  buf[0] = BGP_VERSION;
  put_u16(buf+1, (p->local_as < 0xFFFF) ? p->local_as : AS_TRANS);
  put_u16(buf+3, p->cf->hold_time);
  put_u32(buf+5, p->local_id);

  if (conn->start_state == BSS_CONNECT_NOCAP)
    {
      BGP_TRACE(D_PACKETS, "Skipping capabilities");
      buf[9] = 0;
      return buf + 10;
    }

#ifdef IPV6
  cap = bgp_put_cap_ipv6(p, cap);
#else
  if (p->cf->advertise_ipv4)
    cap = bgp_put_cap_ipv4(p, cap);
#endif

  if (p->cf->enable_refresh)
    cap = bgp_put_cap_rr(p, cap);

  switch (p->cf->gr_mode)
    {
    case BGP_GR_ABLE:
      cap = bgp_put_cap_gr1(p, cap);
      break;
    case BGP_GR_AWARE:
      cap = bgp_put_cap_gr2(p, cap);
      break;
    default:
      break;
    }

  if (p->cf->enable_as4)
    cap = bgp_put_cap_as4(p, cap);

  if (p->cf->add_path)
    cap = bgp_put_cap_add_path(p, cap);

  if (p->cf->enable_refresh)
    cap = bgp_put_cap_err(p, cap);

  cap_len = cap - caps;
  if (cap_len <= 0)
    {
      buf[9] = 0;		/* No optional parameters */
      return buf + 10;
    }

  buf[9]  = cap_len + 2;	/* Optional params len */
  buf[10] = 2;			/* Option: Capability list */
  buf[11] = cap_len;		/* Option length */
  return cap;
}
291

    
292
static uint
293
bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, uint remains)
294
{
295
  byte *start = w;
296
  ip_addr a;
297
  int bytes;
298

    
299
  while (!EMPTY_LIST(buck->prefixes) && (remains >= (5+sizeof(ip_addr))))
300
    {
301
      struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes));
302
      DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen);
303

    
304
      if (p->add_path_tx)
305
        {
306
          put_u32(w, px->path_id);
307
          w += 4;
308
          remains -= 4;
309
        }
310

    
311
      *w++ = px->n.pxlen;
312
      bytes = (px->n.pxlen + 7) / 8;
313
      a = px->n.prefix;
314
      ipa_hton(a);
315
      memcpy(w, &a, bytes);
316
      w += bytes;
317
      remains -= bytes + 1;
318
      rem_node(&px->bucket_node);
319
      bgp_free_prefix(p, px);
320
      // fib_delete(&p->prefix_fib, px);
321
    }
322
  return w - start;
323
}
324

    
325
static void
326
bgp_flush_prefixes(struct bgp_proto *p, struct bgp_bucket *buck)
327
{
328
  while (!EMPTY_LIST(buck->prefixes))
329
    {
330
      struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes));
331
      log(L_ERR "%s: - route %I/%d skipped", p->p.name, px->n.prefix, px->n.pxlen);
332
      rem_node(&px->bucket_node);
333
      bgp_free_prefix(p, px);
334
      // fib_delete(&p->prefix_fib, px);
335
    }
336
}
337

    
338
#ifndef IPV6                /* IPv4 version */
339

    
340
/*
 * Build the body of an UPDATE message (IPv4 version) in @buf.
 *
 * Layout: 2 B withdrawn routes length, withdrawn routes, 2 B total path
 * attribute length, path attributes, NLRI. Withdrawals are taken from the
 * withdraw bucket; then, if at least 3072 bytes remain, the first usable
 * bucket from the bucket queue supplies the attributes and its prefixes.
 * Empty buckets and buckets whose attributes cannot be encoded are
 * removed from the queue on the way.
 *
 * Returns the position just behind the body, or NULL when there is
 * neither a withdrawal nor a reachable prefix to send.
 */
static byte *
bgp_create_update(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  struct bgp_bucket *buck;
  int remains = BGP_MAX_PACKET_LENGTH - BGP_HEADER_LENGTH - 4;
  byte *w;
  int wd_size = 0;
  int r_size = 0;
  int a_size = 0;
  int attrs_written = 0;	/* Set once the attribute length field is stored */

  w = buf+2;
  if ((buck = p->withdraw_bucket) && !EMPTY_LIST(buck->prefixes))
    {
      DBG("Withdrawn routes:\n");
      wd_size = bgp_encode_prefixes(p, w, buck, remains);
      w += wd_size;
      remains -= wd_size;
    }
  put_u16(buf, wd_size);

  if (remains >= 3072)
    {
      while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next)
        {
          if (EMPTY_LIST(buck->prefixes))
            {
              DBG("Deleting empty bucket %p\n", buck);
              rem_node(&buck->send_node);
              bgp_free_bucket(p, buck);
              continue;
            }

          DBG("Processing bucket %p\n", buck);
          a_size = bgp_encode_attrs(p, w+2, buck->eattrs, 2048);

          if (a_size < 0)
            {
              log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name);
              bgp_flush_prefixes(p, buck);
              rem_node(&buck->send_node);
              bgp_free_bucket(p, buck);
              continue;
            }

          put_u16(w, a_size);
          w += a_size + 2;
          attrs_written = 1;
          r_size = bgp_encode_prefixes(p, w, buck, remains - a_size);
          w += r_size;
          break;
        }
    }

  /*
   * The attribute length field is mandatory. Testing a_size here would
   * skip it when the last bucket tried failed to encode (a_size < 0),
   * and would write it twice for a successful encoding of zero length.
   */
  if (!attrs_written)
    {
      put_u16(w, 0);
      w += 2;
    }

  if (wd_size || r_size)
    {
      BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
      return w;
    }
  else
    return NULL;
}
405

    
406
/*
 * Build the End-of-RIB marker (IPv4 version): an UPDATE body whose
 * withdrawn routes length and total path attribute length are both zero.
 * Returns the position just behind the body.
 */
static byte *
bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");

  put_u16(buf, 0);		/* Withdrawn routes length */
  put_u16(buf + 2, 0);		/* Total path attribute length */

  return buf + 4;
}
415

    
416
#else                /* IPv6 version */
417

    
418
static inline int
419
same_iface(struct bgp_proto *p, ip_addr *ip)
420
{
421
  neighbor *n = neigh_find(&p->p, ip, 0);
422
  return n && p->neigh && n->iface == p->neigh->iface;
423
}
424

    
425
/*
 * Build the body of an UPDATE message (IPv6 version) in @buf.
 *
 * The IPv4 withdrawn routes length is always zero; everything is carried
 * in path attributes. Withdrawals go into an MP_UNREACH_NLRI attribute.
 * Then, if at least 3072 bytes remain, the first usable bucket from the
 * bucket queue supplies the regular attributes plus an MP_REACH_NLRI
 * attribute with the next hop(s) and the bucket's prefixes. Temporary
 * attributes live in bgp_linpool, which is flushed before returning.
 *
 * Returns the position just behind the body, or NULL when no attribute
 * was produced.
 */
static byte *
bgp_create_update(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  struct bgp_bucket *buck;
  int size, second, rem_stored, nh_len;
  int remains = BGP_MAX_PACKET_LENGTH - BGP_HEADER_LENGTH - 4;
  byte *w, *w_stored, *tmp, *tstart;
  ip_addr *ipp, ip, ip_ll;
  ea_list *ea;
  eattr *nh;

  put_u16(buf, 0);		/* No IPv4 withdrawn routes */
  w = buf+4;			/* Attribute length at buf+2 is filled in last */

  buck = p->withdraw_bucket;
  if (buck && !EMPTY_LIST(buck->prefixes))
    {
      DBG("Withdrawn routes:\n");
      tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_UNREACH_NLRI, remains-8);
      tmp[0] = 0;		/* AFI */
      tmp[1] = BGP_AF_IPV6;
      tmp[2] = 1;		/* SAFI */
      ea->attrs[0].u.ptr->length = 3 + bgp_encode_prefixes(p, tmp + 3, buck, remains-11);
      size = bgp_encode_attrs(p, w, ea, remains);
      ASSERT(size >= 0);
      w += size;
      remains -= size;
    }

  if (remains >= 3072)
    {
      while ((buck = (struct bgp_bucket *) HEAD(p->bucket_queue))->send_node.next)
        {
          if (EMPTY_LIST(buck->prefixes))
            {
              DBG("Deleting empty bucket %p\n", buck);
              rem_node(&buck->send_node);
              bgp_free_bucket(p, buck);
              continue;
            }

          DBG("Processing bucket %p\n", buck);

          /* Remember the position so a dropped bucket can be rolled back */
          rem_stored = remains;
          w_stored = w;

          size = bgp_encode_attrs(p, w, buck->eattrs, 2048);
          if (size < 0)
            {
              log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name);
              bgp_flush_prefixes(p, buck);
              rem_node(&buck->send_node);
              bgp_free_bucket(p, buck);
              continue;
            }
          w += size;
          remains -= size;

          /* We have two addresses here in NEXT_HOP eattr. Really.
             Unless NEXT_HOP was modified by filter */
          nh = ea_find(buck->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
          ASSERT(nh);
          second = (nh->u.ptr->length == NEXT_HOP_LENGTH);
          ipp = (ip_addr *) nh->u.ptr->data;
          ip = ipp[0];
          ip_ll = IPA_NONE;

          if (ipa_equal(ip, p->source_addr))
            ip_ll = p->local_link;
          else if (ipa_zero(ip) || same_iface(p, &ip))
            {
              /*
               * A 'third party' next hop on the same interface must be
               * accompanied by a link-local next hop address. We use the
               * received one (stored in the other part of the BA_NEXT_HOP
               * eattr). If we did not receive it (for example it is a
               * static route), we cannot use the 'third party' next hop
               * and have to fall back according to the missing_lladdr
               * option. Sending the original next hop address without a
               * link-local address seems to be a natural way to solve
               * that problem, but it is contrary to RFC 2545 and Quagga
               * does not accept such routes.
               *
               * There are two cases, either we have global IP, or
               * IPA_NONE if the neighbor is link-local. For IPA_NONE,
               * we suppose it is on the same iface, see bgp_update_attrs().
               */
              if (second && ipa_nonzero(ipp[1]))
                ip_ll = ipp[1];
              else
                {
                  switch (p->cf->missing_lladdr)
                    {
                    case MLL_SELF:
                      ip = p->source_addr;
                      ip_ll = p->local_link;
                      break;

                    case MLL_DROP:
                      log(L_ERR "%s: Missing link-local next hop address, skipping corresponding routes", p->p.name);
                      w = w_stored;
                      remains = rem_stored;
                      bgp_flush_prefixes(p, buck);
                      rem_node(&buck->send_node);
                      bgp_free_bucket(p, buck);
                      continue;		/* Next bucket of the while loop */

                    case MLL_IGNORE:
                      break;
                    }
                }
            }

          tstart = tmp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_REACH_NLRI, remains-8);
          *tmp++ = 0;			/* AFI */
          *tmp++ = BGP_AF_IPV6;
          *tmp++ = 1;			/* SAFI */

          if (ipa_is_link_local(ip))
            ip = IPA_NONE;

          /* Next hop: global address, optionally followed by link-local one */
          nh_len = ipa_nonzero(ip_ll) ? 32 : 16;
          *tmp++ = nh_len;
          ipa_hton(ip);
          memcpy(tmp, &ip, 16);
          tmp += 16;
          if (nh_len == 32)
            {
              ipa_hton(ip_ll);
              memcpy(tmp, &ip_ll, 16);
              tmp += 16;
            }

          *tmp++ = 0;			/* No SNPA information */
          tmp += bgp_encode_prefixes(p, tmp, buck, remains - (8+3+32+1));
          ea->attrs[0].u.ptr->length = tmp - tstart;
          size = bgp_encode_attrs(p, w, ea, remains);
          ASSERT(size >= 0);
          w += size;
          break;
        }
    }

  size = w - (buf+4);
  put_u16(buf+2, size);
  lp_flush(bgp_linpool);

  if (!size)
    return NULL;

  BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
  return w;
}
584

    
585
/*
 * Build the End-of-RIB marker (IPv6 version): an UPDATE body without
 * withdrawn routes whose only attribute is an empty MP_UNREACH_NLRI for
 * IPv6 / SAFI 1. Returns the position just behind the body.
 */
static byte *
bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");

  put_u16(buf+0, 0);		/* Withdrawn routes length */
  put_u16(buf+2, 6);		/* Attribute length, covers octets 4-9 */

  /* Empty MP_UNREACH_NLRI attribute */
  buf[4] = BAF_OPTIONAL;
  buf[5] = BA_MP_UNREACH_NLRI;
  buf[6] = 3;			/* Length, covers octets 7-9 */
  buf[7] = 0;			/* AFI */
  buf[8] = BGP_AF_IPV6;
  buf[9] = 1;			/* SAFI */

  return buf + 10;
}
604

    
605
#endif
606

    
607
/*
 * Build the body of a plain ROUTE-REFRESH request (RFC 2918) for
 * BGP_AF / SAFI 1. Returns the position just behind the body.
 */
static inline byte *
bgp_create_route_refresh(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");

  buf[0] = 0;			/* AFI, upper octet */
  buf[1] = BGP_AF;		/* AFI, lower octet */
  buf[2] = BGP_RR_REQUEST;	/* Subtype: original refresh request */
  buf[3] = 1;			/* SAFI */

  return buf + 4;
}
620

    
621
/*
 * Build the body of a ROUTE-REFRESH message marking the beginning of a
 * route refresh (BoRR, RFC 7313) for BGP_AF / SAFI 1. Returns the
 * position just behind the body.
 */
static inline byte *
bgp_create_begin_refresh(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");

  buf[0] = 0;			/* AFI, upper octet */
  buf[1] = BGP_AF;		/* AFI, lower octet */
  buf[2] = BGP_RR_BEGIN;	/* Subtype: BoRR */
  buf[3] = 1;			/* SAFI */

  return buf + 4;
}
634

    
635
/*
 * Build the body of a ROUTE-REFRESH message marking the end of a route
 * refresh (EoRR, RFC 7313) for BGP_AF / SAFI 1. Returns the position
 * just behind the body.
 */
static inline byte *
bgp_create_end_refresh(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RR");

  buf[0] = 0;			/* AFI, upper octet */
  buf[1] = BGP_AF;		/* AFI, lower octet */
  buf[2] = BGP_RR_END;		/* Subtype: EoRR */
  buf[3] = 1;			/* SAFI */

  return buf + 4;
}
648

    
649

    
650
static void
651
bgp_create_header(byte *buf, uint len, uint type)
652
{
653
  memset(buf, 0xff, 16);                /* Marker */
654
  put_u16(buf+16, len);
655
  buf[18] = type;
656
}
657

    
658
/**
659
 * bgp_fire_tx - transmit packets
660
 * @conn: connection
661
 *
662
 * Whenever the transmit buffers of the underlying TCP connection
663
 * are free and we have any packets queued for sending, the socket functions
664
 * call bgp_fire_tx() which takes care of selecting the highest priority packet
665
 * queued (Notification > Keepalive > Open > Update), assembling its header
666
 * and body and sending it to the connection.
667
 */
668
static int
669
bgp_fire_tx(struct bgp_conn *conn)
670
{
671
  struct bgp_proto *p = conn->bgp;
672
  uint s = conn->packets_to_send;
673
  sock *sk = conn->sk;
674
  byte *buf, *pkt, *end;
675
  int type;
676

    
677
  if (!sk)
678
    {
679
      conn->packets_to_send = 0;
680
      return 0;
681
    }
682
  buf = sk->tbuf;
683
  pkt = buf + BGP_HEADER_LENGTH;
684

    
685
  if (s & (1 << PKT_SCHEDULE_CLOSE))
686
    {
687
      /* We can finally close connection and enter idle state */
688
      bgp_conn_enter_idle_state(conn);
689
      return 0;
690
    }
691
  if (s & (1 << PKT_NOTIFICATION))
692
    {
693
      s = 1 << PKT_SCHEDULE_CLOSE;
694
      type = PKT_NOTIFICATION;
695
      end = bgp_create_notification(conn, pkt);
696
    }
697
  else if (s & (1 << PKT_KEEPALIVE))
698
    {
699
      s &= ~(1 << PKT_KEEPALIVE);
700
      type = PKT_KEEPALIVE;
701
      end = pkt;                        /* Keepalives carry no data */
702
      BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
703
      bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
704
    }
705
  else if (s & (1 << PKT_OPEN))
706
    {
707
      s &= ~(1 << PKT_OPEN);
708
      type = PKT_OPEN;
709
      end = bgp_create_open(conn, pkt);
710
    }
711
  else if (s & (1 << PKT_ROUTE_REFRESH))
712
    {
713
      s &= ~(1 << PKT_ROUTE_REFRESH);
714
      type = PKT_ROUTE_REFRESH;
715
      end = bgp_create_route_refresh(conn, pkt);
716
    }
717
  else if (s & (1 << PKT_BEGIN_REFRESH))
718
    {
719
      s &= ~(1 << PKT_BEGIN_REFRESH);
720
      type = PKT_ROUTE_REFRESH;        /* BoRR is a subtype of RR */
721
      end = bgp_create_begin_refresh(conn, pkt);
722
    }
723
  else if (s & (1 << PKT_UPDATE))
724
    {
725
      type = PKT_UPDATE;
726
      end = bgp_create_update(conn, pkt);
727

    
728
      if (!end)
729
        {
730
          /* No update to send, perhaps we need to send End-of-RIB or EoRR */
731

    
732
          conn->packets_to_send = 0;
733

    
734
          if (p->feed_state == BFS_LOADED)
735
          {
736
            type = PKT_UPDATE;
737
            end = bgp_create_end_mark(conn, pkt);
738
          }
739

    
740
          else if (p->feed_state == BFS_REFRESHED)
741
          {
742
            type = PKT_ROUTE_REFRESH;
743
            end = bgp_create_end_refresh(conn, pkt);
744
          }
745

    
746
          else /* Really nothing to send */
747
            return 0;
748

    
749
          p->feed_state = BFS_NONE;
750
        }
751
    }
752
  else
753
    return 0;
754

    
755
  conn->packets_to_send = s;
756
  bgp_create_header(buf, end - buf, type);
757
  return sk_send(sk, end - buf);
758
}
759

    
760
/**
761
 * bgp_schedule_packet - schedule a packet for transmission
762
 * @conn: connection
763
 * @type: packet type
764
 *
765
 * Schedule a packet of type @type to be sent as soon as possible.
766
 */
767
void
768
bgp_schedule_packet(struct bgp_conn *conn, int type)
769
{
770
  DBG("BGP: Scheduling packet type %d\n", type);
771
  conn->packets_to_send |= 1 << type;
772
  if (conn->sk && conn->sk->tpos == conn->sk->tbuf && !ev_active(conn->tx_ev))
773
    ev_schedule(conn->tx_ev);
774
}
775

    
776
/*
 * Event hook: keep sending queued packets on the connection @vconn
 * until bgp_fire_tx() stops returning a positive value.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;
  int rv;

  DBG("BGP: kicking TX\n");
  do
    rv = bgp_fire_tx(conn);
  while (rv > 0);
}
785

    
786
void
787
bgp_tx(sock *sk)
788
{
789
  struct bgp_conn *conn = sk->data;
790

    
791
  DBG("BGP: TX hook\n");
792
  while (bgp_fire_tx(conn) > 0)
793
    ;
794
}
795

    
796
/* Capability negotiation as per RFC 2842 */
797

    
798
void
799
bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
800
{
801
  // struct bgp_proto *p = conn->bgp;
802
  int i, cl;
803

    
804
  while (len > 0)
805
    {
806
      if (len < 2 || len < 2 + opt[1])
807
        goto err;
808

    
809
      cl = opt[1];
810

    
811
      switch (opt[0])
812
        {
813
        case 2:        /* Route refresh capability, RFC 2918 */
814
          if (cl != 0)
815
            goto err;
816
          conn->peer_refresh_support = 1;
817
          break;
818

    
819
        case 64: /* Graceful restart capability, RFC 4724 */
820
          if (cl % 4 != 2)
821
            goto err;
822
          conn->peer_gr_aware = 1;
823
          conn->peer_gr_able = 0;
824
          conn->peer_gr_time = get_u16(opt + 2) & 0x0fff;
825
          conn->peer_gr_flags = opt[2] & 0xf0;
826
          conn->peer_gr_aflags = 0;
827
          for (i = 2; i < cl; i += 4)
828
            if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */
829
              {
830
                conn->peer_gr_able = 1;
831
                conn->peer_gr_aflags = opt[2+i+3];
832
              }
833
          break;
834

    
835
        case 65: /* AS4 capability, RFC 4893 */
836
          if (cl != 4)
837
            goto err;
838
          conn->peer_as4_support = 1;
839
          if (conn->bgp->cf->enable_as4)
840
            conn->advertised_as = get_u32(opt + 2);
841
          break;
842

    
843
        case 69: /* ADD-PATH capability, draft */
844
          if (cl % 4)
845
            goto err;
846
          for (i = 0; i < cl; i += 4)
847
            if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */
848
              conn->peer_add_path = opt[2+i+3];
849
          if (conn->peer_add_path > ADD_PATH_FULL)
850
            goto err;
851
          break;
852

    
853
        case 70: /* Enhanced route refresh capability, RFC 7313 */
854
          if (cl != 0)
855
            goto err;
856
          conn->peer_enhanced_refresh_support = 1;
857
          break;
858

    
859
          /* We can safely ignore all other capabilities */
860
        }
861
      len -= 2 + cl;
862
      opt += 2 + cl;
863
    }
864
  return;
865

    
866
 err:
867
  bgp_error(conn, 2, 0, NULL, 0);
868
  return;
869
}
870

    
871
static int
872
bgp_parse_options(struct bgp_conn *conn, byte *opt, int len)
873
{
874
  struct bgp_proto *p = conn->bgp;
875
  int ol;
876

    
877
  while (len > 0)
878
    {
879
      if (len < 2 || len < 2 + opt[1])
880
        { bgp_error(conn, 2, 0, NULL, 0); return 0; }
881
#ifdef LOCAL_DEBUG
882
      {
883
        int i;
884
        DBG("\tOption %02x:", opt[0]);
885
        for(i=0; i<opt[1]; i++)
886
          DBG(" %02x", opt[2+i]);
887
        DBG("\n");
888
      }
889
#endif
890

    
891
      ol = opt[1];
892
      switch (opt[0])
893
        {
894
        case 2:
895
          if (conn->start_state == BSS_CONNECT_NOCAP)
896
            BGP_TRACE(D_PACKETS, "Ignoring received capabilities");
897
          else
898
            bgp_parse_capabilities(conn, opt + 2, ol);
899
          break;
900

    
901
        default:
902
          /*
903
           *  BGP specs don't tell us to send which option
904
           *  we didn't recognize, but it's common practice
905
           *  to do so. Also, capability negotiation with
906
           *  Cisco routers doesn't work without that.
907
           */
908
          bgp_error(conn, 2, 4, opt, ol);
909
          return 0;
910
        }
911
      len -= 2 + ol;
912
      opt += 2 + ol;
913
    }
914
  return 0;
915
}
916

    
917
static void
918
bgp_rx_open(struct bgp_conn *conn, byte *pkt, int len)
919
{
920
  struct bgp_conn *other;
921
  struct bgp_proto *p = conn->bgp;
922
  unsigned hold;
923
  u16 base_as;
924
  u32 id;
925

    
926
  /* Check state */
927
  if (conn->state != BS_OPENSENT)
928
    { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
929

    
930
  /* Check message contents */
931
  if (len < 29 || len != 29 + pkt[28])
932
    { bgp_error(conn, 1, 2, pkt+16, 2); return; }
933
  if (pkt[19] != BGP_VERSION)
934
    { bgp_error(conn, 2, 1, pkt+19, 1); return; } /* RFC 1771 says 16 bits, draft-09 tells to use 8 */
935
  conn->advertised_as = base_as = get_u16(pkt+20);
936
  hold = get_u16(pkt+22);
937
  id = get_u32(pkt+24);
938
  BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%08x)", conn->advertised_as, hold, id);
939

    
940
  if (bgp_parse_options(conn, pkt+29, pkt[28]))
941
    return;
942

    
943
  if (hold > 0 && hold < 3)
944
    { bgp_error(conn, 2, 6, pkt+22, 2); return; }
945

    
946
  /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
947
  if (!id || (p->is_internal && id == p->local_id))
948
    { bgp_error(conn, 2, 3, pkt+24, -4); return; }
949

    
950
  if ((conn->advertised_as != base_as) && (base_as != AS_TRANS))
951
    log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
952

    
953
  if (conn->advertised_as != p->remote_as)
954
    {
955
      if (conn->peer_as4_support)
956
        {
957
          u32 val = htonl(conn->advertised_as);
958
          bgp_error(conn, 2, 2, (byte *) &val, 4);
959
        }
960
      else
961
        bgp_error(conn, 2, 2, pkt+20, 2);
962

    
963
      return;
964
    }
965

    
966
  /* Check the other connection */
967
  other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
968
  switch (other->state)
969
    {
970
    case BS_CONNECT:
971
    case BS_ACTIVE:
972
      /* Stop outgoing connection attempts */
973
      bgp_conn_enter_idle_state(other);
974
      break;
975

    
976
    case BS_IDLE:
977
    case BS_OPENSENT:
978
    case BS_CLOSE:
979
      break;
980

    
981
    case BS_OPENCONFIRM:
982
      /*
983
       * Description of collision detection rules in RFC 4271 is confusing and
984
       * contradictory, but it is essentially:
985
       *
986
       * 1. Router with higher ID is dominant
987
       * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
988
       * 3. When both connections are in OpenConfirm state, one initiated by
989
       *    the dominant router is kept.
990
       *
991
       * The first line in the expression below evaluates whether the neighbor
992
       * is dominant, the second line whether the new connection was initiated
993
       * by the neighbor. If both are true (or both are false), we keep the new
994
       * connection, otherwise we keep the old one.
995
       */
996
      if (((p->local_id < id) || ((p->local_id == id) && (p->local_as < p->remote_as)))
997
          == (conn == &p->incoming_conn))
998
        {
999
          /* Should close the other connection */
1000
          BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
1001
          bgp_error(other, 6, 7, NULL, 0);
1002
          break;
1003
        }
1004
      /* Fall thru */
1005
    case BS_ESTABLISHED:
1006
      /* Should close this connection */
1007
      BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
1008
      bgp_error(conn, 6, 7, NULL, 0);
1009
      return;
1010
    default:
1011
      bug("bgp_rx_open: Unknown state");
1012
    }
1013

    
1014
  /* Update our local variables */
1015
  conn->hold_time = MIN(hold, p->cf->hold_time);
1016
  conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
1017
  p->remote_id = id;
1018
  p->as4_session = p->cf->enable_as4 && conn->peer_as4_support;
1019
  p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX);
1020
  p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX);
1021
  p->gr_ready = p->cf->gr_mode && conn->peer_gr_able;
1022

    
1023
  if (p->add_path_tx)
1024
    p->p.accept_ra_types = RA_ANY;
1025

    
1026
  DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, p->as4_session);
1027

    
1028
  bgp_schedule_packet(conn, PKT_KEEPALIVE);
1029
  bgp_start_timer(conn->hold_timer, conn->hold_time);
1030
  bgp_conn_enter_openconfirm_state(conn);
1031
}
1032

    
1033

    
1034
static inline void
1035
bgp_rx_end_mark(struct bgp_proto *p)
1036
{
1037
  BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
1038

    
1039
  if (p->load_state == BFS_LOADING)
1040
    p->load_state = BFS_NONE;
1041

    
1042
  if (p->p.gr_recovery)
1043
    proto_graceful_restart_unlock(&p->p);
1044

    
1045
  if (p->gr_active)
1046
    bgp_graceful_restart_done(p);
1047
}
1048

    
1049

    
1050
/*
 * DECODE_PREFIX(pp, ll) - parse one prefix at @pp with @ll bytes remaining.
 *
 * Both arguments are advanced/decremented past the parsed entry. The macro
 * relies on names from the expanding function: `p` (for add_path_rx),
 * `path_id`, `prefix`, `pxlen`, `err` and a `done` label. When ADD-PATH
 * reception is on, a 4-byte path identifier precedes the prefix. On a short
 * buffer err is set to 1, on an over-long prefix length to 10, and control
 * jumps to `done`.
 */
#define DECODE_PREFIX(pp, ll) do {                                \
  if (p->add_path_rx)                                                \
  {                                                                \
    if ((ll) < 5) { err = 1; goto done; }                        \
    path_id = get_u32(pp);                                        \
    (pp) += 4;                                                        \
    (ll) -= 4;                                                        \
  }                                                                \
  int plen_bits = *(pp)++;                                        \
  int plen_bytes;                                                \
  (ll)--;                                                        \
  if (plen_bits > BITS_PER_IP_ADDRESS) { err = 10; goto done; }        \
  plen_bytes = (plen_bits + 7) / 8;                                \
  if ((ll) < plen_bytes) { err = 1; goto done; }                \
  memcpy(&prefix, (pp), plen_bytes);                                \
  (pp) += plen_bytes;                                                \
  (ll) -= plen_bytes;                                                \
  ipa_ntoh(prefix);                                                \
  prefix = ipa_and(prefix, ipa_mkmask(plen_bits));                \
  pxlen = plen_bits;                                                \
} while (0)
1071

    
1072

    
1073
static inline void
1074
bgp_rte_update(struct bgp_proto *p, ip_addr prefix, int pxlen,
1075
               u32 path_id, u32 *last_id, struct rte_src **src,
1076
               rta *a0, rta **a)
1077
{
1078
  if (path_id != *last_id)
1079
    {
1080
      *src = rt_get_source(&p->p, path_id);
1081
      *last_id = path_id;
1082

    
1083
      if (*a)
1084
        {
1085
          rta_free(*a);
1086
          *a = NULL;
1087
        }
1088
    }
1089

    
1090
  /* Prepare cached route attributes */
1091
  if (!*a)
1092
    {
1093
      a0->src = *src;
1094

    
1095
      /* Workaround for rta_lookup() breaking eattrs */
1096
      ea_list *ea = a0->eattrs;
1097
      *a = rta_lookup(a0);
1098
      a0->eattrs = ea;
1099
    }
1100

    
1101
  net *n = net_get(p->p.table, prefix, pxlen);
1102
  rte *e = rte_get_temp(rta_clone(*a));
1103
  e->net = n;
1104
  e->pflags = 0;
1105
  e->u.bgp.suppressed = 0;
1106
  rte_update2(p->p.main_ahook, n, e, *src);
1107
}
1108

    
1109
static inline void
1110
bgp_rte_withdraw(struct bgp_proto *p, ip_addr prefix, int pxlen,
1111
                 u32 path_id, u32 *last_id, struct rte_src **src)
1112
{
1113
  if (path_id != *last_id)
1114
    {
1115
      *src = rt_find_source(&p->p, path_id);
1116
      *last_id = path_id;
1117
    }
1118

    
1119
  net *n = net_find(p->p.table, prefix, pxlen);
1120
  rte_update2( p->p.main_ahook, n, NULL, *src);
1121
}
1122

    
1123
static inline int
1124
bgp_set_next_hop(struct bgp_proto *p, rta *a)
1125
{
1126
  struct eattr *nh = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
1127
  ip_addr *nexthop = (ip_addr *) nh->u.ptr->data;
1128

    
1129
#ifdef IPV6
1130
  int second = (nh->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(nexthop[1]);
1131

    
1132
  /* First address should not be link-local, but may be zero in direct mode */
1133
  if (ipa_is_link_local(*nexthop))
1134
    *nexthop = IPA_NONE;
1135
#else
1136
  int second = 0;
1137
#endif
1138

    
1139
  if (p->cf->gw_mode == GW_DIRECT)
1140
    {
1141
      neighbor *ng = NULL;
1142

    
1143
      if (ipa_nonzero(*nexthop))
1144
        ng = neigh_find(&p->p, nexthop, 0);
1145
      else if (second)        /* GW_DIRECT -> single_hop -> p->neigh != NULL */
1146
        ng = neigh_find2(&p->p, nexthop + 1, p->neigh->iface, 0);
1147

    
1148
      /* Fallback */
1149
      if (!ng)
1150
        ng = p->neigh;
1151

    
1152
      if (ng->scope == SCOPE_HOST)
1153
        return 0;
1154

    
1155
      a->dest = RTD_ROUTER;
1156
      a->gw = ng->addr;
1157
      a->iface = ng->iface;
1158
      a->hostentry = NULL;
1159
      a->igp_metric = 0;
1160
    }
1161
  else /* GW_RECURSIVE */
1162
    {
1163
      if (ipa_zero(*nexthop))
1164
          return 0;
1165

    
1166
      rta_set_recursive_next_hop(p->p.table, a, p->igp_table, nexthop, nexthop + second);
1167
    }
1168

    
1169
  return 1;
1170
}
1171

    
1172
#ifndef IPV6                /* IPv4 version */
1173

    
1174
static void
1175
bgp_do_rx_update(struct bgp_conn *conn,
1176
                 byte *withdrawn, int withdrawn_len,
1177
                 byte *nlri, int nlri_len,
1178
                 byte *attrs, int attr_len)
1179
{
1180
  struct bgp_proto *p = conn->bgp;
1181
  struct rte_src *src = p->p.main_source;
1182
  rta *a0, *a = NULL;
1183
  ip_addr prefix;
1184
  int pxlen, err = 0;
1185
  u32 path_id = 0;
1186
  u32 last_id = 0;
1187

    
1188
  /* Check for End-of-RIB marker */
1189
  if (!withdrawn_len && !attr_len && !nlri_len)
1190
    {
1191
      bgp_rx_end_mark(p);
1192
      return;
1193
    }
1194

    
1195
  /* Withdraw routes */
1196
  while (withdrawn_len)
1197
    {
1198
      DECODE_PREFIX(withdrawn, withdrawn_len);
1199
      DBG("Withdraw %I/%d\n", prefix, pxlen);
1200

    
1201
      bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
1202
    }
1203

    
1204
  if (!attr_len && !nlri_len)                /* shortcut */
1205
    return;
1206

    
1207
  a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, nlri_len);
1208

    
1209
  if (conn->state != BS_ESTABLISHED)        /* fatal error during decoding */
1210
    return;
1211

    
1212
  if (a0 && nlri_len && !bgp_set_next_hop(p, a0))
1213
    a0 = NULL;
1214

    
1215
  last_id = 0;
1216
  src = p->p.main_source;
1217

    
1218
  while (nlri_len)
1219
    {
1220
      DECODE_PREFIX(nlri, nlri_len);
1221
      DBG("Add %I/%d\n", prefix, pxlen);
1222

    
1223
      if (a0)
1224
        bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a);
1225
      else /* Forced withdraw as a result of soft error */
1226
        bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
1227
    }
1228

    
1229
 done:
1230
  if (a)
1231
    rta_free(a);
1232

    
1233
  if (err)
1234
    bgp_error(conn, 3, err, NULL, 0);
1235

    
1236
  return;
1237
}
1238

    
1239
#else                        /* IPv6 version */
1240

    
1241
/*
 * DO_NLRI(name) - set up parsing of p->name_start / p->name_len.
 *
 * Loads start, x, len and len0 from the protocol instance; when the data are
 * non-empty it reads the 2-byte AF and 1-byte sub-AF header (err = 9 and jump
 * to `done` if fewer than 3 bytes are present), otherwise af is set to 0.
 * The macro ends with an open `if (af == BGP_AF_IPV6)`, so the statement or
 * block following the macro invocation runs only for IPv6 data.
 */
#define DO_NLRI(name)                                                \
  x = p->name##_start;                                                \
  start = x;                                                        \
  len0 = p->name##_len;                                                \
  len = len0;                                                        \
  if (len != 0)                                                        \
    {                                                                \
      if (len < 3) { err = 9; goto done; }                        \
      af = get_u16(x);                                                \
      sub = x[2];                                                \
      x += 3;                                                        \
      len -= 3;                                                        \
      DBG("\tNLRI AF=%d sub=%d len=%d\n", af, sub, len);        \
    }                                                                \
  else                                                                \
    af = 0;                                                        \
  if (af == BGP_AF_IPV6)
1256

    
1257
static void
1258
bgp_attach_next_hop(rta *a0, byte *x)
1259
{
1260
  ip_addr *nh = (ip_addr *) bgp_attach_attr_wa(&a0->eattrs, bgp_linpool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
1261
  memcpy(nh, x+1, 16);
1262
  ipa_ntoh(nh[0]);
1263

    
1264
  /* We store received link local address in the other part of BA_NEXT_HOP eattr. */
1265
  if (*x == 32)
1266
    {
1267
      memcpy(nh+1, x+17, 16);
1268
      ipa_ntoh(nh[1]);
1269
    }
1270
  else
1271
    nh[1] = IPA_NONE;
1272
}
1273

    
1274

    
1275
static void
1276
bgp_do_rx_update(struct bgp_conn *conn,
1277
                 byte *withdrawn, int withdrawn_len,
1278
                 byte *nlri, int nlri_len,
1279
                 byte *attrs, int attr_len)
1280
{
1281
  struct bgp_proto *p = conn->bgp;
1282
  struct rte_src *src = p->p.main_source;
1283
  byte *start, *x;
1284
  int len, len0;
1285
  unsigned af, sub;
1286
  rta *a0, *a = NULL;
1287
  ip_addr prefix;
1288
  int pxlen, err = 0;
1289
  u32 path_id = 0;
1290
  u32 last_id = 0;
1291

    
1292
  p->mp_reach_len = 0;
1293
  p->mp_unreach_len = 0;
1294
  a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, 0);
1295

    
1296
  if (conn->state != BS_ESTABLISHED)        /* fatal error during decoding */
1297
    return;
1298

    
1299
  /* Check for End-of-RIB marker */
1300
  if ((attr_len < 8) && !withdrawn_len && !nlri_len && !p->mp_reach_len &&
1301
      (p->mp_unreach_len == 3) && (get_u16(p->mp_unreach_start) == BGP_AF_IPV6))
1302
    {
1303
      bgp_rx_end_mark(p);
1304
      return;
1305
    }
1306

    
1307
  DO_NLRI(mp_unreach)
1308
    {
1309
      while (len)
1310
        {
1311
          DECODE_PREFIX(x, len);
1312
          DBG("Withdraw %I/%d\n", prefix, pxlen);
1313
          bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
1314
        }
1315
    }
1316

    
1317
  DO_NLRI(mp_reach)
1318
    {
1319
      /* Create fake NEXT_HOP attribute */
1320
      if (len < 1 || (*x != 16 && *x != 32) || len < *x + 2)
1321
        { err = 9; goto done; }
1322

    
1323
      if (a0)
1324
        bgp_attach_next_hop(a0, x);
1325

    
1326
      /* Also ignore one reserved byte */
1327
      len -= *x + 2;
1328
      x += *x + 2;
1329

    
1330
      if (a0 && ! bgp_set_next_hop(p, a0))
1331
        a0 = NULL;
1332

    
1333
      last_id = 0;
1334
      src = p->p.main_source;
1335

    
1336
      while (len)
1337
        {
1338
          DECODE_PREFIX(x, len);
1339
          DBG("Add %I/%d\n", prefix, pxlen);
1340

    
1341
          if (a0)
1342
            bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a);
1343
          else /* Forced withdraw as a result of soft error */
1344
            bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
1345
        }
1346
    }
1347

    
1348
 done:
1349
  if (a)
1350
    rta_free(a);
1351

    
1352
  if (err) /* Use subcode 9, not err */
1353
    bgp_error(conn, 3, 9, NULL, 0);
1354

    
1355
  return;
1356
}
1357

    
1358
#endif
1359

    
1360
static void
1361
bgp_rx_update(struct bgp_conn *conn, byte *pkt, int len)
1362
{
1363
  struct bgp_proto *p = conn->bgp;
1364
  byte *withdrawn, *attrs, *nlri;
1365
  int withdrawn_len, attr_len, nlri_len;
1366

    
1367
  BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
1368

    
1369
  /* Workaround for some BGP implementations that skip initial KEEPALIVE */
1370
  if (conn->state == BS_OPENCONFIRM)
1371
    bgp_conn_enter_established_state(conn);
1372

    
1373
  if (conn->state != BS_ESTABLISHED)
1374
    { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
1375
  bgp_start_timer(conn->hold_timer, conn->hold_time);
1376

    
1377
  /* Find parts of the packet and check sizes */
1378
  if (len < 23)
1379
    {
1380
      bgp_error(conn, 1, 2, pkt+16, 2);
1381
      return;
1382
    }
1383
  withdrawn = pkt + 21;
1384
  withdrawn_len = get_u16(pkt + 19);
1385
  if (withdrawn_len + 23 > len)
1386
    goto malformed;
1387
  attrs = withdrawn + withdrawn_len + 2;
1388
  attr_len = get_u16(attrs - 2);
1389
  if (withdrawn_len + attr_len + 23 > len)
1390
    goto malformed;
1391
  nlri = attrs + attr_len;
1392
  nlri_len = len - withdrawn_len - attr_len - 23;
1393
  if (!attr_len && nlri_len)
1394
    goto malformed;
1395
  DBG("Sizes: withdrawn=%d, attrs=%d, NLRI=%d\n", withdrawn_len, attr_len, nlri_len);
1396

    
1397
  lp_flush(bgp_linpool);
1398

    
1399
  bgp_do_rx_update(conn, withdrawn, withdrawn_len, nlri, nlri_len, attrs, attr_len);
1400
  return;
1401

    
1402
malformed:
1403
  bgp_error(conn, 3, 1, NULL, 0);
1404
}
1405

    
1406
static struct {
1407
  byte major, minor;
1408
  byte *msg;
1409
} bgp_msg_table[] = {
1410
  { 1, 0, "Invalid message header" },
1411
  { 1, 1, "Connection not synchronized" },
1412
  { 1, 2, "Bad message length" },
1413
  { 1, 3, "Bad message type" },
1414
  { 2, 0, "Invalid OPEN message" },
1415
  { 2, 1, "Unsupported version number" },
1416
  { 2, 2, "Bad peer AS" },
1417
  { 2, 3, "Bad BGP identifier" },
1418
  { 2, 4, "Unsupported optional parameter" },
1419
  { 2, 5, "Authentication failure" },
1420
  { 2, 6, "Unacceptable hold time" },
1421
  { 2, 7, "Required capability missing" }, /* [RFC3392] */
1422
  { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
1423
  { 3, 0, "Invalid UPDATE message" },
1424
  { 3, 1, "Malformed attribute list" },
1425
  { 3, 2, "Unrecognized well-known attribute" },
1426
  { 3, 3, "Missing mandatory attribute" },
1427
  { 3, 4, "Invalid attribute flags" },
1428
  { 3, 5, "Invalid attribute length" },
1429
  { 3, 6, "Invalid ORIGIN attribute" },
1430
  { 3, 7, "AS routing loop" },                /* Deprecated */
1431
  { 3, 8, "Invalid NEXT_HOP attribute" },
1432
  { 3, 9, "Optional attribute error" },
1433
  { 3, 10, "Invalid network field" },
1434
  { 3, 11, "Malformed AS_PATH" },
1435
  { 4, 0, "Hold timer expired" },
1436
  { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
1437
  { 5, 1, "Unexpected message in OpenSent state" },
1438
  { 5, 2, "Unexpected message in OpenConfirm state" },
1439
  { 5, 3, "Unexpected message in Established state" },
1440
  { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
1441
  { 6, 1, "Maximum number of prefixes reached" },
1442
  { 6, 2, "Administrative shutdown" },
1443
  { 6, 3, "Peer de-configured" },
1444
  { 6, 4, "Administrative reset" },
1445
  { 6, 5, "Connection rejected" },
1446
  { 6, 6, "Other configuration change" },
1447
  { 6, 7, "Connection collision resolution" },
1448
  { 6, 8, "Out of Resources" },
1449
  { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
1450
  { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
1451
};
1452

    
1453
/**
1454
 * bgp_error_dsc - return BGP error description
1455
 * @code: BGP error code
1456
 * @subcode: BGP error subcode
1457
 *
1458
 * bgp_error_dsc() returns error description for BGP errors
1459
 * which might be static string or given temporary buffer.
1460
 */
1461
const char *
1462
bgp_error_dsc(unsigned code, unsigned subcode)
1463
{
1464
  static char buff[32];
1465
  unsigned i;
1466
  for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
1467
    if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
1468
      {
1469
        return bgp_msg_table[i].msg;
1470
      }
1471

    
1472
  bsprintf(buff, "Unknown error %d.%d", code, subcode);
1473
  return buff;
1474
}
1475

    
1476
void
1477
bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len)
1478
{
1479
  const byte *name;
1480
  byte *t, argbuf[36];
1481
  unsigned i;
1482

    
1483
  /* Don't report Cease messages generated by myself */
1484
  if (code == 6 && class == BE_BGP_TX)
1485
    return;
1486

    
1487
  name = bgp_error_dsc(code, subcode);
1488
  t = argbuf;
1489
  if (len)
1490
    {
1491
      *t++ = ':';
1492
      *t++ = ' ';
1493

    
1494
      if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
1495
        {
1496
          /* Bad peer AS - we would like to print the AS */
1497
          t += bsprintf(t, "%d", (len == 2) ? get_u16(data) : get_u32(data));
1498
          goto done;
1499
        }
1500
      if (len > 16)
1501
        len = 16;
1502
      for (i=0; i<len; i++)
1503
        t += bsprintf(t, "%02x", data[i]);
1504
    }
1505
 done:
1506
  *t = 0;
1507
  log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, name, argbuf);
1508
}
1509

    
1510
static void
1511
bgp_rx_notification(struct bgp_conn *conn, byte *pkt, int len)
1512
{
1513
  struct bgp_proto *p = conn->bgp;
1514
  if (len < 21)
1515
    {
1516
      bgp_error(conn, 1, 2, pkt+16, 2);
1517
      return;
1518
    }
1519

    
1520
  unsigned code = pkt[19];
1521
  unsigned subcode = pkt[20];
1522
  int err = (code != 6);
1523

    
1524
  bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
1525
  bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
1526

    
1527
#ifndef IPV6
1528
  if ((code == 2) && ((subcode == 4) || (subcode == 7))
1529
      /* Error related to capability:
1530
       * 4 - Peer does not support capabilities at all.
1531
       * 7 - Peer request some capability. Strange unless it is IPv6 only peer.
1532
       */
1533
      && (p->cf->capabilities == 2)
1534
      /* Capabilities are not explicitly enabled or disabled, therefore heuristic is used */
1535
      && (conn->start_state == BSS_CONNECT)
1536
      /* Failed connection attempt have used capabilities */
1537
      && (p->cf->remote_as <= 0xFFFF))
1538
      /* Not possible with disabled capabilities */
1539
    {
1540
      /* We try connect without capabilities */
1541
      log(L_WARN "%s: Capability related error received, retry with capabilities disabled", p->p.name);
1542
      p->start_state = BSS_CONNECT_NOCAP;
1543
      err = 0;
1544
    }
1545
#endif
1546

    
1547
  bgp_conn_enter_close_state(conn);
1548
  bgp_schedule_packet(conn, PKT_SCHEDULE_CLOSE);
1549

    
1550
  if (err) 
1551
    {
1552
      bgp_update_startup_delay(p);
1553
      bgp_stop(p, 0);
1554
    }
1555
}
1556

    
1557
static void
1558
bgp_rx_keepalive(struct bgp_conn *conn)
1559
{
1560
  struct bgp_proto *p = conn->bgp;
1561

    
1562
  BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
1563
  bgp_start_timer(conn->hold_timer, conn->hold_time);
1564
  switch (conn->state)
1565
    {
1566
    case BS_OPENCONFIRM:
1567
      bgp_conn_enter_established_state(conn);
1568
      break;
1569
    case BS_ESTABLISHED:
1570
      break;
1571
    default:
1572
      bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
1573
    }
1574
}
1575

    
1576
static void
1577
bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, int len)
1578
{
1579
  struct bgp_proto *p = conn->bgp;
1580

    
1581
  if (conn->state != BS_ESTABLISHED)
1582
    { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
1583

    
1584
  if (!p->cf->enable_refresh)
1585
    { bgp_error(conn, 1, 3, pkt+18, 1); return; }
1586

    
1587
  if (len < (BGP_HEADER_LENGTH + 4))
1588
    { bgp_error(conn, 1, 2, pkt+16, 2); return; }
1589

    
1590
  if (len > (BGP_HEADER_LENGTH + 4))
1591
    { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
1592

    
1593
  /* FIXME - we ignore AFI/SAFI values, as we support
1594
     just one value and even an error code for an invalid
1595
     request is not defined */
1596

    
1597
  /* RFC 7313 redefined reserved field as RR message subtype */
1598
  uint subtype = conn->peer_enhanced_refresh_support ? pkt[21] : BGP_RR_REQUEST;
1599

    
1600
  switch (subtype)
1601
  {
1602
  case BGP_RR_REQUEST:
1603
    BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
1604
    proto_request_feeding(&p->p);
1605
    break;
1606

    
1607
  case BGP_RR_BEGIN:
1608
    BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
1609
    bgp_refresh_begin(p);
1610
    break;
1611

    
1612
  case BGP_RR_END:
1613
    BGP_TRACE(D_PACKETS, "Got END-OF-RR");
1614
    bgp_refresh_end(p);
1615
    break;
1616

    
1617
  default:
1618
    log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
1619
        p->p.name, subtype);
1620
    break;
1621
  }
1622
}
1623

    
1624

    
1625
/**
1626
 * bgp_rx_packet - handle a received packet
1627
 * @conn: BGP connection
1628
 * @pkt: start of the packet
1629
 * @len: packet size
1630
 *
1631
 * bgp_rx_packet() takes a newly received packet and calls the corresponding
1632
 * packet handler according to the packet type.
1633
 */
1634
static void
1635
bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
1636
{
1637
  byte type = pkt[18];
1638

    
1639
  DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
1640

    
1641
  if (conn->bgp->p.mrtdump & MD_MESSAGES)
1642
    mrt_dump_bgp_packet(conn, pkt, len);
1643

    
1644
  switch (type)
1645
    {
1646
    case PKT_OPEN:                return bgp_rx_open(conn, pkt, len);
1647
    case PKT_UPDATE:                return bgp_rx_update(conn, pkt, len);
1648
    case PKT_NOTIFICATION:      return bgp_rx_notification(conn, pkt, len);
1649
    case PKT_KEEPALIVE:                return bgp_rx_keepalive(conn);
1650
    case PKT_ROUTE_REFRESH:        return bgp_rx_route_refresh(conn, pkt, len);
1651
    default:                        bgp_error(conn, 1, 3, pkt+18, 1);
1652
    }
1653
}
1654

    
1655
/**
1656
 * bgp_rx - handle received data
1657
 * @sk: socket
1658
 * @size: amount of data received
1659
 *
1660
 * bgp_rx() is called by the socket layer whenever new data arrive from
1661
 * the underlying TCP connection. It assembles the data fragments to packets,
1662
 * checks their headers and framing and passes complete packets to
1663
 * bgp_rx_packet().
1664
 */
1665
int
1666
bgp_rx(sock *sk, int size)
1667
{
1668
  struct bgp_conn *conn = sk->data;
1669
  byte *pkt_start = sk->rbuf;
1670
  byte *end = pkt_start + size;
1671
  unsigned i, len;
1672

    
1673
  DBG("BGP: RX hook: Got %d bytes\n", size);
1674
  while (end >= pkt_start + BGP_HEADER_LENGTH)
1675
    {
1676
      if ((conn->state == BS_CLOSE) || (conn->sk != sk))
1677
        return 0;
1678
      for(i=0; i<16; i++)
1679
        if (pkt_start[i] != 0xff)
1680
          {
1681
            bgp_error(conn, 1, 1, NULL, 0);
1682
            break;
1683
          }
1684
      len = get_u16(pkt_start+16);
1685
      if (len < BGP_HEADER_LENGTH || len > BGP_MAX_PACKET_LENGTH)
1686
        {
1687
          bgp_error(conn, 1, 2, pkt_start+16, 2);
1688
          break;
1689
        }
1690
      if (end < pkt_start + len)
1691
        break;
1692
      bgp_rx_packet(conn, pkt_start, len);
1693
      pkt_start += len;
1694
    }
1695
  if (pkt_start != sk->rbuf)
1696
    {
1697
      memmove(sk->rbuf, pkt_start, end - pkt_start);
1698
      sk->rpos = sk->rbuf + (end - pkt_start);
1699
    }
1700
  return 0;
1701
}