Statistics
| Branch: | Revision:

iof-bird-daemon / proto / bgp / packets.c @ 3e236955

History | View | Annotate | Download (42.6 KB)

1
/*
2
 *        BIRD -- BGP Packet Processing
3
 *
4
 *        (c) 2000 Martin Mares <mj@ucw.cz>
5
 *
6
 *        Can be freely distributed and used under the terms of the GNU GPL.
7
 */
8

    
9
#undef LOCAL_DEBUG
10

    
11
#include "nest/bird.h"
12
#include "nest/iface.h"
13
#include "nest/protocol.h"
14
#include "nest/route.h"
15
#include "nest/attrs.h"
16
#include "nest/mrtdump.h"
17
#include "conf/conf.h"
18
#include "lib/unaligned.h"
19
#include "lib/socket.h"
20

    
21
#include "nest/cli.h"
22

    
23
#include "bgp.h"
24

    
25

    
26
#define BGP_RR_REQUEST                0
27
#define BGP_RR_BEGIN                1
28
#define BGP_RR_END                2
29

    
30

    
31
static struct tbf rl_rcv_update = TBF_DEFAULT_LOG_LIMITS;
32
static struct tbf rl_snd_update = TBF_DEFAULT_LOG_LIMITS;
33

    
34
/* Table for state -> RFC 6608 FSM error subcodes */
35
static byte fsm_err_subcode[BS_MAX] = {
36
  [BS_OPENSENT] = 1,
37
  [BS_OPENCONFIRM] = 2,
38
  [BS_ESTABLISHED] = 3
39
};
40

    
41
/*
42
 * MRT Dump format is not semantically specified.
43
 * We will use these values in appropriate fields:
44
 *
45
 * Local AS, Remote AS - configured AS numbers for given BGP instance.
46
 * Local IP, Remote IP - IP addresses of the TCP connection (0 if no connection)
47
 *
48
 * We dump two kinds of MRT messages: STATE_CHANGE (for BGP state
49
 * changes) and MESSAGE (for received BGP messages).
50
 *
51
 * STATE_CHANGE uses always AS4 variant, but MESSAGE uses AS4 variant
52
 * only when AS4 session is established and even in that case MESSAGE
53
 * does not use AS4 variant for initial OPEN message. This strange
54
 * behavior is here for compatibility with Quagga and Bgpdump.
55
 */
56

    
57
/*
 * Write the BGP4MP header fields for @conn to @buf: the remote and local
 * AS numbers (4 bytes each when @as4 is set, 2 bytes each otherwise),
 * the interface index, the address family and the remote and local IP
 * addresses of the connection socket. Returns the position just past
 * the written data.
 */
static byte *
mrt_put_bgp4_hdr(byte *buf, struct bgp_conn *conn, int as4)
{
  struct bgp_proto *bgp = conn->bgp;
  byte *pos = buf;

  if (!as4)
    {
      /* AS numbers which do not fit into 16 bits are replaced by AS_TRANS */
      put_u16(pos, (bgp->remote_as <= 0xFFFF) ? bgp->remote_as : AS_TRANS);
      pos += 2;
      put_u16(pos, (bgp->local_as <= 0xFFFF) ? bgp->local_as : AS_TRANS);
      pos += 2;
    }
  else
    {
      put_u32(pos, bgp->remote_as);
      pos += 4;
      put_u32(pos, bgp->local_as);
      pos += 4;
    }

  /* Interface index (0 when there is no neighbor or no interface) */
  put_u16(pos, (bgp->neigh && bgp->neigh->iface) ? bgp->neigh->iface->index : 0);
  pos += 2;
  put_u16(pos, BGP_AF);
  pos += 2;

  /* Addresses are IPA_NONE when the connection has no socket */
  pos = put_ipa(pos, conn->sk ? conn->sk->daddr : IPA_NONE);
  pos = put_ipa(pos, conn->sk ? conn->sk->saddr : IPA_NONE);

  return pos;
}
83

    
84
static void
85
mrt_dump_bgp_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
86
{
87
  byte *buf = alloca(128+len);        /* 128 is enough for MRT headers */
88
  byte *bp = buf + MRTDUMP_HDR_LENGTH;
89
  int as4 = conn->bgp->as4_session;
90

    
91
  bp = mrt_put_bgp4_hdr(bp, conn, as4);
92
  memcpy(bp, pkt, len);
93
  bp += len;
94
  mrt_dump_message(&conn->bgp->p, BGP4MP, as4 ? BGP4MP_MESSAGE_AS4 : BGP4MP_MESSAGE,
95
                   buf, bp-buf);
96
}
97

    
98
static inline u16
99
convert_state(unsigned state)
100
{
101
  /* Convert state from our BS_* values to values used in MRTDump */
102
  return (state == BS_CLOSE) ? 1 : state + 1;
103
}
104

    
105
void
106
mrt_dump_bgp_state_change(struct bgp_conn *conn, unsigned old, unsigned new)
107
{
108
  byte buf[128];
109
  byte *bp = buf + MRTDUMP_HDR_LENGTH;
110

    
111
  bp = mrt_put_bgp4_hdr(bp, conn, 1);
112
  put_u16(bp+0, convert_state(old));
113
  put_u16(bp+2, convert_state(new));
114
  bp += 4;
115
  mrt_dump_message(&conn->bgp->p, BGP4MP, BGP4MP_STATE_CHANGE_AS4, buf, bp-buf);
116
}
117

    
118
/*
 * Fill the body of a NOTIFICATION message at @buf from the code, subcode
 * and data stored in @conn. Returns the end of the written body.
 */
static byte *
bgp_create_notification(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;	/* presumably referenced by BGP_TRACE() */
  byte *data = buf + 2;

  BGP_TRACE(D_PACKETS, "Sending NOTIFICATION(code=%d.%d)", conn->notify_code, conn->notify_subcode);

  buf[0] = conn->notify_code;
  buf[1] = conn->notify_subcode;
  memcpy(data, conn->notify_data, conn->notify_size);

  return data + conn->notify_size;
}
129

    
130
#ifdef IPV6
131
static byte *
132
bgp_put_cap_ipv6(struct bgp_proto *p UNUSED, byte *buf)
133
{
134
  *buf++ = 1;                /* Capability 1: Multiprotocol extensions */
135
  *buf++ = 4;                /* Capability data length */
136
  *buf++ = 0;                /* We support AF IPv6 */
137
  *buf++ = BGP_AF_IPV6;
138
  *buf++ = 0;                /* RFU */
139
  *buf++ = 1;                /* and SAFI 1 */
140
  return buf;
141
}
142

    
143
#else
144

    
145
static byte *
146
bgp_put_cap_ipv4(struct bgp_proto *p UNUSED, byte *buf)
147
{
148
  *buf++ = 1;                /* Capability 1: Multiprotocol extensions */
149
  *buf++ = 4;                /* Capability data length */
150
  *buf++ = 0;                /* We support AF IPv4 */
151
  *buf++ = BGP_AF_IPV4;
152
  *buf++ = 0;                /* RFU */
153
  *buf++ = 1;                /* and SAFI 1 */
154
  return buf;
155
}
156
#endif
157

    
158
static byte *
159
bgp_put_cap_rr(struct bgp_proto *p UNUSED, byte *buf)
160
{
161
  *buf++ = 2;                /* Capability 2: Support for route refresh */
162
  *buf++ = 0;                /* Capability data length */
163
  return buf;
164
}
165

    
166
static byte *
167
bgp_put_cap_ext_msg(struct bgp_proto *p UNUSED, byte *buf)
168
{
169
  *buf++ = 6;                /* Capability 6: Support for extended messages */
170
  *buf++ = 0;                /* Capability data length */
171
  return buf;
172
}
173

    
174
/*
 * Append the graceful restart capability in its full form: the configured
 * restart time (with the restart flag set during GR recovery) followed by
 * one AFI/SAFI entry for our address family. Returns new end.
 */
static byte *
bgp_put_cap_gr1(struct bgp_proto *p, byte *buf)
{
  buf[0] = 64;			/* Capability 64: Support for graceful restart */
  buf[1] = 6;			/* Capability data length */

  /* Restart time; the restart flag is OR-ed into its first byte */
  put_u16(buf + 2, p->cf->gr_time);
  if (p->p.gr_recovery)
    buf[2] |= BGP_GRF_RESTART;

  buf[4] = 0;			/* Appropriate AF */
  buf[5] = BGP_AF;
  buf[6] = 1;			/* and SAFI 1 */
  buf[7] = p->p.gr_recovery ? BGP_GRF_FORWARDING : 0;

  return buf + 8;
}
192

    
193
static byte *
194
bgp_put_cap_gr2(struct bgp_proto *p UNUSED, byte *buf)
195
{
196
  *buf++ = 64;                /* Capability 64: Support for graceful restart */
197
  *buf++ = 2;                /* Capability data length */
198
  put_u16(buf, 0);
199
  return buf + 2;
200
}
201

    
202
/* Append the 4-octet AS capability carrying our local AS; returns new end */
static byte *
bgp_put_cap_as4(struct bgp_proto *p, byte *buf)
{
  buf[0] = 65;			/* Capability 65: Support for 4-octet AS number */
  buf[1] = 4;			/* Capability data length */
  put_u32(buf + 2, p->local_as);

  return buf + 6;
}
210

    
211
/*
 * Append the ADD-PATH capability with a single entry for our address
 * family, carrying the configured add_path value. Returns new end.
 */
static byte *
bgp_put_cap_add_path(struct bgp_proto *p, byte *buf)
{
  buf[0] = 69;			/* Capability 69: Support for ADD-PATH */
  buf[1] = 4;			/* Capability data length */

  buf[2] = 0;			/* Appropriate AF */
  buf[3] = BGP_AF;
  buf[4] = 1;			/* SAFI 1 */

  buf[5] = p->cf->add_path;

  return buf + 6;
}
225

    
226
static byte *
227
bgp_put_cap_err(struct bgp_proto *p UNUSED, byte *buf)
228
{
229
  *buf++ = 70;                /* Capability 70: Support for enhanced route refresh */
230
  *buf++ = 0;                /* Capability data length */
231
  return buf;
232
}
233

    
234

    
235
/*
 * Fill the body of an OPEN message at @buf: version, AS number, hold time,
 * router ID and, unless the connection was started in the no-capabilities
 * mode, one optional parameter holding the list of capabilities enabled
 * in the configuration. Returns the end of the written body.
 */
static byte *
bgp_create_open(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  byte *caps_start = buf + 12;	/* 3 B skipped: params length + capability option header */
  byte *caps_end = caps_start;
  int caps_size;

  BGP_TRACE(D_PACKETS, "Sending OPEN(ver=%d,as=%d,hold=%d,id=%08x)",
	    BGP_VERSION, p->local_as, p->cf->hold_time, p->local_id);

  buf[0] = BGP_VERSION;
  /*
   * NOTE(review): the comparison is strict, so local AS 0xFFFF is sent
   * as AS_TRANS here, while mrt_put_bgp4_hdr() uses <= 0xFFFF.
   */
  put_u16(buf+1, (p->local_as < 0xFFFF) ? p->local_as : AS_TRANS);
  put_u16(buf+3, p->cf->hold_time);
  put_u32(buf+5, p->local_id);

  if (conn->start_state == BSS_CONNECT_NOCAP)
    {
      BGP_TRACE(D_PACKETS, "Skipping capabilities");
      buf[9] = 0;
      return buf + 10;
    }

#ifdef IPV6
  caps_end = bgp_put_cap_ipv6(p, caps_end);
#else
  if (p->cf->advertise_ipv4)
    caps_end = bgp_put_cap_ipv4(p, caps_end);
#endif

  if (p->cf->enable_refresh)
    caps_end = bgp_put_cap_rr(p, caps_end);

  if (p->cf->gr_mode == BGP_GR_ABLE)
    caps_end = bgp_put_cap_gr1(p, caps_end);
  else if (p->cf->gr_mode == BGP_GR_AWARE)
    caps_end = bgp_put_cap_gr2(p, caps_end);

  if (p->cf->enable_as4)
    caps_end = bgp_put_cap_as4(p, caps_end);

  if (p->cf->add_path)
    caps_end = bgp_put_cap_add_path(p, caps_end);

  /* Enhanced route refresh is tied to the same switch as route refresh */
  if (p->cf->enable_refresh)
    caps_end = bgp_put_cap_err(p, caps_end);

  if (p->cf->enable_extended_messages)
    caps_end = bgp_put_cap_ext_msg(p, caps_end);

  caps_size = caps_end - caps_start;
  if (caps_size <= 0)
    {
      buf[9] = 0;		/* No optional parameters */
      return buf + 10;
    }

  buf[9]  = caps_size + 2;	/* Optional params len */
  buf[10] = 2;			/* Option: Capability list */
  buf[11] = caps_size;		/* Option length */
  return caps_end;
}
302

    
303
static uint
304
bgp_encode_prefixes(struct bgp_proto *p, byte *w, struct bgp_bucket *buck, uint remains)
305
{
306
  byte *start = w;
307
  ip_addr a;
308
  int bytes;
309

    
310
  while (!EMPTY_LIST(buck->prefixes) && (remains >= (5+sizeof(ip_addr))))
311
    {
312
      struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes));
313
      DBG("\tDequeued route %I/%d\n", px->n.prefix, px->n.pxlen);
314

    
315
      if (p->add_path_tx)
316
        {
317
          put_u32(w, px->path_id);
318
          w += 4;
319
          remains -= 4;
320
        }
321

    
322
      *w++ = px->n.pxlen;
323
      bytes = (px->n.pxlen + 7) / 8;
324
      a = px->n.prefix;
325
      ipa_hton(a);
326
      memcpy(w, &a, bytes);
327
      w += bytes;
328
      remains -= bytes + 1;
329
      rem_node(&px->bucket_node);
330
      bgp_free_prefix(p, px);
331
      // fib_delete(&p->prefix_fib, px);
332
    }
333
  return w - start;
334
}
335

    
336
static void
337
bgp_flush_prefixes(struct bgp_proto *p, struct bgp_bucket *buck)
338
{
339
  while (!EMPTY_LIST(buck->prefixes))
340
    {
341
      struct bgp_prefix *px = SKIP_BACK(struct bgp_prefix, bucket_node, HEAD(buck->prefixes));
342
      log(L_ERR "%s: - route %I/%d skipped", p->p.name, px->n.prefix, px->n.pxlen);
343
      rem_node(&px->bucket_node);
344
      bgp_free_prefix(p, px);
345
      // fib_delete(&p->prefix_fib, px);
346
    }
347
}
348

    
349
#ifndef IPV6                /* IPv4 version */
350

    
351
/*
 * Build the body of an IPv4 UPDATE message at @buf.
 *
 * Pending withdrawals are sent first. Only when no withdrawn routes were
 * encoded, the first usable bucket from the bucket queue is sent as path
 * attributes followed by its prefixes. Empty buckets, and buckets whose
 * attributes cannot be encoded, are removed from the queue on the way.
 *
 * Returns the end of the body, or NULL when there is nothing to send.
 */
static byte *
bgp_create_update(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  struct bgp_bucket *buck;
  int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4;
  byte *w = buf + 2;		/* Withdrawn routes follow their 2-byte length */
  int withdrawn_len = 0;
  int nlri_len = 0;
  int attrs_len = 0;

  buck = p->withdraw_bucket;
  if (buck && !EMPTY_LIST(buck->prefixes))
    {
      DBG("Withdrawn routes:\n");
      withdrawn_len = bgp_encode_prefixes(p, w, buck, remains);
      w += withdrawn_len;
      remains -= withdrawn_len;
    }
  put_u16(buf, withdrawn_len);

  if (withdrawn_len == 0)
    {
      for (;;)
	{
	  buck = (struct bgp_bucket *) HEAD(p->bucket_queue);
	  if (!buck->send_node.next)
	    break;

	  if (EMPTY_LIST(buck->prefixes))
	    {
	      DBG("Deleting empty bucket %p\n", buck);
	      rem_node(&buck->send_node);
	      bgp_free_bucket(p, buck);
	      continue;
	    }

	  DBG("Processing bucket %p\n", buck);
	  attrs_len = bgp_encode_attrs(p, w+2, buck->eattrs, remains - 1024);

	  if (attrs_len < 0)
	    {
	      log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name);
	      bgp_flush_prefixes(p, buck);
	      rem_node(&buck->send_node);
	      bgp_free_bucket(p, buck);
	      continue;
	    }

	  /* Attribute length, the attributes themselves, then the NLRI */
	  put_u16(w, attrs_len);
	  w += attrs_len + 2;
	  nlri_len = bgp_encode_prefixes(p, w, buck, remains - attrs_len);
	  w += nlri_len;
	  break;
	}
    }

  if (attrs_len == 0)		/* Attributes not already encoded */
    {
      put_u16(w, 0);
      w += 2;
    }

  if (!withdrawn_len && !nlri_len)
    return NULL;

  BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
  return w;
}
416

    
417
/*
 * IPv4 End-of-RIB marker: an UPDATE body consisting of four zero bytes
 * (both length fields zero). Returns the end of the body.
 */
static byte *
bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;	/* presumably referenced by BGP_TRACE() */
  byte *end = buf + 4;

  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");
  put_u32(buf, 0);

  return end;
}
426

    
427
#else                /* IPv6 version */
428

    
429
static inline int
430
same_iface(struct bgp_proto *p, ip_addr *ip)
431
{
432
  neighbor *n = neigh_find(&p->p, ip, 0);
433
  return n && p->neigh && n->iface == p->neigh->iface;
434
}
435

    
436
/*
 * Build the body of an IPv6 UPDATE message at @buf.
 *
 * The plain IPv4 withdrawn-routes field is left empty; prefixes travel in
 * MP_UNREACH_NLRI / MP_REACH_NLRI attributes built in bgp_linpool.
 * Pending withdrawals are sent first, otherwise the first usable bucket
 * of the bucket queue is sent. Empty buckets and buckets which cannot be
 * encoded are removed from the queue on the way.
 *
 * Returns the end of the body, or NULL when no attributes were written.
 */
static byte *
bgp_create_update(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;
  struct bgp_bucket *buck;
  int len, has_second, saved_remains;
  int remains = bgp_max_packet_length(p) - BGP_HEADER_LENGTH - 4;
  byte *w, *saved_w, *mp, *mp_start;
  ip_addr *hops, nh_global, nh_local;
  ea_list *ea;
  eattr *nh;

  put_u16(buf, 0);		/* No IPv4 withdrawn routes */
  w = buf + 4;			/* Attributes start after both length fields */

  buck = p->withdraw_bucket;
  if (buck && !EMPTY_LIST(buck->prefixes))
    {
      DBG("Withdrawn routes:\n");
      mp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_UNREACH_NLRI, remains-8);
      mp[0] = 0;
      mp[1] = BGP_AF_IPV6;
      mp[2] = 1;
      ea->attrs[0].u.ptr->length = 3 + bgp_encode_prefixes(p, mp + 3, buck, remains-11);
      len = bgp_encode_attrs(p, w, ea, remains);
      ASSERT(len >= 0);
      w += len;
      remains -= len;
    }
  else
    {
      for (;;)
	{
	  buck = (struct bgp_bucket *) HEAD(p->bucket_queue);
	  if (!buck->send_node.next)
	    break;

	  if (EMPTY_LIST(buck->prefixes))
	    {
	      DBG("Deleting empty bucket %p\n", buck);
	      rem_node(&buck->send_node);
	      bgp_free_bucket(p, buck);
	      continue;
	    }

	  DBG("Processing bucket %p\n", buck);

	  /* Remember the position so that a dropped bucket can be undone */
	  saved_remains = remains;
	  saved_w = w;

	  len = bgp_encode_attrs(p, w, buck->eattrs, remains - 1024);
	  if (len < 0)
	    {
	      log(L_ERR "%s: Attribute list too long, skipping corresponding routes", p->p.name);
	      bgp_flush_prefixes(p, buck);
	      rem_node(&buck->send_node);
	      bgp_free_bucket(p, buck);
	      continue;
	    }
	  w += len;
	  remains -= len;

	  /* We have two addresses here in NEXT_HOP eattr. Really.
	     Unless NEXT_HOP was modified by filter */
	  nh = ea_find(buck->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
	  ASSERT(nh);
	  has_second = (nh->u.ptr->length == NEXT_HOP_LENGTH);
	  hops = (ip_addr *) nh->u.ptr->data;
	  nh_global = hops[0];
	  nh_local = IPA_NONE;

	  if (ipa_equal(nh_global, p->source_addr))
	    nh_local = p->local_link;
	  else if (ipa_zero(nh_global) || same_iface(p, &nh_global))
	    {
	      /*
	       * If we send a route with a 'third party' next hop located
	       * on the same interface, we should also send a link-local
	       * next hop address. We use the received one (stored in the
	       * other part of the BA_NEXT_HOP eattr). If we did not
	       * receive it (for example it is a static route), we cannot
	       * use the 'third party' next hop and we have to use the local
	       * IP address as next hop. Sending the original next hop
	       * address without a link-local address seems to be a natural
	       * way to solve that problem, but it is contrary to RFC 2545
	       * and Quagga does not accept such routes.
	       *
	       * There are two cases, either we have a global IP, or
	       * IPA_NONE if the neighbor is link-local. For IPA_NONE,
	       * we suppose it is on the same iface, see bgp_update_attrs().
	       */
	      if (has_second && ipa_nonzero(hops[1]))
		nh_local = hops[1];
	      else
		{
		  switch (p->cf->missing_lladdr)
		    {
		    case MLL_SELF:
		      nh_global = p->source_addr;
		      nh_local = p->local_link;
		      break;
		    case MLL_DROP:
		      log(L_ERR "%s: Missing link-local next hop address, skipping corresponding routes", p->p.name);
		      w = saved_w;
		      remains = saved_remains;
		      bgp_flush_prefixes(p, buck);
		      rem_node(&buck->send_node);
		      bgp_free_bucket(p, buck);
		      continue;		/* Next bucket in the queue */
		    case MLL_IGNORE:
		      break;
		    }
		}
	    }

	  mp_start = mp = bgp_attach_attr_wa(&ea, bgp_linpool, BA_MP_REACH_NLRI, remains-8);
	  *mp++ = 0;
	  *mp++ = BGP_AF_IPV6;
	  *mp++ = 1;

	  /* A link-local address is never sent in the global slot */
	  if (ipa_is_link_local(nh_global))
	    nh_global = IPA_NONE;

	  /* Next hop length: 32 with a link-local address, 16 without */
	  if (ipa_nonzero(nh_local))
	    {
	      *mp++ = 32;
	      ipa_hton(nh_global);
	      memcpy(mp, &nh_global, 16);
	      ipa_hton(nh_local);
	      memcpy(mp+16, &nh_local, 16);
	      mp += 32;
	    }
	  else
	    {
	      *mp++ = 16;
	      ipa_hton(nh_global);
	      memcpy(mp, &nh_global, 16);
	      mp += 16;
	    }

	  *mp++ = 0;			/* No SNPA information */
	  mp += bgp_encode_prefixes(p, mp, buck, remains - (8+3+32+1));
	  ea->attrs[0].u.ptr->length = mp - mp_start;
	  len = bgp_encode_attrs(p, w, ea, remains);
	  ASSERT(len >= 0);
	  w += len;
	  break;
	}
    }

  /* Total path attribute length goes to the second length field */
  len = w - (buf+4);
  put_u16(buf+2, len);
  lp_flush(bgp_linpool);

  if (!len)
    return NULL;

  BGP_TRACE_RL(&rl_snd_update, D_PACKETS, "Sending UPDATE");
  return w;
}
594

    
595
/*
 * IPv6 End-of-RIB marker: an UPDATE body with no withdrawn routes and a
 * single empty MP_UNREACH_NLRI attribute for IPv6 / SAFI 1.
 * Returns the end of the body.
 */
static byte *
bgp_create_end_mark(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;	/* presumably referenced by BGP_TRACE() */

  BGP_TRACE(D_PACKETS, "Sending END-OF-RIB");

  put_u16(buf+0, 0);
  put_u16(buf+2, 6);		/* length 4-9 */

  /* Empty MP_UNREACH_NLRI attribute */
  buf[4] = BAF_OPTIONAL;
  buf[5] = BA_MP_UNREACH_NLRI;
  buf[6] = 3;			/* Length 7-9 */
  buf[7] = 0;			/* AFI */
  buf[8] = BGP_AF_IPV6;
  buf[9] = 1;			/* SAFI */

  return buf + 10;
}
614

    
615
#endif
616

    
617
/* Fill the 4-byte body of a ROUTE-REFRESH request; returns its end */
static inline byte *
bgp_create_route_refresh(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;	/* presumably referenced by BGP_TRACE() */

  BGP_TRACE(D_PACKETS, "Sending ROUTE-REFRESH");

  /* Original route refresh request, RFC 2918 */
  buf[0] = 0;
  buf[1] = BGP_AF;
  buf[2] = BGP_RR_REQUEST;
  buf[3] = 1;			/* SAFI */

  return buf + 4;
}
630

    
631
/* Fill the 4-byte body of a BoRR message; returns its end */
static inline byte *
bgp_create_begin_refresh(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;	/* presumably referenced by BGP_TRACE() */

  BGP_TRACE(D_PACKETS, "Sending BEGIN-OF-RR");

  /* Demarcation of beginning of route refresh (BoRR), RFC 7313 */
  buf[0] = 0;
  buf[1] = BGP_AF;
  buf[2] = BGP_RR_BEGIN;
  buf[3] = 1;			/* SAFI */

  return buf + 4;
}
644

    
645
/* Fill the 4-byte body of an EoRR message; returns its end */
static inline byte *
bgp_create_end_refresh(struct bgp_conn *conn, byte *buf)
{
  struct bgp_proto *p = conn->bgp;	/* presumably referenced by BGP_TRACE() */

  BGP_TRACE(D_PACKETS, "Sending END-OF-RR");

  /* Demarcation of ending of route refresh (EoRR), RFC 7313 */
  buf[0] = 0;
  buf[1] = BGP_AF;
  buf[2] = BGP_RR_END;
  buf[3] = 1;			/* SAFI */

  return buf + 4;
}
658

    
659

    
660
static void
661
bgp_create_header(byte *buf, uint len, uint type)
662
{
663
  memset(buf, 0xff, 16);                /* Marker */
664
  put_u16(buf+16, len);
665
  buf[18] = type;
666
}
667

    
668
/**
669
 * bgp_fire_tx - transmit packets
670
 * @conn: connection
671
 *
672
 * Whenever the transmit buffers of the underlying TCP connection
673
 * are free and we have any packets queued for sending, the socket functions
674
 * call bgp_fire_tx() which takes care of selecting the highest priority packet
675
 * queued (Notification > Keepalive > Open > Update), assembling its header
676
 * and body and sending it to the connection.
677
 */
678
static int
679
bgp_fire_tx(struct bgp_conn *conn)
680
{
681
  struct bgp_proto *p = conn->bgp;
682
  uint s = conn->packets_to_send;
683
  sock *sk = conn->sk;
684
  byte *buf, *pkt, *end;
685
  int type;
686

    
687
  if (!sk)
688
    {
689
      conn->packets_to_send = 0;
690
      return 0;
691
    }
692
  buf = sk->tbuf;
693
  pkt = buf + BGP_HEADER_LENGTH;
694

    
695
  if (s & (1 << PKT_SCHEDULE_CLOSE))
696
    {
697
      /* We can finally close connection and enter idle state */
698
      bgp_conn_enter_idle_state(conn);
699
      return 0;
700
    }
701
  if (s & (1 << PKT_NOTIFICATION))
702
    {
703
      s = 1 << PKT_SCHEDULE_CLOSE;
704
      type = PKT_NOTIFICATION;
705
      end = bgp_create_notification(conn, pkt);
706
    }
707
  else if (s & (1 << PKT_KEEPALIVE))
708
    {
709
      s &= ~(1 << PKT_KEEPALIVE);
710
      type = PKT_KEEPALIVE;
711
      end = pkt;                        /* Keepalives carry no data */
712
      BGP_TRACE(D_PACKETS, "Sending KEEPALIVE");
713
      bgp_start_timer(conn->keepalive_timer, conn->keepalive_time);
714
    }
715
  else if (s & (1 << PKT_OPEN))
716
    {
717
      s &= ~(1 << PKT_OPEN);
718
      type = PKT_OPEN;
719
      end = bgp_create_open(conn, pkt);
720
    }
721
  else if (s & (1 << PKT_ROUTE_REFRESH))
722
    {
723
      s &= ~(1 << PKT_ROUTE_REFRESH);
724
      type = PKT_ROUTE_REFRESH;
725
      end = bgp_create_route_refresh(conn, pkt);
726
    }
727
  else if (s & (1 << PKT_BEGIN_REFRESH))
728
    {
729
      s &= ~(1 << PKT_BEGIN_REFRESH);
730
      type = PKT_ROUTE_REFRESH;        /* BoRR is a subtype of RR */
731
      end = bgp_create_begin_refresh(conn, pkt);
732
    }
733
  else if (s & (1 << PKT_UPDATE))
734
    {
735
      type = PKT_UPDATE;
736
      end = bgp_create_update(conn, pkt);
737

    
738
      if (!end)
739
        {
740
          /* No update to send, perhaps we need to send End-of-RIB or EoRR */
741

    
742
          conn->packets_to_send = 0;
743

    
744
          if (p->feed_state == BFS_LOADED)
745
          {
746
            type = PKT_UPDATE;
747
            end = bgp_create_end_mark(conn, pkt);
748
          }
749

    
750
          else if (p->feed_state == BFS_REFRESHED)
751
          {
752
            type = PKT_ROUTE_REFRESH;
753
            end = bgp_create_end_refresh(conn, pkt);
754
          }
755

    
756
          else /* Really nothing to send */
757
            return 0;
758

    
759
          p->feed_state = BFS_NONE;
760
        }
761
    }
762
  else
763
    return 0;
764

    
765
  conn->packets_to_send = s;
766
  bgp_create_header(buf, end - buf, type);
767
  return sk_send(sk, end - buf);
768
}
769

    
770
/**
771
 * bgp_schedule_packet - schedule a packet for transmission
772
 * @conn: connection
773
 * @type: packet type
774
 *
775
 * Schedule a packet of type @type to be sent as soon as possible.
776
 */
777
void
778
bgp_schedule_packet(struct bgp_conn *conn, int type)
779
{
780
  DBG("BGP: Scheduling packet type %d\n", type);
781
  conn->packets_to_send |= 1 << type;
782
  if (conn->sk && conn->sk->tpos == conn->sk->tbuf && !ev_active(conn->tx_ev))
783
    ev_schedule(conn->tx_ev);
784
}
785

    
786
/*
 * Hook taking the connection as an untyped pointer: keep calling
 * bgp_fire_tx() until it returns a non-positive value.
 */
void
bgp_kick_tx(void *vconn)
{
  struct bgp_conn *conn = vconn;

  DBG("BGP: kicking TX\n");

  for (;;)
    if (bgp_fire_tx(conn) <= 0)
      break;
}
795

    
796
void
797
bgp_tx(sock *sk)
798
{
799
  struct bgp_conn *conn = sk->data;
800

    
801
  DBG("BGP: TX hook\n");
802
  while (bgp_fire_tx(conn) > 0)
803
    ;
804
}
805

    
806
/* Capability negotiation as per RFC 2842 */
807

    
808
void
809
bgp_parse_capabilities(struct bgp_conn *conn, byte *opt, int len)
810
{
811
  // struct bgp_proto *p = conn->bgp;
812
  int i, cl;
813

    
814
  while (len > 0)
815
    {
816
      if (len < 2 || len < 2 + opt[1])
817
        goto err;
818

    
819
      cl = opt[1];
820

    
821
      switch (opt[0])
822
        {
823
        case 2:        /* Route refresh capability, RFC 2918 */
824
          if (cl != 0)
825
            goto err;
826
          conn->peer_refresh_support = 1;
827
          break;
828

    
829
        case 6: /* Extended message length capability, draft */
830
          if (cl != 0)
831
            goto err;
832
          conn->peer_ext_messages_support = 1;
833
          break;
834

    
835
        case 64: /* Graceful restart capability, RFC 4724 */
836
          if (cl % 4 != 2)
837
            goto err;
838
          conn->peer_gr_aware = 1;
839
          conn->peer_gr_able = 0;
840
          conn->peer_gr_time = get_u16(opt + 2) & 0x0fff;
841
          conn->peer_gr_flags = opt[2] & 0xf0;
842
          conn->peer_gr_aflags = 0;
843
          for (i = 2; i < cl; i += 4)
844
            if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */
845
              {
846
                conn->peer_gr_able = 1;
847
                conn->peer_gr_aflags = opt[2+i+3];
848
              }
849
          break;
850

    
851
        case 65: /* AS4 capability, RFC 4893 */
852
          if (cl != 4)
853
            goto err;
854
          conn->peer_as4_support = 1;
855
          if (conn->bgp->cf->enable_as4)
856
            conn->advertised_as = get_u32(opt + 2);
857
          break;
858

    
859
        case 69: /* ADD-PATH capability, draft */
860
          if (cl % 4)
861
            goto err;
862
          for (i = 0; i < cl; i += 4)
863
            if (opt[2+i+0] == 0 && opt[2+i+1] == BGP_AF && opt[2+i+2] == 1) /* Match AFI/SAFI */
864
              conn->peer_add_path = opt[2+i+3];
865
          if (conn->peer_add_path > ADD_PATH_FULL)
866
            goto err;
867
          break;
868

    
869
        case 70: /* Enhanced route refresh capability, RFC 7313 */
870
          if (cl != 0)
871
            goto err;
872
          conn->peer_enhanced_refresh_support = 1;
873
          break;
874

    
875
          /* We can safely ignore all other capabilities */
876
        }
877
      len -= 2 + cl;
878
      opt += 2 + cl;
879
    }
880
  return;
881

    
882
 err:
883
  bgp_error(conn, 2, 0, NULL, 0);
884
  return;
885
}
886

    
887
static int
888
bgp_parse_options(struct bgp_conn *conn, byte *opt, int len)
889
{
890
  struct bgp_proto *p = conn->bgp;
891
  int ol;
892

    
893
  while (len > 0)
894
    {
895
      if (len < 2 || len < 2 + opt[1])
896
        { bgp_error(conn, 2, 0, NULL, 0); return 0; }
897
#ifdef LOCAL_DEBUG
898
      {
899
        int i;
900
        DBG("\tOption %02x:", opt[0]);
901
        for(i=0; i<opt[1]; i++)
902
          DBG(" %02x", opt[2+i]);
903
        DBG("\n");
904
      }
905
#endif
906

    
907
      ol = opt[1];
908
      switch (opt[0])
909
        {
910
        case 2:
911
          if (conn->start_state == BSS_CONNECT_NOCAP)
912
            BGP_TRACE(D_PACKETS, "Ignoring received capabilities");
913
          else
914
            bgp_parse_capabilities(conn, opt + 2, ol);
915
          break;
916

    
917
        default:
918
          /*
919
           *  BGP specs don't tell us to send which option
920
           *  we didn't recognize, but it's common practice
921
           *  to do so. Also, capability negotiation with
922
           *  Cisco routers doesn't work without that.
923
           */
924
          bgp_error(conn, 2, 4, opt, ol);
925
          return 0;
926
        }
927
      len -= 2 + ol;
928
      opt += 2 + ol;
929
    }
930
  return 0;
931
}
932

    
933
static void
934
bgp_rx_open(struct bgp_conn *conn, byte *pkt, uint len)
935
{
936
  struct bgp_conn *other;
937
  struct bgp_proto *p = conn->bgp;
938
  unsigned hold;
939
  u16 base_as;
940
  u32 id;
941

    
942
  /* Check state */
943
  if (conn->state != BS_OPENSENT)
944
    { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
945

    
946
  /* Check message contents */
947
  if (len < 29 || len != 29U + pkt[28])
948
    { bgp_error(conn, 1, 2, pkt+16, 2); return; }
949
  if (pkt[19] != BGP_VERSION)
950
    { bgp_error(conn, 2, 1, pkt+19, 1); return; } /* RFC 1771 says 16 bits, draft-09 tells to use 8 */
951
  conn->advertised_as = base_as = get_u16(pkt+20);
952
  hold = get_u16(pkt+22);
953
  id = get_u32(pkt+24);
954
  BGP_TRACE(D_PACKETS, "Got OPEN(as=%d,hold=%d,id=%08x)", conn->advertised_as, hold, id);
955

    
956
  if (bgp_parse_options(conn, pkt+29, pkt[28]))
957
    return;
958

    
959
  if (hold > 0 && hold < 3)
960
    { bgp_error(conn, 2, 6, pkt+22, 2); return; }
961

    
962
  /* RFC 6286 2.2 - router ID is nonzero and AS-wide unique */
963
  if (!id || (p->is_internal && id == p->local_id))
964
    { bgp_error(conn, 2, 3, pkt+24, -4); return; }
965

    
966
  if ((conn->advertised_as != base_as) && (base_as != AS_TRANS))
967
    log(L_WARN "%s: Peer advertised inconsistent AS numbers", p->p.name);
968

    
969
  if (conn->advertised_as != p->remote_as)
970
    {
971
      if (conn->peer_as4_support)
972
        {
973
          u32 val = htonl(conn->advertised_as);
974
          bgp_error(conn, 2, 2, (byte *) &val, 4);
975
        }
976
      else
977
        bgp_error(conn, 2, 2, pkt+20, 2);
978

    
979
      return;
980
    }
981

    
982
  /* Check the other connection */
983
  other = (conn == &p->outgoing_conn) ? &p->incoming_conn : &p->outgoing_conn;
984
  switch (other->state)
985
    {
986
    case BS_CONNECT:
987
    case BS_ACTIVE:
988
      /* Stop outgoing connection attempts */
989
      bgp_conn_enter_idle_state(other);
990
      break;
991

    
992
    case BS_IDLE:
993
    case BS_OPENSENT:
994
    case BS_CLOSE:
995
      break;
996

    
997
    case BS_OPENCONFIRM:
998
      /*
999
       * Description of collision detection rules in RFC 4271 is confusing and
1000
       * contradictory, but it is essentially:
1001
       *
1002
       * 1. Router with higher ID is dominant
1003
       * 2. If both have the same ID, router with higher ASN is dominant [RFC6286]
1004
       * 3. When both connections are in OpenConfirm state, one initiated by
1005
       *    the dominant router is kept.
1006
       *
1007
       * The first line in the expression below evaluates whether the neighbor
1008
       * is dominant, the second line whether the new connection was initiated
1009
       * by the neighbor. If both are true (or both are false), we keep the new
1010
       * connection, otherwise we keep the old one.
1011
       */
1012
      if (((p->local_id < id) || ((p->local_id == id) && (p->local_as < p->remote_as)))
1013
          == (conn == &p->incoming_conn))
1014
        {
1015
          /* Should close the other connection */
1016
          BGP_TRACE(D_EVENTS, "Connection collision, giving up the other connection");
1017
          bgp_error(other, 6, 7, NULL, 0);
1018
          break;
1019
        }
1020
      /* Fall thru */
1021
    case BS_ESTABLISHED:
1022
      /* Should close this connection */
1023
      BGP_TRACE(D_EVENTS, "Connection collision, giving up this connection");
1024
      bgp_error(conn, 6, 7, NULL, 0);
1025
      return;
1026
    default:
1027
      bug("bgp_rx_open: Unknown state");
1028
    }
1029

    
1030
  /* Update our local variables */
1031
  conn->hold_time = MIN(hold, p->cf->hold_time);
1032
  conn->keepalive_time = p->cf->keepalive_time ? : conn->hold_time / 3;
1033
  p->remote_id = id;
1034
  p->as4_session = p->cf->enable_as4 && conn->peer_as4_support;
1035
  p->add_path_rx = (p->cf->add_path & ADD_PATH_RX) && (conn->peer_add_path & ADD_PATH_TX);
1036
  p->add_path_tx = (p->cf->add_path & ADD_PATH_TX) && (conn->peer_add_path & ADD_PATH_RX);
1037
  p->gr_ready = p->cf->gr_mode && conn->peer_gr_able;
1038
  p->ext_messages = p->cf->enable_extended_messages && conn->peer_ext_messages_support;
1039

    
1040
  if (p->add_path_tx)
1041
    p->p.accept_ra_types = RA_ANY;
1042

    
1043
  DBG("BGP: Hold timer set to %d, keepalive to %d, AS to %d, ID to %x, AS4 session to %d\n", conn->hold_time, conn->keepalive_time, p->remote_as, p->remote_id, p->as4_session);
1044

    
1045
  bgp_schedule_packet(conn, PKT_KEEPALIVE);
1046
  bgp_start_timer(conn->hold_timer, conn->hold_time);
1047
  bgp_conn_enter_openconfirm_state(conn);
1048
}
1049

    
1050

    
1051
static inline void
1052
bgp_rx_end_mark(struct bgp_proto *p)
1053
{
1054
  BGP_TRACE(D_PACKETS, "Got END-OF-RIB");
1055

    
1056
  if (p->load_state == BFS_LOADING)
1057
    p->load_state = BFS_NONE;
1058

    
1059
  if (p->p.gr_recovery)
1060
    proto_graceful_restart_unlock(&p->p);
1061

    
1062
  if (p->gr_active)
1063
    bgp_graceful_restart_done(p);
1064
}
1065

    
1066

    
1067
/*
 * DECODE_PREFIX - decode one prefix from an NLRI-style byte stream
 * @pp: cursor into the stream; advanced past the decoded entry
 * @ll: number of bytes left at @pp; decreased accordingly
 *
 * The macro relies on names of the enclosing function: p, path_id, prefix,
 * pxlen, err and the label done. When p->add_path_rx is set, a 32-bit path
 * ID is read in front of the prefix. On malformed input err is set
 * (1 = not enough data, 10 = prefix length above BITS_PER_IP_ADDRESS) and
 * control jumps to done.
 */
#define DECODE_PREFIX(pp, ll) do {                              \
  if (p->add_path_rx)                                           \
    {                                                           \
      /* 4-byte path ID plus the prefix length byte */          \
      if (ll < 5)                                               \
        {                                                       \
          err = 1;                                              \
          goto done;                                            \
        }                                                       \
      path_id = get_u32(pp);                                    \
      pp += 4;                                                  \
      ll -= 4;                                                  \
    }                                                           \
  int px_bits = *pp++;                                          \
  ll--;                                                         \
  if (px_bits > BITS_PER_IP_ADDRESS)                            \
    {                                                           \
      err = 10;                                                 \
      goto done;                                                \
    }                                                           \
  /* Number of bytes needed to carry px_bits bits */            \
  int px_bytes = (px_bits + 7) / 8;                             \
  if (ll < px_bytes)                                            \
    {                                                           \
      err = 1;                                                  \
      goto done;                                                \
    }                                                           \
  memcpy(&prefix, pp, px_bytes);                                \
  pp += px_bytes;                                               \
  ll -= px_bytes;                                               \
  ipa_ntoh(prefix);                                             \
  /* Masking also clears bits not covered by the copied bytes */\
  prefix = ipa_and(prefix, ipa_mkmask(px_bits));                \
  pxlen = px_bits;                                              \
} while (0)
1088

    
1089

    
1090
static inline void
1091
bgp_rte_update(struct bgp_proto *p, ip_addr prefix, int pxlen,
1092
               u32 path_id, u32 *last_id, struct rte_src **src,
1093
               rta *a0, rta **a)
1094
{
1095
  if (path_id != *last_id)
1096
    {
1097
      *src = rt_get_source(&p->p, path_id);
1098
      *last_id = path_id;
1099

    
1100
      if (*a)
1101
        {
1102
          rta_free(*a);
1103
          *a = NULL;
1104
        }
1105
    }
1106

    
1107
  /* Prepare cached route attributes */
1108
  if (!*a)
1109
    {
1110
      a0->src = *src;
1111

    
1112
      /* Workaround for rta_lookup() breaking eattrs */
1113
      ea_list *ea = a0->eattrs;
1114
      *a = rta_lookup(a0);
1115
      a0->eattrs = ea;
1116
    }
1117

    
1118
  net *n = net_get(p->p.table, prefix, pxlen);
1119
  rte *e = rte_get_temp(rta_clone(*a));
1120
  e->net = n;
1121
  e->pflags = 0;
1122
  e->u.bgp.suppressed = 0;
1123
  rte_update2(p->p.main_ahook, n, e, *src);
1124
}
1125

    
1126
static inline void
1127
bgp_rte_withdraw(struct bgp_proto *p, ip_addr prefix, int pxlen,
1128
                 u32 path_id, u32 *last_id, struct rte_src **src)
1129
{
1130
  if (path_id != *last_id)
1131
    {
1132
      *src = rt_find_source(&p->p, path_id);
1133
      *last_id = path_id;
1134
    }
1135

    
1136
  net *n = net_find(p->p.table, prefix, pxlen);
1137
  rte_update2( p->p.main_ahook, n, NULL, *src);
1138
}
1139

    
1140
static inline int
1141
bgp_set_next_hop(struct bgp_proto *p, rta *a)
1142
{
1143
  struct eattr *nh = ea_find(a->eattrs, EA_CODE(EAP_BGP, BA_NEXT_HOP));
1144
  ip_addr *nexthop = (ip_addr *) nh->u.ptr->data;
1145

    
1146
#ifdef IPV6
1147
  int second = (nh->u.ptr->length == NEXT_HOP_LENGTH) && ipa_nonzero(nexthop[1]);
1148

    
1149
  /* First address should not be link-local, but may be zero in direct mode */
1150
  if (ipa_is_link_local(*nexthop))
1151
    *nexthop = IPA_NONE;
1152
#else
1153
  int second = 0;
1154
#endif
1155

    
1156
  if (p->cf->gw_mode == GW_DIRECT)
1157
    {
1158
      neighbor *ng = NULL;
1159

    
1160
      if (ipa_nonzero(*nexthop))
1161
        ng = neigh_find(&p->p, nexthop, 0);
1162
      else if (second)        /* GW_DIRECT -> single_hop -> p->neigh != NULL */
1163
        ng = neigh_find2(&p->p, nexthop + 1, p->neigh->iface, 0);
1164

    
1165
      /* Fallback */
1166
      if (!ng)
1167
        ng = p->neigh;
1168

    
1169
      if (ng->scope == SCOPE_HOST)
1170
        return 0;
1171

    
1172
      a->dest = RTD_ROUTER;
1173
      a->gw = ng->addr;
1174
      a->iface = ng->iface;
1175
      a->hostentry = NULL;
1176
      a->igp_metric = 0;
1177
    }
1178
  else /* GW_RECURSIVE */
1179
    {
1180
      if (ipa_zero(*nexthop))
1181
          return 0;
1182

    
1183
      rta_set_recursive_next_hop(p->p.table, a, p->igp_table, nexthop, nexthop + second);
1184
    }
1185

    
1186
  return 1;
1187
}
1188

    
1189
#ifndef IPV6                /* IPv4 version */
1190

    
1191
static void
1192
bgp_do_rx_update(struct bgp_conn *conn,
1193
                 byte *withdrawn, int withdrawn_len,
1194
                 byte *nlri, int nlri_len,
1195
                 byte *attrs, int attr_len)
1196
{
1197
  struct bgp_proto *p = conn->bgp;
1198
  struct rte_src *src = p->p.main_source;
1199
  rta *a0, *a = NULL;
1200
  ip_addr prefix;
1201
  int pxlen, err = 0;
1202
  u32 path_id = 0;
1203
  u32 last_id = 0;
1204

    
1205
  /* Check for End-of-RIB marker */
1206
  if (!withdrawn_len && !attr_len && !nlri_len)
1207
    {
1208
      bgp_rx_end_mark(p);
1209
      return;
1210
    }
1211

    
1212
  /* Withdraw routes */
1213
  while (withdrawn_len)
1214
    {
1215
      DECODE_PREFIX(withdrawn, withdrawn_len);
1216
      DBG("Withdraw %I/%d\n", prefix, pxlen);
1217

    
1218
      bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
1219
    }
1220

    
1221
  if (!attr_len && !nlri_len)                /* shortcut */
1222
    return;
1223

    
1224
  a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, nlri_len);
1225

    
1226
  if (conn->state != BS_ESTABLISHED)        /* fatal error during decoding */
1227
    return;
1228

    
1229
  if (a0 && nlri_len && !bgp_set_next_hop(p, a0))
1230
    a0 = NULL;
1231

    
1232
  last_id = 0;
1233
  src = p->p.main_source;
1234

    
1235
  while (nlri_len)
1236
    {
1237
      DECODE_PREFIX(nlri, nlri_len);
1238
      DBG("Add %I/%d\n", prefix, pxlen);
1239

    
1240
      if (a0)
1241
        bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a);
1242
      else /* Forced withdraw as a result of soft error */
1243
        bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
1244
    }
1245

    
1246
 done:
1247
  if (a)
1248
    rta_free(a);
1249

    
1250
  if (err)
1251
    bgp_error(conn, 3, err, NULL, 0);
1252

    
1253
  return;
1254
}
1255

    
1256
#else                        /* IPv6 version */
1257

    
1258
/*
 * DO_NLRI - set up parsing of p->name_start / p->name_len
 *
 * Loads x, len and len0 from the given attribute, reads the 16-bit AF into
 * af and skips three bytes (0 is stored in af when the attribute is empty).
 * On a too short attribute, err is set to 9 and control jumps to done.
 *
 * The expansion ends with an unterminated `if (af == BGP_AF_IPV6)', so the
 * statement following the macro invocation becomes the body of that if.
 * Uses x, len, len0, af, p, err and the label done of the enclosing function.
 */
#define DO_NLRI(name)                                           \
  len0 = p->name##_len;                                         \
  len = len0;                                                   \
  x = p->name##_start;                                          \
  if (!len)                                                     \
    af = 0;                                                     \
  else                                                          \
    {                                                           \
      if (len < 3)                                              \
        {                                                       \
          err = 9;                                              \
          goto done;                                            \
        }                                                       \
      af = get_u16(x);                                          \
      x += 3;                                                   \
      len -= 3;                                                 \
      DBG("\tNLRI AF=%d sub=%d len=%d\n", af, x[-1], len);      \
    }                                                           \
  if (af == BGP_AF_IPV6)
1272

    
1273
static void
1274
bgp_attach_next_hop(rta *a0, byte *x)
1275
{
1276
  ip_addr *nh = (ip_addr *) bgp_attach_attr_wa(&a0->eattrs, bgp_linpool, BA_NEXT_HOP, NEXT_HOP_LENGTH);
1277
  memcpy(nh, x+1, 16);
1278
  ipa_ntoh(nh[0]);
1279

    
1280
  /* We store received link local address in the other part of BA_NEXT_HOP eattr. */
1281
  if (*x == 32)
1282
    {
1283
      memcpy(nh+1, x+17, 16);
1284
      ipa_ntoh(nh[1]);
1285
    }
1286
  else
1287
    nh[1] = IPA_NONE;
1288
}
1289

    
1290

    
1291
static void
1292
bgp_do_rx_update(struct bgp_conn *conn,
1293
                 byte *withdrawn UNUSED, int withdrawn_len,
1294
                 byte *nlri UNUSED, int nlri_len,
1295
                 byte *attrs, int attr_len)
1296
{
1297
  struct bgp_proto *p = conn->bgp;
1298
  struct rte_src *src = p->p.main_source;
1299
  byte *x;
1300
  int len, len0;
1301
  unsigned af;
1302
  rta *a0, *a = NULL;
1303
  ip_addr prefix;
1304
  int pxlen, err = 0;
1305
  u32 path_id = 0;
1306
  u32 last_id = 0;
1307

    
1308
  p->mp_reach_len = 0;
1309
  p->mp_unreach_len = 0;
1310
  a0 = bgp_decode_attrs(conn, attrs, attr_len, bgp_linpool, 0);
1311

    
1312
  if (conn->state != BS_ESTABLISHED)        /* fatal error during decoding */
1313
    return;
1314

    
1315
  /* Check for End-of-RIB marker */
1316
  if ((attr_len < 8) && !withdrawn_len && !nlri_len && !p->mp_reach_len &&
1317
      (p->mp_unreach_len == 3) && (get_u16(p->mp_unreach_start) == BGP_AF_IPV6))
1318
    {
1319
      bgp_rx_end_mark(p);
1320
      return;
1321
    }
1322

    
1323
  DO_NLRI(mp_unreach)
1324
    {
1325
      while (len)
1326
        {
1327
          DECODE_PREFIX(x, len);
1328
          DBG("Withdraw %I/%d\n", prefix, pxlen);
1329
          bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
1330
        }
1331
    }
1332

    
1333
  DO_NLRI(mp_reach)
1334
    {
1335
      /* Create fake NEXT_HOP attribute */
1336
      if (len < 1 || (*x != 16 && *x != 32) || len < *x + 2)
1337
        { err = 9; goto done; }
1338

    
1339
      if (a0)
1340
        bgp_attach_next_hop(a0, x);
1341

    
1342
      /* Also ignore one reserved byte */
1343
      len -= *x + 2;
1344
      x += *x + 2;
1345

    
1346
      if (a0 && ! bgp_set_next_hop(p, a0))
1347
        a0 = NULL;
1348

    
1349
      last_id = 0;
1350
      src = p->p.main_source;
1351

    
1352
      while (len)
1353
        {
1354
          DECODE_PREFIX(x, len);
1355
          DBG("Add %I/%d\n", prefix, pxlen);
1356

    
1357
          if (a0)
1358
            bgp_rte_update(p, prefix, pxlen, path_id, &last_id, &src, a0, &a);
1359
          else /* Forced withdraw as a result of soft error */
1360
            bgp_rte_withdraw(p, prefix, pxlen, path_id, &last_id, &src);
1361
        }
1362
    }
1363

    
1364
 done:
1365
  if (a)
1366
    rta_free(a);
1367

    
1368
  if (err) /* Use subcode 9, not err */
1369
    bgp_error(conn, 3, 9, NULL, 0);
1370

    
1371
  return;
1372
}
1373

    
1374
#endif
1375

    
1376
static void
1377
bgp_rx_update(struct bgp_conn *conn, byte *pkt, uint len)
1378
{
1379
  struct bgp_proto *p = conn->bgp;
1380
  byte *withdrawn, *attrs, *nlri;
1381
  uint withdrawn_len, attr_len, nlri_len;
1382

    
1383
  BGP_TRACE_RL(&rl_rcv_update, D_PACKETS, "Got UPDATE");
1384

    
1385
  /* Workaround for some BGP implementations that skip initial KEEPALIVE */
1386
  if (conn->state == BS_OPENCONFIRM)
1387
    bgp_conn_enter_established_state(conn);
1388

    
1389
  if (conn->state != BS_ESTABLISHED)
1390
    { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
1391
  bgp_start_timer(conn->hold_timer, conn->hold_time);
1392

    
1393
  /* Find parts of the packet and check sizes */
1394
  if (len < 23)
1395
    {
1396
      bgp_error(conn, 1, 2, pkt+16, 2);
1397
      return;
1398
    }
1399
  withdrawn = pkt + 21;
1400
  withdrawn_len = get_u16(pkt + 19);
1401
  if (withdrawn_len + 23 > len)
1402
    goto malformed;
1403
  attrs = withdrawn + withdrawn_len + 2;
1404
  attr_len = get_u16(attrs - 2);
1405
  if (withdrawn_len + attr_len + 23 > len)
1406
    goto malformed;
1407
  nlri = attrs + attr_len;
1408
  nlri_len = len - withdrawn_len - attr_len - 23;
1409
  if (!attr_len && nlri_len)
1410
    goto malformed;
1411
  DBG("Sizes: withdrawn=%d, attrs=%d, NLRI=%d\n", withdrawn_len, attr_len, nlri_len);
1412

    
1413
  lp_flush(bgp_linpool);
1414

    
1415
  bgp_do_rx_update(conn, withdrawn, withdrawn_len, nlri, nlri_len, attrs, attr_len);
1416
  return;
1417

    
1418
malformed:
1419
  bgp_error(conn, 3, 1, NULL, 0);
1420
}
1421

    
1422
static struct {
1423
  byte major, minor;
1424
  byte *msg;
1425
} bgp_msg_table[] = {
1426
  { 1, 0, "Invalid message header" },
1427
  { 1, 1, "Connection not synchronized" },
1428
  { 1, 2, "Bad message length" },
1429
  { 1, 3, "Bad message type" },
1430
  { 2, 0, "Invalid OPEN message" },
1431
  { 2, 1, "Unsupported version number" },
1432
  { 2, 2, "Bad peer AS" },
1433
  { 2, 3, "Bad BGP identifier" },
1434
  { 2, 4, "Unsupported optional parameter" },
1435
  { 2, 5, "Authentication failure" },
1436
  { 2, 6, "Unacceptable hold time" },
1437
  { 2, 7, "Required capability missing" }, /* [RFC5492] */
1438
  { 2, 8, "No supported AFI/SAFI" }, /* This error msg is nonstandard */
1439
  { 3, 0, "Invalid UPDATE message" },
1440
  { 3, 1, "Malformed attribute list" },
1441
  { 3, 2, "Unrecognized well-known attribute" },
1442
  { 3, 3, "Missing mandatory attribute" },
1443
  { 3, 4, "Invalid attribute flags" },
1444
  { 3, 5, "Invalid attribute length" },
1445
  { 3, 6, "Invalid ORIGIN attribute" },
1446
  { 3, 7, "AS routing loop" },                /* Deprecated */
1447
  { 3, 8, "Invalid NEXT_HOP attribute" },
1448
  { 3, 9, "Optional attribute error" },
1449
  { 3, 10, "Invalid network field" },
1450
  { 3, 11, "Malformed AS_PATH" },
1451
  { 4, 0, "Hold timer expired" },
1452
  { 5, 0, "Finite state machine error" }, /* Subcodes are according to [RFC6608] */
1453
  { 5, 1, "Unexpected message in OpenSent state" },
1454
  { 5, 2, "Unexpected message in OpenConfirm state" },
1455
  { 5, 3, "Unexpected message in Established state" },
1456
  { 6, 0, "Cease" }, /* Subcodes are according to [RFC4486] */
1457
  { 6, 1, "Maximum number of prefixes reached" },
1458
  { 6, 2, "Administrative shutdown" },
1459
  { 6, 3, "Peer de-configured" },
1460
  { 6, 4, "Administrative reset" },
1461
  { 6, 5, "Connection rejected" },
1462
  { 6, 6, "Other configuration change" },
1463
  { 6, 7, "Connection collision resolution" },
1464
  { 6, 8, "Out of Resources" },
1465
  { 7, 0, "Invalid ROUTE-REFRESH message" }, /* [RFC7313] */
1466
  { 7, 1, "Invalid ROUTE-REFRESH message length" } /* [RFC7313] */
1467
};
1468

    
1469
/**
1470
 * bgp_error_dsc - return BGP error description
1471
 * @code: BGP error code
1472
 * @subcode: BGP error subcode
1473
 *
1474
 * bgp_error_dsc() returns error description for BGP errors
1475
 * which might be static string or given temporary buffer.
1476
 */
1477
const char *
1478
bgp_error_dsc(unsigned code, unsigned subcode)
1479
{
1480
  static char buff[32];
1481
  unsigned i;
1482
  for (i=0; i < ARRAY_SIZE(bgp_msg_table); i++)
1483
    if (bgp_msg_table[i].major == code && bgp_msg_table[i].minor == subcode)
1484
      {
1485
        return bgp_msg_table[i].msg;
1486
      }
1487

    
1488
  bsprintf(buff, "Unknown error %d.%d", code, subcode);
1489
  return buff;
1490
}
1491

    
1492
void
1493
bgp_log_error(struct bgp_proto *p, u8 class, char *msg, unsigned code, unsigned subcode, byte *data, unsigned len)
1494
{
1495
  const byte *name;
1496
  byte *t, argbuf[36];
1497
  unsigned i;
1498

    
1499
  /* Don't report Cease messages generated by myself */
1500
  if (code == 6 && class == BE_BGP_TX)
1501
    return;
1502

    
1503
  name = bgp_error_dsc(code, subcode);
1504
  t = argbuf;
1505
  if (len)
1506
    {
1507
      *t++ = ':';
1508
      *t++ = ' ';
1509

    
1510
      if ((code == 2) && (subcode == 2) && ((len == 2) || (len == 4)))
1511
        {
1512
          /* Bad peer AS - we would like to print the AS */
1513
          t += bsprintf(t, "%d", (len == 2) ? get_u16(data) : get_u32(data));
1514
          goto done;
1515
        }
1516
      if (len > 16)
1517
        len = 16;
1518
      for (i=0; i<len; i++)
1519
        t += bsprintf(t, "%02x", data[i]);
1520
    }
1521
 done:
1522
  *t = 0;
1523
  log(L_REMOTE "%s: %s: %s%s", p->p.name, msg, name, argbuf);
1524
}
1525

    
1526
static void
1527
bgp_rx_notification(struct bgp_conn *conn, byte *pkt, uint len)
1528
{
1529
  struct bgp_proto *p = conn->bgp;
1530
  if (len < 21)
1531
    {
1532
      bgp_error(conn, 1, 2, pkt+16, 2);
1533
      return;
1534
    }
1535

    
1536
  unsigned code = pkt[19];
1537
  unsigned subcode = pkt[20];
1538
  int err = (code != 6);
1539

    
1540
  bgp_log_error(p, BE_BGP_RX, "Received", code, subcode, pkt+21, len-21);
1541
  bgp_store_error(p, conn, BE_BGP_RX, (code << 16) | subcode);
1542

    
1543
#ifndef IPV6
1544
  if ((code == 2) && ((subcode == 4) || (subcode == 7))
1545
      /* Error related to capability:
1546
       * 4 - Peer does not support capabilities at all.
1547
       * 7 - Peer request some capability. Strange unless it is IPv6 only peer.
1548
       */
1549
      && (p->cf->capabilities == 2)
1550
      /* Capabilities are not explicitly enabled or disabled, therefore heuristic is used */
1551
      && (conn->start_state == BSS_CONNECT)
1552
      /* Failed connection attempt have used capabilities */
1553
      && (p->cf->remote_as <= 0xFFFF))
1554
      /* Not possible with disabled capabilities */
1555
    {
1556
      /* We try connect without capabilities */
1557
      log(L_WARN "%s: Capability related error received, retry with capabilities disabled", p->p.name);
1558
      p->start_state = BSS_CONNECT_NOCAP;
1559
      err = 0;
1560
    }
1561
#endif
1562

    
1563
  bgp_conn_enter_close_state(conn);
1564
  bgp_schedule_packet(conn, PKT_SCHEDULE_CLOSE);
1565

    
1566
  if (err) 
1567
    {
1568
      bgp_update_startup_delay(p);
1569
      bgp_stop(p, 0);
1570
    }
1571
}
1572

    
1573
static void
1574
bgp_rx_keepalive(struct bgp_conn *conn)
1575
{
1576
  struct bgp_proto *p = conn->bgp;
1577

    
1578
  BGP_TRACE(D_PACKETS, "Got KEEPALIVE");
1579
  bgp_start_timer(conn->hold_timer, conn->hold_time);
1580
  switch (conn->state)
1581
    {
1582
    case BS_OPENCONFIRM:
1583
      bgp_conn_enter_established_state(conn);
1584
      break;
1585
    case BS_ESTABLISHED:
1586
      break;
1587
    default:
1588
      bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0);
1589
    }
1590
}
1591

    
1592
static void
1593
bgp_rx_route_refresh(struct bgp_conn *conn, byte *pkt, uint len)
1594
{
1595
  struct bgp_proto *p = conn->bgp;
1596

    
1597
  if (conn->state != BS_ESTABLISHED)
1598
    { bgp_error(conn, 5, fsm_err_subcode[conn->state], NULL, 0); return; }
1599

    
1600
  if (!p->cf->enable_refresh)
1601
    { bgp_error(conn, 1, 3, pkt+18, 1); return; }
1602

    
1603
  if (len < (BGP_HEADER_LENGTH + 4))
1604
    { bgp_error(conn, 1, 2, pkt+16, 2); return; }
1605

    
1606
  if (len > (BGP_HEADER_LENGTH + 4))
1607
    { bgp_error(conn, 7, 1, pkt, MIN(len, 2048)); return; }
1608

    
1609
  /* FIXME - we ignore AFI/SAFI values, as we support
1610
     just one value and even an error code for an invalid
1611
     request is not defined */
1612

    
1613
  /* RFC 7313 redefined reserved field as RR message subtype */
1614
  uint subtype = conn->peer_enhanced_refresh_support ? pkt[21] : BGP_RR_REQUEST;
1615

    
1616
  switch (subtype)
1617
  {
1618
  case BGP_RR_REQUEST:
1619
    BGP_TRACE(D_PACKETS, "Got ROUTE-REFRESH");
1620
    proto_request_feeding(&p->p);
1621
    break;
1622

    
1623
  case BGP_RR_BEGIN:
1624
    BGP_TRACE(D_PACKETS, "Got BEGIN-OF-RR");
1625
    bgp_refresh_begin(p);
1626
    break;
1627

    
1628
  case BGP_RR_END:
1629
    BGP_TRACE(D_PACKETS, "Got END-OF-RR");
1630
    bgp_refresh_end(p);
1631
    break;
1632

    
1633
  default:
1634
    log(L_WARN "%s: Got ROUTE-REFRESH message with unknown subtype %u, ignoring",
1635
        p->p.name, subtype);
1636
    break;
1637
  }
1638
}
1639

    
1640

    
1641
/**
1642
 * bgp_rx_packet - handle a received packet
1643
 * @conn: BGP connection
1644
 * @pkt: start of the packet
1645
 * @len: packet size
1646
 *
1647
 * bgp_rx_packet() takes a newly received packet and calls the corresponding
1648
 * packet handler according to the packet type.
1649
 */
1650
static void
1651
bgp_rx_packet(struct bgp_conn *conn, byte *pkt, unsigned len)
1652
{
1653
  byte type = pkt[18];
1654

    
1655
  DBG("BGP: Got packet %02x (%d bytes)\n", type, len);
1656

    
1657
  if (conn->bgp->p.mrtdump & MD_MESSAGES)
1658
    mrt_dump_bgp_packet(conn, pkt, len);
1659

    
1660
  switch (type)
1661
    {
1662
    case PKT_OPEN:                return bgp_rx_open(conn, pkt, len);
1663
    case PKT_UPDATE:                return bgp_rx_update(conn, pkt, len);
1664
    case PKT_NOTIFICATION:      return bgp_rx_notification(conn, pkt, len);
1665
    case PKT_KEEPALIVE:                return bgp_rx_keepalive(conn);
1666
    case PKT_ROUTE_REFRESH:        return bgp_rx_route_refresh(conn, pkt, len);
1667
    default:                        bgp_error(conn, 1, 3, pkt+18, 1);
1668
    }
1669
}
1670

    
1671
/**
1672
 * bgp_rx - handle received data
1673
 * @sk: socket
1674
 * @size: amount of data received
1675
 *
1676
 * bgp_rx() is called by the socket layer whenever new data arrive from
1677
 * the underlying TCP connection. It assembles the data fragments to packets,
1678
 * checks their headers and framing and passes complete packets to
1679
 * bgp_rx_packet().
1680
 */
1681
int
1682
bgp_rx(sock *sk, uint size)
1683
{
1684
  struct bgp_conn *conn = sk->data;
1685
  struct bgp_proto *p = conn->bgp;
1686
  byte *pkt_start = sk->rbuf;
1687
  byte *end = pkt_start + size;
1688
  unsigned i, len;
1689

    
1690
  DBG("BGP: RX hook: Got %d bytes\n", size);
1691
  while (end >= pkt_start + BGP_HEADER_LENGTH)
1692
    {
1693
      if ((conn->state == BS_CLOSE) || (conn->sk != sk))
1694
        return 0;
1695
      for(i=0; i<16; i++)
1696
        if (pkt_start[i] != 0xff)
1697
          {
1698
            bgp_error(conn, 1, 1, NULL, 0);
1699
            break;
1700
          }
1701
      len = get_u16(pkt_start+16);
1702
      if (len < BGP_HEADER_LENGTH || len > bgp_max_packet_length(p))
1703
        {
1704
          bgp_error(conn, 1, 2, pkt_start+16, 2);
1705
          break;
1706
        }
1707
      if (end < pkt_start + len)
1708
        break;
1709
      bgp_rx_packet(conn, pkt_start, len);
1710
      pkt_start += len;
1711
    }
1712
  if (pkt_start != sk->rbuf)
1713
    {
1714
      memmove(sk->rbuf, pkt_start, end - pkt_start);
1715
      sk->rpos = sk->rbuf + (end - pkt_start);
1716
    }
1717
  return 0;
1718
}