Statistics
| Branch: | Revision:

iof-bird-daemon / sysdep / linux / netlink.c @ c99050cc

History | View | Annotate | Download (49.5 KB)

1
/*
2
 *        BIRD -- Linux Netlink Interface
3
 *
4
 *        (c) 1999--2000 Martin Mares <mj@ucw.cz>
5
 *
6
 *        Can be freely distributed and used under the terms of the GNU GPL.
7
 */
8

    
9
#include <alloca.h>
10
#include <stdio.h>
11
#include <unistd.h>
12
#include <fcntl.h>
13
#include <sys/socket.h>
14
#include <sys/uio.h>
15
#include <errno.h>
16

    
17
#undef LOCAL_DEBUG
18

    
19
#include "nest/bird.h"
20
#include "nest/route.h"
21
#include "nest/protocol.h"
22
#include "nest/iface.h"
23
#include "lib/alloca.h"
24
#include "sysdep/unix/unix.h"
25
#include "sysdep/unix/krt.h"
26
#include "lib/socket.h"
27
#include "lib/string.h"
28
#include "lib/hash.h"
29
#include "conf/conf.h"
30

    
31
#include <asm/types.h>
32
#include <linux/if.h>
33
#include <linux/netlink.h>
34
#include <linux/rtnetlink.h>
35

    
36
#ifdef HAVE_MPLS_KERNEL
37
#include <linux/lwtunnel.h>
38
#endif
39

    
40
#ifndef MSG_TRUNC                        /* Hack: Several versions of glibc miss this one :( */
41
#define MSG_TRUNC 0x20
42
#endif
43

    
44
#ifndef IFA_FLAGS
45
#define IFA_FLAGS 8
46
#endif
47

    
48
#ifndef IFF_LOWER_UP
49
#define IFF_LOWER_UP 0x10000
50
#endif
51

    
52
#ifndef RTA_TABLE
53
#define RTA_TABLE  15
54
#endif
55

    
56
#ifndef RTA_VIA
57
#define RTA_VIA         18
58
#endif
59

    
60
#ifndef RTA_NEWDST
61
#define RTA_NEWDST  19
62
#endif
63

    
64
#ifndef RTA_ENCAP_TYPE
65
#define RTA_ENCAP_TYPE        21
66
#endif
67

    
68
#ifndef RTA_ENCAP
69
#define RTA_ENCAP  22
70
#endif
71

    
72
#define krt_ecmp6(p) ((p)->af == AF_INET6)
73

    
74
const int rt_default_ecmp = 16;
75

    
76
/*
77
 * Structure nl_parse_state keeps state of received route processing. Ideally,
78
 * we could just independently parse received Netlink messages and immediately
79
 * propagate received routes to the rest of BIRD, but older Linux kernel (before
80
 * version 4.11) represents and announces IPv6 ECMP routes not as one route with
81
 * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of
82
 * routes with the same prefix. More recent kernels work as with IPv4.
83
 *
84
 * Therefore, BIRD keeps currently processed route in nl_parse_state structure
85
 * and postpones its propagation until we expect it to be final; i.e., when
86
 * non-matching route is received or when the scan ends. When another matching
87
 * route is received, it is merged with the already processed route to form an
88
 * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the
89
 * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6
90
 * routes with RTA_MULTIPATH set are just considered non-matching.
91
 *
92
 * This is ignored for asynchronous notifications (every notification is handled
93
 * as a separate route). It is not an issue for our routes, as we ignore such
94
 * notifications anyways. But importing alien IPv6 ECMP routes does not work
95
 * properly with older kernels.
96
 *
97
 * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes
98
 * for the same prefix.
99
 */
100

    
101
struct nl_parse_state
102
{
103
  struct linpool *pool;
104
  int scan;
105
  int merge;
106

    
107
  net *net;
108
  rta *attrs;
109
  struct krt_proto *proto;
110
  s8 new;
111
  s8 krt_src;
112
  u8 krt_type;
113
  u8 krt_proto;
114
  u32 krt_metric;
115
};
116

    
117
/*
118
 *        Synchronous Netlink interface
119
 */
120

    
121
struct nl_sock
122
{
123
  int fd;
124
  u32 seq;
125
  byte *rx_buffer;                        /* Receive buffer */
126
  struct nlmsghdr *last_hdr;                /* Recently received packet */
127
  uint last_size;
128
};
129

    
130
#define NL_RX_SIZE 8192
131

    
132
#define NL_OP_DELETE        0
133
#define NL_OP_ADD        (NLM_F_CREATE|NLM_F_EXCL)
134
#define NL_OP_REPLACE        (NLM_F_CREATE|NLM_F_REPLACE)
135
#define NL_OP_APPEND        (NLM_F_CREATE|NLM_F_APPEND)
136

    
137
static linpool *nl_linpool;
138

    
139
static struct nl_sock nl_scan = {.fd = -1};        /* Netlink socket for synchronous scan */
140
static struct nl_sock nl_req  = {.fd = -1};        /* Netlink socket for requests */
141

    
142
static void
143
nl_open_sock(struct nl_sock *nl)
144
{
145
  if (nl->fd < 0)
146
    {
147
      nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
148
      if (nl->fd < 0)
149
        die("Unable to open rtnetlink socket: %m");
150
      nl->seq = (u32) (current_time() TO_S); /* Or perhaps random_u32() ? */
151
      nl->rx_buffer = xmalloc(NL_RX_SIZE);
152
      nl->last_hdr = NULL;
153
      nl->last_size = 0;
154
    }
155
}
156

    
157
static void
158
nl_open(void)
159
{
160
  nl_open_sock(&nl_scan);
161
  nl_open_sock(&nl_req);
162
}
163

    
164
static void
165
nl_send(struct nl_sock *nl, struct nlmsghdr *nh)
166
{
167
  struct sockaddr_nl sa;
168

    
169
  memset(&sa, 0, sizeof(sa));
170
  sa.nl_family = AF_NETLINK;
171
  nh->nlmsg_pid = 0;
172
  nh->nlmsg_seq = ++(nl->seq);
173
  if (sendto(nl->fd, nh, nh->nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa)) < 0)
174
    die("rtnetlink sendto: %m");
175
  nl->last_hdr = NULL;
176
}
177

    
178
static void
179
nl_request_dump(int af, int cmd)
180
{
181
  struct {
182
    struct nlmsghdr nh;
183
    struct rtgenmsg g;
184
  } req = {
185
    .nh.nlmsg_type = cmd,
186
    .nh.nlmsg_len = sizeof(req),
187
    .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
188
    .g.rtgen_family = af
189
  };
190
  nl_send(&nl_scan, &req.nh);
191
}
192

    
193
static struct nlmsghdr *
194
nl_get_reply(struct nl_sock *nl)
195
{
196
  for(;;)
197
    {
198
      if (!nl->last_hdr)
199
        {
200
          struct iovec iov = { nl->rx_buffer, NL_RX_SIZE };
201
          struct sockaddr_nl sa;
202
          struct msghdr m = {
203
            .msg_name = &sa,
204
            .msg_namelen = sizeof(sa),
205
            .msg_iov = &iov,
206
            .msg_iovlen = 1,
207
          };
208
          int x = recvmsg(nl->fd, &m, 0);
209
          if (x < 0)
210
            die("nl_get_reply: %m");
211
          if (sa.nl_pid)                /* It isn't from the kernel */
212
            {
213
              DBG("Non-kernel packet\n");
214
              continue;
215
            }
216
          nl->last_size = x;
217
          nl->last_hdr = (void *) nl->rx_buffer;
218
          if (m.msg_flags & MSG_TRUNC)
219
            bug("nl_get_reply: got truncated reply which should be impossible");
220
        }
221
      if (NLMSG_OK(nl->last_hdr, nl->last_size))
222
        {
223
          struct nlmsghdr *h = nl->last_hdr;
224
          nl->last_hdr = NLMSG_NEXT(h, nl->last_size);
225
          if (h->nlmsg_seq != nl->seq)
226
            {
227
              log(L_WARN "nl_get_reply: Ignoring out of sequence netlink packet (%x != %x)",
228
                  h->nlmsg_seq, nl->seq);
229
              continue;
230
            }
231
          return h;
232
        }
233
      if (nl->last_size)
234
        log(L_WARN "nl_get_reply: Found packet remnant of size %d", nl->last_size);
235
      nl->last_hdr = NULL;
236
    }
237
}
238

    
239
static struct tbf rl_netlink_err = TBF_DEFAULT_LOG_LIMITS;
240

    
241
static int
242
nl_error(struct nlmsghdr *h, int ignore_esrch)
243
{
244
  struct nlmsgerr *e;
245
  int ec;
246

    
247
  if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr)))
248
    {
249
      log(L_WARN "Netlink: Truncated error message received");
250
      return ENOBUFS;
251
    }
252
  e = (struct nlmsgerr *) NLMSG_DATA(h);
253
  ec = -e->error;
254
  if (ec && !(ignore_esrch && (ec == ESRCH)))
255
    log_rl(&rl_netlink_err, L_WARN "Netlink: %s", strerror(ec));
256
  return ec;
257
}
258

    
259
static struct nlmsghdr *
260
nl_get_scan(void)
261
{
262
  struct nlmsghdr *h = nl_get_reply(&nl_scan);
263

    
264
  if (h->nlmsg_type == NLMSG_DONE)
265
    return NULL;
266
  if (h->nlmsg_type == NLMSG_ERROR)
267
    {
268
      nl_error(h, 0);
269
      return NULL;
270
    }
271
  return h;
272
}
273

    
274
static int
275
nl_exchange(struct nlmsghdr *pkt, int ignore_esrch)
276
{
277
  struct nlmsghdr *h;
278

    
279
  nl_send(&nl_req, pkt);
280
  for(;;)
281
    {
282
      h = nl_get_reply(&nl_req);
283
      if (h->nlmsg_type == NLMSG_ERROR)
284
        break;
285
      log(L_WARN "nl_exchange: Unexpected reply received");
286
    }
287
  return nl_error(h, ignore_esrch) ? -1 : 0;
288
}
289

    
290
/*
291
 *        Netlink attributes
292
 */
293

    
294
static int nl_attr_len;
295

    
296
static void *
297
nl_checkin(struct nlmsghdr *h, int lsize)
298
{
299
  nl_attr_len = h->nlmsg_len - NLMSG_LENGTH(lsize);
300
  if (nl_attr_len < 0)
301
    {
302
      log(L_ERR "nl_checkin: underrun by %d bytes", -nl_attr_len);
303
      return NULL;
304
    }
305
  return NLMSG_DATA(h);
306
}
307

    
308
struct nl_want_attrs {
309
  u8 defined:1;
310
  u8 checksize:1;
311
  u8 size;
312
};
313

    
314

    
315
#define BIRD_IFLA_MAX (IFLA_WIRELESS+1)
316

    
317
static struct nl_want_attrs ifla_attr_want[BIRD_IFLA_MAX] = {
318
  [IFLA_IFNAME]          = { 1, 0, 0 },
319
  [IFLA_MTU]          = { 1, 1, sizeof(u32) },
320
  [IFLA_MASTER]          = { 1, 1, sizeof(u32) },
321
  [IFLA_WIRELESS] = { 1, 0, 0 },
322
};
323

    
324

    
325
#define BIRD_IFA_MAX  (IFA_FLAGS+1)
326

    
327
static struct nl_want_attrs ifa_attr_want4[BIRD_IFA_MAX] = {
328
  [IFA_ADDRESS]          = { 1, 1, sizeof(ip4_addr) },
329
  [IFA_LOCAL]          = { 1, 1, sizeof(ip4_addr) },
330
  [IFA_BROADCAST] = { 1, 1, sizeof(ip4_addr) },
331
  [IFA_FLAGS]     = { 1, 1, sizeof(u32) },
332
};
333

    
334
static struct nl_want_attrs ifa_attr_want6[BIRD_IFA_MAX] = {
335
  [IFA_ADDRESS]          = { 1, 1, sizeof(ip6_addr) },
336
  [IFA_LOCAL]          = { 1, 1, sizeof(ip6_addr) },
337
  [IFA_FLAGS]          = { 1, 1, sizeof(u32) },
338
};
339

    
340

    
341
#define BIRD_RTA_MAX  (RTA_ENCAP+1)
342

    
343
static struct nl_want_attrs nexthop_attr_want4[BIRD_RTA_MAX] = {
344
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip4_addr) },
345
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
346
  [RTA_ENCAP]          = { 1, 0, 0 },
347
};
348

    
349
static struct nl_want_attrs nexthop_attr_want6[BIRD_RTA_MAX] = {
350
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip6_addr) },
351
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
352
  [RTA_ENCAP]          = { 1, 0, 0 },
353
};
354

    
355
#ifdef HAVE_MPLS_KERNEL
356
/* Attributes accepted inside an MPLS RTA_ENCAP nest (variable-length label stack) */
static struct nl_want_attrs encap_mpls_want[BIRD_RTA_MAX] = {
  [RTA_DST] = { .defined = 1, .checksize = 0, .size = 0 },
};
359
#endif
360

    
361
static struct nl_want_attrs rtm_attr_want4[BIRD_RTA_MAX] = {
362
  [RTA_DST]          = { 1, 1, sizeof(ip4_addr) },
363
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
364
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip4_addr) },
365
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
366
  [RTA_PREFSRC]          = { 1, 1, sizeof(ip4_addr) },
367
  [RTA_METRICS]          = { 1, 0, 0 },
368
  [RTA_MULTIPATH] = { 1, 0, 0 },
369
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
370
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
371
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
372
  [RTA_ENCAP]          = { 1, 0, 0 },
373
};
374

    
375
static struct nl_want_attrs rtm_attr_want6[BIRD_RTA_MAX] = {
376
  [RTA_DST]          = { 1, 1, sizeof(ip6_addr) },
377
  [RTA_IIF]          = { 1, 1, sizeof(u32) },
378
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
379
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip6_addr) },
380
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
381
  [RTA_PREFSRC]          = { 1, 1, sizeof(ip6_addr) },
382
  [RTA_METRICS]          = { 1, 0, 0 },
383
  [RTA_MULTIPATH] = { 1, 0, 0 },
384
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
385
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
386
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
387
  [RTA_ENCAP]          = { 1, 0, 0 },
388
};
389

    
390
#ifdef HAVE_MPLS_KERNEL
391
static struct nl_want_attrs rtm_attr_want_mpls[BIRD_RTA_MAX] = {
392
  [RTA_DST]          = { 1, 1, sizeof(u32) },
393
  [RTA_IIF]          = { 1, 1, sizeof(u32) },
394
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
395
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
396
  [RTA_METRICS]          = { 1, 0, 0 },
397
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
398
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
399
  [RTA_VIA]          = { 1, 0, 0 },
400
  [RTA_NEWDST]          = { 1, 0, 0 },
401
};
402
#endif
403

    
404

    
405
static int
406
nl_parse_attrs(struct rtattr *a, struct nl_want_attrs *want, struct rtattr **k, int ksize)
407
{
408
  int max = ksize / sizeof(struct rtattr *);
409
  bzero(k, ksize);
410

    
411
  for ( ; RTA_OK(a, nl_attr_len); a = RTA_NEXT(a, nl_attr_len))
412
    {
413
      if ((a->rta_type >= max) || !want[a->rta_type].defined)
414
        continue;
415

    
416
      if (want[a->rta_type].checksize && (RTA_PAYLOAD(a) != want[a->rta_type].size))
417
        {
418
          log(L_ERR "nl_parse_attrs: Malformed attribute received");
419
          return 0;
420
        }
421

    
422
      k[a->rta_type] = a;
423
    }
424

    
425
  if (nl_attr_len)
426
    {
427
      log(L_ERR "nl_parse_attrs: remnant of size %d", nl_attr_len);
428
      return 0;
429
    }
430

    
431
  return 1;
432
}
433

    
434
/* Read a 16-bit attribute payload as-is (no byte order conversion) */
static inline u16 rta_get_u16(struct rtattr *a)
{
  u16 *val = RTA_DATA(a);
  return *val;
}
436

    
437
/* Read a 32-bit attribute payload as-is (no byte order conversion) */
static inline u32 rta_get_u32(struct rtattr *a)
{
  u32 *val = RTA_DATA(a);
  return *val;
}
439

    
440
/* Read an IPv4 address payload and convert it with ip4_ntoh() */
static inline ip4_addr rta_get_ip4(struct rtattr *a)
{
  ip4_addr *raw = RTA_DATA(a);
  return ip4_ntoh(*raw);
}
442

    
443
/* Read an IPv6 address payload and convert it with ip6_ntoh() */
static inline ip6_addr rta_get_ip6(struct rtattr *a)
{
  ip6_addr *raw = RTA_DATA(a);
  return ip6_ntoh(*raw);
}
445

    
446
static inline ip_addr rta_get_ipa(struct rtattr *a)
447
{
448
  if (RTA_PAYLOAD(a) == sizeof(ip4_addr))
449
    return ipa_from_ip4(rta_get_ip4(a));
450
  else
451
    return ipa_from_ip6(rta_get_ip6(a));
452
}
453

    
454
#ifdef HAVE_MPLS_KERNEL
455
static inline ip_addr rta_get_via(struct rtattr *a)
456
{
457
  struct rtvia *v = RTA_DATA(a);
458
  switch(v->rtvia_family) {
459
    case AF_INET:  return ipa_from_ip4(ip4_ntoh(*(ip4_addr *) v->rtvia_addr));
460
    case AF_INET6: return ipa_from_ip6(ip6_ntoh(*(ip6_addr *) v->rtvia_addr));
461
  }
462
  return IPA_NONE;
463
}
464

    
465
static u32 rta_mpls_stack[MPLS_MAX_LABEL_STACK];
466
/*
 * Decode an MPLS label stack from attribute @a into @stack using mpls_get().
 * A payload that is not a multiple of 4 bytes is reported and its trailing
 * bytes are ignored. Returns whatever mpls_get() returns.
 */
static inline int rta_get_mpls(struct rtattr *a, u32 *stack)
{
  if ((RTA_PAYLOAD(a) % 4) != 0)
    {
      log(L_WARN "KRT: Strange length of received MPLS stack: %u", RTA_PAYLOAD(a));
    }

  /* Round the length down to whole 4-byte label entries */
  return mpls_get(RTA_DATA(a), RTA_PAYLOAD(a) & ~0x3, stack);
}
473
#endif
474

    
475
struct rtattr *
476
nl_add_attr(struct nlmsghdr *h, uint bufsize, uint code, const void *data, uint dlen)
477
{
478
  uint pos = NLMSG_ALIGN(h->nlmsg_len);
479
  uint len = RTA_LENGTH(dlen);
480

    
481
  if (pos + len > bufsize)
482
    bug("nl_add_attr: packet buffer overflow");
483

    
484
  struct rtattr *a = (struct rtattr *)((char *)h + pos);
485
  a->rta_type = code;
486
  a->rta_len = len;
487
  h->nlmsg_len = pos + len;
488

    
489
  if (dlen > 0)
490
    memcpy(RTA_DATA(a), data, dlen);
491

    
492
  return a;
493
}
494

    
495
static inline struct rtattr *
496
nl_open_attr(struct nlmsghdr *h, uint bufsize, uint code)
497
{
498
  return nl_add_attr(h, bufsize, code, NULL, 0);
499
}
500

    
501
static inline void
502
nl_close_attr(struct nlmsghdr *h, struct rtattr *a)
503
{
504
  a->rta_len = (void *)h + NLMSG_ALIGN(h->nlmsg_len) - (void *)a;
505
}
506

    
507
static inline void
508
nl_add_attr_u16(struct nlmsghdr *h, uint bufsize, int code, u16 data)
509
{
510
  nl_add_attr(h, bufsize, code, &data, 2);
511
}
512

    
513
static inline void
514
nl_add_attr_u32(struct nlmsghdr *h, uint bufsize, int code, u32 data)
515
{
516
  nl_add_attr(h, bufsize, code, &data, 4);
517
}
518

    
519
static inline void
520
nl_add_attr_ip4(struct nlmsghdr *h, uint bufsize, int code, ip4_addr ip4)
521
{
522
  ip4 = ip4_hton(ip4);
523
  nl_add_attr(h, bufsize, code, &ip4, sizeof(ip4));
524
}
525

    
526
static inline void
527
nl_add_attr_ip6(struct nlmsghdr *h, uint bufsize, int code, ip6_addr ip6)
528
{
529
  ip6 = ip6_hton(ip6);
530
  nl_add_attr(h, bufsize, code, &ip6, sizeof(ip6));
531
}
532

    
533
static inline void
534
nl_add_attr_ipa(struct nlmsghdr *h, uint bufsize, int code, ip_addr ipa)
535
{
536
  if (ipa_is_ip4(ipa))
537
    nl_add_attr_ip4(h, bufsize, code, ipa_to_ip4(ipa));
538
  else
539
    nl_add_attr_ip6(h, bufsize, code, ipa_to_ip6(ipa));
540
}
541

    
542
#ifdef HAVE_MPLS_KERNEL
543
static inline void
544
nl_add_attr_mpls(struct nlmsghdr *h, uint bufsize, int code, int len, u32 *stack)
545
{
546
  char buf[len*4];
547
  mpls_put(buf, len, stack);
548
  nl_add_attr(h, bufsize, code, buf, len*4);
549
}
550

    
551
static inline void
552
nl_add_attr_mpls_encap(struct nlmsghdr *h, uint bufsize, int len, u32 *stack)
553
{
554
  nl_add_attr_u16(h, bufsize, RTA_ENCAP_TYPE, LWTUNNEL_ENCAP_MPLS);
555

    
556
  struct rtattr *nest = nl_open_attr(h, bufsize, RTA_ENCAP);
557
  nl_add_attr_mpls(h, bufsize, RTA_DST, len, stack);
558
  nl_close_attr(h, nest);
559
}
560

    
561
static inline void
562
nl_add_attr_via(struct nlmsghdr *h, uint bufsize, ip_addr ipa)
563
{
564
  struct rtattr *nest = nl_open_attr(h, bufsize, RTA_VIA);
565
  struct rtvia *via = RTA_DATA(nest);
566

    
567
  h->nlmsg_len += sizeof(*via);
568

    
569
  if (ipa_is_ip4(ipa))
570
  {
571
    via->rtvia_family = AF_INET;
572
    put_ip4(via->rtvia_addr, ipa_to_ip4(ipa));
573
    h->nlmsg_len += sizeof(ip4_addr);
574
  }
575
  else
576
  {
577
    via->rtvia_family = AF_INET6;
578
    put_ip6(via->rtvia_addr, ipa_to_ip6(ipa));
579
    h->nlmsg_len += sizeof(ip6_addr);
580
  }
581

    
582
  nl_close_attr(h, nest);
583
}
584
#endif
585

    
586
static inline struct rtnexthop *
587
nl_open_nexthop(struct nlmsghdr *h, uint bufsize)
588
{
589
  uint pos = NLMSG_ALIGN(h->nlmsg_len);
590
  uint len = RTNH_LENGTH(0);
591

    
592
  if (pos + len > bufsize)
593
    bug("nl_open_nexthop: packet buffer overflow");
594

    
595
  h->nlmsg_len = pos + len;
596

    
597
  return (void *)h + pos;
598
}
599

    
600
static inline void
601
nl_close_nexthop(struct nlmsghdr *h, struct rtnexthop *nh)
602
{
603
  nh->rtnh_len = (void *)h + NLMSG_ALIGN(h->nlmsg_len) - (void *)nh;
604
}
605

    
606
static inline void
607
nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af UNUSED)
608
{
609
#ifdef HAVE_MPLS_KERNEL
610
  if (nh->labels > 0)
611
    if (af == AF_MPLS)
612
      nl_add_attr_mpls(h, bufsize, RTA_NEWDST, nh->labels, nh->label);
613
    else
614
      nl_add_attr_mpls_encap(h, bufsize, nh->labels, nh->label);
615

    
616
  if (ipa_nonzero(nh->gw))
617
    if (af == AF_MPLS)
618
      nl_add_attr_via(h, bufsize, nh->gw);
619
    else
620
      nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw);
621
#else
622

    
623
  if (ipa_nonzero(nh->gw))
624
    nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw);
625
#endif
626
}
627

    
628
static void
629
nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af)
630
{
631
  struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH);
632

    
633
  for (; nh; nh = nh->next)
634
  {
635
    struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize);
636

    
637
    rtnh->rtnh_flags = 0;
638
    rtnh->rtnh_hops = nh->weight;
639
    rtnh->rtnh_ifindex = nh->iface->index;
640

    
641
    nl_add_nexthop(h, bufsize, nh, af);
642

    
643
    if (nh->flags & RNF_ONLINK)
644
      rtnh->rtnh_flags |= RTNH_F_ONLINK;
645

    
646
    nl_close_nexthop(h, rtnh);
647
  }
648

    
649
  nl_close_attr(h, a);
650
}
651

    
652
static struct nexthop *
653
nl_parse_multipath(struct krt_proto *p, struct rtattr *ra, int af)
654
{
655
  /* Temporary buffer for multicast nexthops */
656
  static struct nexthop *nh_buffer;
657
  static int nh_buf_size;        /* in number of structures */
658
  static int nh_buf_used;
659

    
660
  struct rtattr *a[BIRD_RTA_MAX];
661
  struct rtnexthop *nh = RTA_DATA(ra);
662
  struct nexthop *rv, *first, **last;
663
  unsigned len = RTA_PAYLOAD(ra);
664

    
665
  first = NULL;
666
  last = &first;
667
  nh_buf_used = 0;
668

    
669
  while (len)
670
    {
671
      /* Use RTNH_OK(nh,len) ?? */
672
      if ((len < sizeof(*nh)) || (len < nh->rtnh_len))
673
        return NULL;
674

    
675
      if (nh_buf_used == nh_buf_size)
676
      {
677
        nh_buf_size = nh_buf_size ? (nh_buf_size * 2) : 4;
678
        nh_buffer = xrealloc(nh_buffer, nh_buf_size * NEXTHOP_MAX_SIZE);
679
      }
680
      /* FIXME: This is really ugly */
681
      *last = rv = (void *) (((byte *) nh_buffer) + (nh_buf_used++ * NEXTHOP_MAX_SIZE));
682
      memset(rv, 0, NEXTHOP_MAX_SIZE);
683
      // rv->next = NULL;
684
      last = &(rv->next);
685

    
686
      rv->flags = 0;
687
      rv->weight = nh->rtnh_hops;
688
      rv->iface = if_find_by_index(nh->rtnh_ifindex);
689
      if (!rv->iface)
690
        return NULL;
691

    
692
      /* Nonexistent RTNH_PAYLOAD ?? */
693
      nl_attr_len = nh->rtnh_len - RTNH_LENGTH(0);
694
      switch (af)
695
        {
696
        case AF_INET:
697
          if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want4, a, sizeof(a)))
698
            return NULL;
699
          break;
700

    
701
        case AF_INET6:
702
          if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want6, a, sizeof(a)))
703
            return NULL;
704
          break;
705

    
706
        default:
707
          return NULL;
708
        }
709

    
710
      if (a[RTA_GATEWAY])
711
        {
712
          rv->gw = rta_get_ipa(a[RTA_GATEWAY]);
713

    
714
          if (nh->rtnh_flags & RTNH_F_ONLINK)
715
            rv->flags |= RNF_ONLINK;
716

    
717
          neighbor *nbr;
718
          nbr = neigh_find2(&p->p, &rv->gw, rv->iface,
719
                            (rv->flags & RNF_ONLINK) ? NEF_ONLINK : 0);
720
          if (!nbr || (nbr->scope == SCOPE_HOST))
721
            return NULL;
722
        }
723
      else
724
        rv->gw = IPA_NONE;
725

    
726
#ifdef HAVE_MPLS_KERNEL
727
      if (a[RTA_ENCAP_TYPE])
728
      {
729
        if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) {
730
          log(L_WARN "KRT: Unknown encapsulation method %d in multipath", rta_get_u16(a[RTA_ENCAP_TYPE]));
731
          return NULL;
732
        }
733

    
734
        struct rtattr *enca[BIRD_RTA_MAX];
735
        nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
736
        nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
737
        rv->labels = rta_get_mpls(enca[RTA_DST], rv->label);
738
        break;
739
      }
740
#endif
741

    
742

    
743
      len -= NLMSG_ALIGN(nh->rtnh_len);
744
      nh = RTNH_NEXT(nh);
745
    }
746

    
747
  return first;
748
}
749

    
750
static void
751
nl_add_metrics(struct nlmsghdr *h, uint bufsize, u32 *metrics, int max)
752
{
753
  struct rtattr *a = nl_open_attr(h, bufsize, RTA_METRICS);
754
  int t;
755

    
756
  for (t = 1; t < max; t++)
757
    if (metrics[0] & (1 << t))
758
      nl_add_attr_u32(h, bufsize, t, metrics[t]);
759

    
760
  nl_close_attr(h, a);
761
}
762

    
763
static int
764
nl_parse_metrics(struct rtattr *hdr, u32 *metrics, int max)
765
{
766
  struct rtattr *a = RTA_DATA(hdr);
767
  int len = RTA_PAYLOAD(hdr);
768

    
769
  metrics[0] = 0;
770
  for (; RTA_OK(a, len); a = RTA_NEXT(a, len))
771
  {
772
    if (a->rta_type == RTA_UNSPEC)
773
      continue;
774

    
775
    if (a->rta_type >= max)
776
      continue;
777

    
778
    if (RTA_PAYLOAD(a) != 4)
779
      return -1;
780

    
781
    metrics[0] |= 1 << a->rta_type;
782
    metrics[a->rta_type] = rta_get_u32(a);
783
  }
784

    
785
  if (len > 0)
786
    return -1;
787

    
788
  return 0;
789
}
790

    
791

    
792
/*
793
 *        Scanning of interfaces
794
 */
795

    
796
static void
797
nl_parse_link(struct nlmsghdr *h, int scan)
798
{
799
  struct ifinfomsg *i;
800
  struct rtattr *a[BIRD_IFLA_MAX];
801
  int new = h->nlmsg_type == RTM_NEWLINK;
802
  struct iface f = {};
803
  struct iface *ifi;
804
  char *name;
805
  u32 mtu, master = 0;
806
  uint fl;
807

    
808
  if (!(i = nl_checkin(h, sizeof(*i))) || !nl_parse_attrs(IFLA_RTA(i), ifla_attr_want, a, sizeof(a)))
809
    return;
810
  if (!a[IFLA_IFNAME] || (RTA_PAYLOAD(a[IFLA_IFNAME]) < 2) || !a[IFLA_MTU])
811
    {
812
      /*
813
       * IFLA_IFNAME and IFLA_MTU are required, in fact, but there may also come
814
       * a message with IFLA_WIRELESS set, where (e.g.) no IFLA_IFNAME exists.
815
       * We simply ignore all such messages with IFLA_WIRELESS without notice.
816
       */
817

    
818
      if (a[IFLA_WIRELESS])
819
        return;
820

    
821
      log(L_ERR "KIF: Malformed message received");
822
      return;
823
    }
824

    
825
  name = RTA_DATA(a[IFLA_IFNAME]);
826
  mtu = rta_get_u32(a[IFLA_MTU]);
827

    
828
  if (a[IFLA_MASTER])
829
    master = rta_get_u32(a[IFLA_MASTER]);
830

    
831
  ifi = if_find_by_index(i->ifi_index);
832
  if (!new)
833
    {
834
      DBG("KIF: IF%d(%s) goes down\n", i->ifi_index, name);
835
      if (!ifi)
836
        return;
837

    
838
      if_delete(ifi);
839
    }
840
  else
841
    {
842
      DBG("KIF: IF%d(%s) goes up (mtu=%d,flg=%x)\n", i->ifi_index, name, mtu, i->ifi_flags);
843
      if (ifi && strncmp(ifi->name, name, sizeof(ifi->name)-1))
844
        if_delete(ifi);
845

    
846
      strncpy(f.name, name, sizeof(f.name)-1);
847
      f.index = i->ifi_index;
848
      f.mtu = mtu;
849

    
850
      f.master_index = master;
851
      f.master = if_find_by_index(master);
852

    
853
      fl = i->ifi_flags;
854
      if (fl & IFF_UP)
855
        f.flags |= IF_ADMIN_UP;
856
      if (fl & IFF_LOWER_UP)
857
        f.flags |= IF_LINK_UP;
858
      if (fl & IFF_LOOPBACK)                /* Loopback */
859
        f.flags |= IF_MULTIACCESS | IF_LOOPBACK | IF_IGNORE;
860
      else if (fl & IFF_POINTOPOINT)        /* PtP */
861
        f.flags |= IF_MULTICAST;
862
      else if (fl & IFF_BROADCAST)        /* Broadcast */
863
        f.flags |= IF_MULTIACCESS | IF_BROADCAST | IF_MULTICAST;
864
      else
865
        f.flags |= IF_MULTIACCESS;        /* NBMA */
866

    
867
      if (fl & IFF_MULTICAST)
868
        f.flags |= IF_MULTICAST;
869

    
870
      ifi = if_update(&f);
871

    
872
      if (!scan)
873
        if_end_partial_update(ifi);
874
    }
875
}
876

    
877
static void
878
nl_parse_addr4(struct ifaddrmsg *i, int scan, int new)
879
{
880
  struct rtattr *a[BIRD_IFA_MAX];
881
  struct iface *ifi;
882
  u32 ifa_flags;
883
  int scope;
884

    
885
  if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want4, a, sizeof(a)))
886
    return;
887

    
888
  if (!a[IFA_LOCAL])
889
    {
890
      log(L_ERR "KIF: Malformed message received (missing IFA_LOCAL)");
891
      return;
892
    }
893
  if (!a[IFA_ADDRESS])
894
    {
895
      log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
896
      return;
897
    }
898

    
899
  ifi = if_find_by_index(i->ifa_index);
900
  if (!ifi)
901
    {
902
      log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
903
      return;
904
    }
905

    
906
  if (a[IFA_FLAGS])
907
    ifa_flags = rta_get_u32(a[IFA_FLAGS]);
908
  else
909
    ifa_flags = i->ifa_flags;
910

    
911
  struct ifa ifa;
912
  bzero(&ifa, sizeof(ifa));
913
  ifa.iface = ifi;
914
  if (ifa_flags & IFA_F_SECONDARY)
915
    ifa.flags |= IA_SECONDARY;
916

    
917
  ifa.ip = rta_get_ipa(a[IFA_LOCAL]);
918

    
919
  if (i->ifa_prefixlen > IP4_MAX_PREFIX_LENGTH)
920
    {
921
      log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
922
      new = 0;
923
    }
924
  if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH)
925
    {
926
      ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
927
      net_fill_ip4(&ifa.prefix, rta_get_ip4(a[IFA_ADDRESS]), i->ifa_prefixlen);
928

    
929
      /* It is either a host address or a peer address */
930
      if (ipa_equal(ifa.ip, ifa.brd))
931
        ifa.flags |= IA_HOST;
932
      else
933
        {
934
          ifa.flags |= IA_PEER;
935
          ifa.opposite = ifa.brd;
936
        }
937
    }
938
  else
939
    {
940
      net_fill_ip4(&ifa.prefix, ipa_to_ip4(ifa.ip), i->ifa_prefixlen);
941
      net_normalize(&ifa.prefix);
942

    
943
      if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 1)
944
        ifa.opposite = ipa_opposite_m1(ifa.ip);
945

    
946
      if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 2)
947
        ifa.opposite = ipa_opposite_m2(ifa.ip);
948

    
949
      if ((ifi->flags & IF_BROADCAST) && a[IFA_BROADCAST])
950
        {
951
          ip4_addr xbrd = rta_get_ip4(a[IFA_BROADCAST]);
952
          ip4_addr ybrd = ip4_or(ipa_to_ip4(ifa.ip), ip4_not(ip4_mkmask(i->ifa_prefixlen)));
953

    
954
          if (ip4_equal(xbrd, net4_prefix(&ifa.prefix)) || ip4_equal(xbrd, ybrd))
955
            ifa.brd = ipa_from_ip4(xbrd);
956
          else if (ifi->flags & IF_TMP_DOWN) /* Complain only during the first scan */
957
            {
958
              log(L_ERR "KIF: Invalid broadcast address %I4 for %s", xbrd, ifi->name);
959
              ifa.brd = ipa_from_ip4(ybrd);
960
            }
961
        }
962
    }
963

    
964
  scope = ipa_classify(ifa.ip);
965
  if (scope < 0)
966
    {
967
      log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
968
      return;
969
    }
970
  ifa.scope = scope & IADDR_SCOPE_MASK;
971

    
972
  DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
973
      ifi->index, ifi->name,
974
      new ? "added" : "removed",
975
      ifa.ip, ifa.flags, ifa.prefix, ifa.brd, ifa.opposite);
976

    
977
  if (new)
978
    ifa_update(&ifa);
979
  else
980
    ifa_delete(&ifa);
981

    
982
  if (!scan)
983
    if_end_partial_update(ifi);
984
}
985

    
986
static void
987
nl_parse_addr6(struct ifaddrmsg *i, int scan, int new)
988
{
989
  struct rtattr *a[BIRD_IFA_MAX];
990
  struct iface *ifi;
991
  u32 ifa_flags;
992
  int scope;
993

    
994
  if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want6, a, sizeof(a)))
995
    return;
996

    
997
  if (!a[IFA_ADDRESS])
998
    {
999
      log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
1000
      return;
1001
    }
1002

    
1003
  ifi = if_find_by_index(i->ifa_index);
1004
  if (!ifi)
1005
    {
1006
      log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
1007
      return;
1008
    }
1009

    
1010
  if (a[IFA_FLAGS])
1011
    ifa_flags = rta_get_u32(a[IFA_FLAGS]);
1012
  else
1013
    ifa_flags = i->ifa_flags;
1014

    
1015
  struct ifa ifa;
1016
  bzero(&ifa, sizeof(ifa));
1017
  ifa.iface = ifi;
1018
  if (ifa_flags & IFA_F_SECONDARY)
1019
    ifa.flags |= IA_SECONDARY;
1020

    
1021
  /* Ignore tentative addresses silently */
1022
  if (ifa_flags & IFA_F_TENTATIVE)
1023
    return;
1024

    
1025
  /* IFA_LOCAL can be unset for IPv6 interfaces */
1026
  ifa.ip = rta_get_ipa(a[IFA_LOCAL] ? : a[IFA_ADDRESS]);
1027

    
1028
  if (i->ifa_prefixlen > IP6_MAX_PREFIX_LENGTH)
1029
    {
1030
      log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
1031
      new = 0;
1032
    }
1033
  if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH)
1034
    {
1035
      ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
1036
      net_fill_ip6(&ifa.prefix, rta_get_ip6(a[IFA_ADDRESS]), i->ifa_prefixlen);
1037

    
1038
      /* It is either a host address or a peer address */
1039
      if (ipa_equal(ifa.ip, ifa.brd))
1040
        ifa.flags |= IA_HOST;
1041
      else
1042
        {
1043
          ifa.flags |= IA_PEER;
1044
          ifa.opposite = ifa.brd;
1045
        }
1046
    }
1047
  else
1048
    {
1049
      net_fill_ip6(&ifa.prefix, ipa_to_ip6(ifa.ip), i->ifa_prefixlen);
1050
      net_normalize(&ifa.prefix);
1051

    
1052
      if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH - 1)
1053
        ifa.opposite = ipa_opposite_m1(ifa.ip);
1054
    }
1055

    
1056
  scope = ipa_classify(ifa.ip);
1057
  if (scope < 0)
1058
    {
1059
      log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
1060
      return;
1061
    }
1062
  ifa.scope = scope & IADDR_SCOPE_MASK;
1063

    
1064
  DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
1065
      ifi->index, ifi->name,
1066
      new ? "added" : "removed",
1067
      ifa.ip, ifa.flags, ifa.prefix, ifa.brd, ifa.opposite);
1068

    
1069
  if (new)
1070
    ifa_update(&ifa);
1071
  else
1072
    ifa_delete(&ifa);
1073

    
1074
  if (!scan)
1075
    if_end_partial_update(ifi);
1076
}
1077

    
1078
static void
1079
nl_parse_addr(struct nlmsghdr *h, int scan)
1080
{
1081
  struct ifaddrmsg *i;
1082

    
1083
  if (!(i = nl_checkin(h, sizeof(*i))))
1084
    return;
1085

    
1086
  int new = (h->nlmsg_type == RTM_NEWADDR);
1087

    
1088
  switch (i->ifa_family)
1089
    {
1090
      case AF_INET:
1091
        return nl_parse_addr4(i, scan, new);
1092

    
1093
      case AF_INET6:
1094
        return nl_parse_addr6(i, scan, new);
1095
    }
1096
}
1097

    
1098
void
1099
kif_do_scan(struct kif_proto *p UNUSED)
1100
{
1101
  struct nlmsghdr *h;
1102

    
1103
  if_start_update();
1104

    
1105
  nl_request_dump(AF_UNSPEC, RTM_GETLINK);
1106
  while (h = nl_get_scan())
1107
    if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK)
1108
      nl_parse_link(h, 1);
1109
    else
1110
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1111

    
1112
  /* Re-resolve master interface for slaves */
1113
  struct iface *i;
1114
  WALK_LIST(i, iface_list)
1115
    if (i->master_index)
1116
    {
1117
      struct iface f = {
1118
        .flags = i->flags,
1119
        .mtu = i->mtu,
1120
        .index = i->index,
1121
        .master_index = i->master_index,
1122
        .master = if_find_by_index(i->master_index)
1123
      };
1124

    
1125
      if (f.master != i->master)
1126
      {
1127
        memcpy(f.name, i->name, sizeof(f.name));
1128
        if_update(&f);
1129
      }
1130
    }
1131

    
1132
  nl_request_dump(AF_INET, RTM_GETADDR);
1133
  while (h = nl_get_scan())
1134
    if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1135
      nl_parse_addr(h, 1);
1136
    else
1137
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1138

    
1139
  nl_request_dump(AF_INET6, RTM_GETADDR);
1140
  while (h = nl_get_scan())
1141
    if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1142
      nl_parse_addr(h, 1);
1143
    else
1144
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1145

    
1146
  if_end_update();
1147
}
1148

    
1149
/*
1150
 *        Routes
1151
 */
1152

    
1153
static inline u32
1154
krt_table_id(struct krt_proto *p)
1155
{
1156
  return KRT_CF->sys.table_id;
1157
}
1158

    
1159
static HASH(struct krt_proto) nl_table_map;
1160

    
1161
#define RTH_KEY(p)                p->af, krt_table_id(p)
1162
#define RTH_NEXT(p)                p->sys.hash_next
1163
#define RTH_EQ(a1,i1,a2,i2)        a1 == a2 && i1 == i2
1164
#define RTH_FN(a,i)                a ^ u32_hash(i)
1165

    
1166
#define RTH_REHASH                rth_rehash
1167
#define RTH_PARAMS                /8, *2, 2, 2, 6, 20
1168

    
1169
HASH_DEFINE_REHASH_FN(RTH, struct krt_proto)
1170

    
1171
int
1172
krt_capable(rte *e)
1173
{
1174
  rta *a = e->attrs;
1175

    
1176
  switch (a->dest)
1177
  {
1178
    case RTD_UNICAST:
1179
    case RTD_BLACKHOLE:
1180
    case RTD_UNREACHABLE:
1181
    case RTD_PROHIBIT:
1182
      return 1;
1183

    
1184
    default:
1185
      return 0;
1186
  }
1187
}
1188

    
1189
static inline int
1190
nh_bufsize(struct nexthop *nh)
1191
{
1192
  int rv = 0;
1193
  for (; nh != NULL; nh = nh->next)
1194
    rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr)));
1195
  return rv;
1196
}
1197

    
1198
static int
1199
nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int dest, struct nexthop *nh)
1200
{
1201
  eattr *ea;
1202
  net *net = e->net;
1203
  rta *a = e->attrs;
1204
  int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh));
1205
  u32 priority = 0;
1206

    
1207
  struct {
1208
    struct nlmsghdr h;
1209
    struct rtmsg r;
1210
    char buf[0];
1211
  } *r;
1212

    
1213
  int rsize = sizeof(*r) + bufsize;
1214
  r = alloca(rsize);
1215

    
1216
  DBG("nl_send_route(%N,op=%x)\n", net->n.addr, op);
1217

    
1218
  bzero(&r->h, sizeof(r->h));
1219
  bzero(&r->r, sizeof(r->r));
1220
  r->h.nlmsg_type = op ? RTM_NEWROUTE : RTM_DELROUTE;
1221
  r->h.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
1222
  r->h.nlmsg_flags = op | NLM_F_REQUEST | NLM_F_ACK;
1223

    
1224
  r->r.rtm_family = p->af;
1225
  r->r.rtm_dst_len = net_pxlen(net->n.addr);
1226
  r->r.rtm_protocol = RTPROT_BIRD;
1227
  r->r.rtm_scope = RT_SCOPE_NOWHERE;
1228
#ifdef HAVE_MPLS_KERNEL
1229
  if (p->af == AF_MPLS)
1230
  {
1231
    u32 label = net_mpls(net->n.addr);
1232
    nl_add_attr_mpls(&r->h, rsize, RTA_DST, 1, &label);
1233
  }
1234
  else
1235
#endif
1236
    nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(net->n.addr));
1237

    
1238
  /*
1239
   * Strange behavior for RTM_DELROUTE:
1240
   * 1) rtm_family is ignored in IPv6, works for IPv4
1241
   * 2) not setting RTA_PRIORITY is different from setting default value (on IPv6)
1242
   * 3) not setting RTA_PRIORITY is equivalent to setting 0, which is wildcard
1243
   */
1244

    
1245
  if (krt_table_id(p) < 256)
1246
    r->r.rtm_table = krt_table_id(p);
1247
  else
1248
    nl_add_attr_u32(&r->h, rsize, RTA_TABLE, krt_table_id(p));
1249

    
1250
  if (a->source == RTS_DUMMY)
1251
    priority = e->u.krt.metric;
1252
  else if (KRT_CF->sys.metric)
1253
    priority = KRT_CF->sys.metric;
1254
  else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC)))
1255
    priority = ea->u.data;
1256

    
1257
  if (priority)
1258
    nl_add_attr_u32(&r->h, rsize, RTA_PRIORITY, priority);
1259

    
1260
  /* For route delete, we do not specify remaining route attributes */
1261
  if (op == NL_OP_DELETE)
1262
    goto dest;
1263

    
1264
  /* Default scope is LINK for device routes, UNIVERSE otherwise */
1265
  if (ea = ea_find(eattrs, EA_KRT_SCOPE))
1266
    r->r.rtm_scope = ea->u.data;
1267
  else
1268
    r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1269

    
1270
  if (ea = ea_find(eattrs, EA_KRT_PREFSRC))
1271
    nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data);
1272

    
1273
  if (ea = ea_find(eattrs, EA_KRT_REALM))
1274
    nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data);
1275

    
1276

    
1277
  u32 metrics[KRT_METRICS_MAX];
1278
  metrics[0] = 0;
1279

    
1280
  struct ea_walk_state ews = { .eattrs = eattrs };
1281
  while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX))
1282
  {
1283
    int id = ea->id - EA_KRT_METRICS;
1284
    metrics[0] |= 1 << id;
1285
    metrics[id] = ea->u.data;
1286
  }
1287

    
1288
  if (metrics[0])
1289
    nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX);
1290

    
1291

    
1292
dest:
1293
  switch (dest)
1294
    {
1295
    case RTD_UNICAST:
1296
      r->r.rtm_type = RTN_UNICAST;
1297
      if (nh->next && !krt_ecmp6(p))
1298
        nl_add_multipath(&r->h, rsize, nh, p->af);
1299
      else
1300
      {
1301
        nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index);
1302
        nl_add_nexthop(&r->h, rsize, nh, p->af);
1303

    
1304
        if (nh->flags & RNF_ONLINK)
1305
          r->r.rtm_flags |= RTNH_F_ONLINK;
1306
      }
1307
      break;
1308
    case RTD_BLACKHOLE:
1309
      r->r.rtm_type = RTN_BLACKHOLE;
1310
      break;
1311
    case RTD_UNREACHABLE:
1312
      r->r.rtm_type = RTN_UNREACHABLE;
1313
      break;
1314
    case RTD_PROHIBIT:
1315
      r->r.rtm_type = RTN_PROHIBIT;
1316
      break;
1317
    case RTD_NONE:
1318
      break;
1319
    default:
1320
      bug("krt_capable inconsistent with nl_send_route");
1321
    }
1322

    
1323
  /* Ignore missing for DELETE */
1324
  return nl_exchange(&r->h, (op == NL_OP_DELETE));
1325
}
1326

    
1327
static inline int
1328
nl_add_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs)
1329
{
1330
  rta *a = e->attrs;
1331
  int err = 0;
1332

    
1333
  if (krt_ecmp6(p) && a->nh.next)
1334
  {
1335
    struct nexthop *nh = &(a->nh);
1336

    
1337
    err = nl_send_route(p, e, eattrs, NL_OP_ADD, RTD_UNICAST, nh);
1338
    if (err < 0)
1339
      return err;
1340

    
1341
    for (nh = nh->next; nh; nh = nh->next)
1342
      err += nl_send_route(p, e, eattrs, NL_OP_APPEND, RTD_UNICAST, nh);
1343

    
1344
    return err;
1345
  }
1346

    
1347
  return nl_send_route(p, e, eattrs, NL_OP_ADD, a->dest, &(a->nh));
1348
}
1349

    
1350
static inline int
1351
nl_delete_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs)
1352
{
1353
  int err = 0;
1354

    
1355
  /* For IPv6, we just repeatedly request DELETE until we get error */
1356
  do
1357
    err = nl_send_route(p, e, eattrs, NL_OP_DELETE, RTD_NONE, NULL);
1358
  while (krt_ecmp6(p) && !err);
1359

    
1360
  return err;
1361
}
1362

    
1363
void
1364
krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list *eattrs)
1365
{
1366
  int err = 0;
1367

    
1368
  /*
1369
   * We could use NL_OP_REPLACE, but route replace on Linux has some problems:
1370
   *
1371
   * 1) Does not check for matching rtm_protocol
1372
   * 2) Has broken semantics for IPv6 ECMP
1373
   * 3) Crashes some kernel version when used for IPv6 ECMP
1374
   *
1375
   * So we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the old
1376
   * route value, so we do not try to optimize IPv6 ECMP reconfigurations.
1377
   */
1378

    
1379
  if (old)
1380
    nl_delete_rte(p, old, eattrs);
1381

    
1382
  if (new)
1383
    err = nl_add_rte(p, new, eattrs);
1384

    
1385
  if (err < 0)
1386
    n->n.flags |= KRF_SYNC_ERROR;
1387
  else
1388
    n->n.flags &= ~KRF_SYNC_ERROR;
1389
}
1390

    
1391

    
1392
static inline struct nexthop *
1393
nl_alloc_nexthop(struct nl_parse_state *s, ip_addr gw, struct iface *iface, byte weight)
1394
{
1395
  struct nexthop *nh = lp_alloc(s->pool, sizeof(struct nexthop));
1396

    
1397
  nh->gw = gw;
1398
  nh->iface = iface;
1399
  nh->next = NULL;
1400
  nh->weight = weight;
1401

    
1402
  return nh;
1403
}
1404

    
1405
static int
1406
nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type)
1407
{
1408
  /* Route merging must be active */
1409
  if (!s->merge)
1410
    return 0;
1411

    
1412
  /* Saved and new route must have same network, proto/table, and priority */
1413
  if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority))
1414
    return 0;
1415

    
1416
  /* Both must be regular unicast routes */
1417
  if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST))
1418
    return 0;
1419

    
1420
  return 1;
1421
}
1422

    
1423
static void
1424
nl_announce_route(struct nl_parse_state *s)
1425
{
1426
  rte *e = rte_get_temp(s->attrs);
1427
  e->net = s->net;
1428
  e->u.krt.src = s->krt_src;
1429
  e->u.krt.proto = s->krt_proto;
1430
  e->u.krt.seen = 0;
1431
  e->u.krt.best = 0;
1432
  e->u.krt.metric = s->krt_metric;
1433

    
1434
  if (s->scan)
1435
    krt_got_route(s->proto, e);
1436
  else
1437
    krt_got_route_async(s->proto, e, s->new);
1438

    
1439
  s->net = NULL;
1440
  s->attrs = NULL;
1441
  s->proto = NULL;
1442
  lp_flush(s->pool);
1443
}
1444

    
1445
static inline void
1446
nl_parse_begin(struct nl_parse_state *s, int scan, int merge)
1447
{
1448
  memset(s, 0, sizeof (struct nl_parse_state));
1449
  s->pool = nl_linpool;
1450
  s->scan = scan;
1451
  s->merge = merge;
1452
}
1453

    
1454
static inline void
1455
nl_parse_end(struct nl_parse_state *s)
1456
{
1457
  if (s->net)
1458
    nl_announce_route(s);
1459
}
1460

    
1461

    
1462
/*
 * SKIP(fmt, args...) - emit a DBG() message prefixed with "KRT: Ignoring
 * route - " and return from the enclosing function. The bare `return` means
 * it can only be used inside functions returning void.
 */
#define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0)
1463

    
1464
static void
1465
nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
1466
{
1467
  struct krt_proto *p;
1468
  struct rtmsg *i;
1469
  struct rtattr *a[BIRD_RTA_MAX];
1470
  int new = h->nlmsg_type == RTM_NEWROUTE;
1471

    
1472
  net_addr dst;
1473
  u32 oif = ~0;
1474
  u32 table_id;
1475
  u32 priority = 0;
1476
  u32 def_scope = RT_SCOPE_UNIVERSE;
1477
  int src;
1478

    
1479
  if (!(i = nl_checkin(h, sizeof(*i))))
1480
    return;
1481

    
1482
  switch (i->rtm_family)
1483
    {
1484
    case AF_INET:
1485
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want4, a, sizeof(a)))
1486
        return;
1487

    
1488
      if (a[RTA_DST])
1489
        net_fill_ip4(&dst, rta_get_ip4(a[RTA_DST]), i->rtm_dst_len);
1490
      else
1491
        net_fill_ip4(&dst, IP4_NONE, 0);
1492
      break;
1493

    
1494
    case AF_INET6:
1495
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want6, a, sizeof(a)))
1496
        return;
1497

    
1498
      if (a[RTA_DST])
1499
        net_fill_ip6(&dst, rta_get_ip6(a[RTA_DST]), i->rtm_dst_len);
1500
      else
1501
        net_fill_ip6(&dst, IP6_NONE, 0);
1502
      break;
1503

    
1504
#ifdef HAVE_MPLS_KERNEL
1505
    case AF_MPLS:
1506
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want_mpls, a, sizeof(a)))
1507
        return;
1508

    
1509
      if (!a[RTA_DST])
1510
        SKIP("MPLS route without RTA_DST");
1511

    
1512
      if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) != 1)
1513
        SKIP("MPLS route with multi-label RTA_DST");
1514

    
1515
      net_fill_mpls(&dst, rta_mpls_stack[0]);
1516
      break;
1517
#endif
1518

    
1519
    default:
1520
      return;
1521
    }
1522

    
1523
  if (a[RTA_OIF])
1524
    oif = rta_get_u32(a[RTA_OIF]);
1525

    
1526
  if (a[RTA_TABLE])
1527
    table_id = rta_get_u32(a[RTA_TABLE]);
1528
  else
1529
    table_id = i->rtm_table;
1530

    
1531
  /* Do we know this table? */
1532
  p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id);
1533
  if (!p)
1534
    SKIP("unknown table %d\n", table);
1535

    
1536
  if (a[RTA_IIF])
1537
    SKIP("IIF set\n");
1538

    
1539
  if (i->rtm_tos != 0)                        /* We don't support TOS */
1540
    SKIP("TOS %02x\n", i->rtm_tos);
1541

    
1542
  if (s->scan && !new)
1543
    SKIP("RTM_DELROUTE in scan\n");
1544

    
1545
  if (a[RTA_PRIORITY])
1546
    priority = rta_get_u32(a[RTA_PRIORITY]);
1547

    
1548
  int c = net_classify(&dst);
1549
  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
1550
    SKIP("strange class/scope\n");
1551

    
1552
  switch (i->rtm_protocol)
1553
    {
1554
    case RTPROT_UNSPEC:
1555
      SKIP("proto unspec\n");
1556

    
1557
    case RTPROT_REDIRECT:
1558
      src = KRT_SRC_REDIRECT;
1559
      break;
1560

    
1561
    case RTPROT_KERNEL:
1562
      src = KRT_SRC_KERNEL;
1563
      return;
1564

    
1565
    case RTPROT_BIRD:
1566
      if (!s->scan)
1567
        SKIP("echo\n");
1568
      src = KRT_SRC_BIRD;
1569
      break;
1570

    
1571
    case RTPROT_BOOT:
1572
    default:
1573
      src = KRT_SRC_ALIEN;
1574
    }
1575

    
1576
  net *net = net_get(p->p.main_channel->table, &dst);
1577

    
1578
  if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type))
1579
    nl_announce_route(s);
1580

    
1581
  rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
1582
  ra->src = p->p.main_source;
1583
  ra->source = RTS_INHERIT;
1584
  ra->scope = SCOPE_UNIVERSE;
1585

    
1586
  switch (i->rtm_type)
1587
    {
1588
    case RTN_UNICAST:
1589
      ra->dest = RTD_UNICAST;
1590

    
1591
      if (a[RTA_MULTIPATH])
1592
        {
1593
          struct nexthop *nh = nl_parse_multipath(p, a[RTA_MULTIPATH], i->rtm_family);
1594
          if (!nh)
1595
            {
1596
              log(L_ERR "KRT: Received strange multipath route %N", net->n.addr);
1597
              return;
1598
            }
1599

    
1600
          ra->nh = *nh;
1601
          break;
1602
        }
1603

    
1604
      ra->nh.iface = if_find_by_index(oif);
1605
      if (!ra->nh.iface)
1606
        {
1607
          log(L_ERR "KRT: Received route %N with unknown ifindex %u", net->n.addr, oif);
1608
          return;
1609
        }
1610

    
1611
      if ((i->rtm_family != AF_MPLS) && a[RTA_GATEWAY]
1612
#ifdef HAVE_MPLS_KERNEL
1613
          || (i->rtm_family == AF_MPLS) && a[RTA_VIA]
1614
#endif
1615
          )
1616
        {
1617
#ifdef HAVE_MPLS_KERNEL
1618
          if (i->rtm_family == AF_MPLS)
1619
            ra->nh.gw = rta_get_via(a[RTA_VIA]);
1620
          else
1621
#endif
1622
            ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]);
1623

    
1624
          /* Silently skip strange 6to4 routes */
1625
          const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96);
1626
          if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit))
1627
            return;
1628

    
1629
          if (i->rtm_flags & RTNH_F_ONLINK)
1630
            ra->nh.flags |= RNF_ONLINK;
1631

    
1632
          neighbor *nbr;
1633
          nbr = neigh_find2(&p->p, &(ra->nh.gw), ra->nh.iface,
1634
                            (ra->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0);
1635
          if (!nbr || (nbr->scope == SCOPE_HOST))
1636
            {
1637
              log(L_ERR "KRT: Received route %N with strange next-hop %I", net->n.addr,
1638
                  ra->nh.gw);
1639
              return;
1640
            }
1641
        }
1642

    
1643
      break;
1644
    case RTN_BLACKHOLE:
1645
      ra->dest = RTD_BLACKHOLE;
1646
      break;
1647
    case RTN_UNREACHABLE:
1648
      ra->dest = RTD_UNREACHABLE;
1649
      break;
1650
    case RTN_PROHIBIT:
1651
      ra->dest = RTD_PROHIBIT;
1652
      break;
1653
    /* FIXME: What about RTN_THROW? */
1654
    default:
1655
      SKIP("type %d\n", i->rtm_type);
1656
      return;
1657
    }
1658

    
1659
#ifdef HAVE_MPLS_KERNEL
1660
  int labels = 0;
1661
  if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next)
1662
    labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label);
1663

    
1664
  if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next)
1665
    {
1666
      switch (rta_get_u16(a[RTA_ENCAP_TYPE]))
1667
        {
1668
          case LWTUNNEL_ENCAP_MPLS:
1669
            {
1670
              struct rtattr *enca[BIRD_RTA_MAX];
1671
              nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
1672
              nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
1673
              labels = rta_get_mpls(enca[RTA_DST], ra->nh.label);
1674
              break;
1675
            }
1676
          default:
1677
            SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE]));
1678
            break;
1679
        }
1680
    }
1681

    
1682
  if (labels < 0)
1683
  {
1684
    log(L_WARN "KRT: Too long MPLS stack received, ignoring.");
1685
    ra->nh.labels = 0;
1686
  }
1687
  else
1688
    ra->nh.labels = labels;
1689
#endif
1690

    
1691
  rte *e = rte_get_temp(ra);
1692
  e->net = net;
1693
  e->u.krt.src = src;
1694
  e->u.krt.proto = i->rtm_protocol;
1695
  e->u.krt.seen = 0;
1696
  e->u.krt.best = 0;
1697
  e->u.krt.metric = 0;
1698

    
1699
  if (i->rtm_scope != def_scope)
1700
    {
1701
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1702
      ea->next = ra->eattrs;
1703
      ra->eattrs = ea;
1704
      ea->flags = EALF_SORTED;
1705
      ea->count = 1;
1706
      ea->attrs[0].id = EA_KRT_SCOPE;
1707
      ea->attrs[0].flags = 0;
1708
      ea->attrs[0].type = EAF_TYPE_INT;
1709
      ea->attrs[0].u.data = i->rtm_scope;
1710
    }
1711

    
1712
  if (a[RTA_PRIORITY])
1713
    e->u.krt.metric = rta_get_u32(a[RTA_PRIORITY]);
1714

    
1715
  if (a[RTA_PREFSRC])
1716
    {
1717
      ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
1718

    
1719
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1720
      ea->next = ra->eattrs;
1721
      ra->eattrs = ea;
1722
      ea->flags = EALF_SORTED;
1723
      ea->count = 1;
1724
      ea->attrs[0].id = EA_KRT_PREFSRC;
1725
      ea->attrs[0].flags = 0;
1726
      ea->attrs[0].type = EAF_TYPE_IP_ADDRESS;
1727
      ea->attrs[0].u.ptr = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps));
1728
      ea->attrs[0].u.ptr->length = sizeof(ps);
1729
      memcpy(ea->attrs[0].u.ptr->data, &ps, sizeof(ps));
1730
    }
1731

    
1732
  if (a[RTA_FLOW])
1733
    {
1734
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1735
      ea->next = ra->eattrs;
1736
      ra->eattrs = ea;
1737
      ea->flags = EALF_SORTED;
1738
      ea->count = 1;
1739
      ea->attrs[0].id = EA_KRT_REALM;
1740
      ea->attrs[0].flags = 0;
1741
      ea->attrs[0].type = EAF_TYPE_INT;
1742
      ea->attrs[0].u.data = rta_get_u32(a[RTA_FLOW]);
1743
    }
1744

    
1745
  if (a[RTA_METRICS])
1746
    {
1747
      u32 metrics[KRT_METRICS_MAX];
1748
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
1749
      int t, n = 0;
1750

    
1751
      if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0)
1752
        {
1753
          log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net->n.addr);
1754
          return;
1755
        }
1756

    
1757
      for (t = 1; t < KRT_METRICS_MAX; t++)
1758
        if (metrics[0] & (1 << t))
1759
          {
1760
            ea->attrs[n].id = EA_CODE(EAP_KRT, KRT_METRICS_OFFSET + t);
1761
            ea->attrs[n].flags = 0;
1762
            ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */
1763
            ea->attrs[n].u.data = metrics[t];
1764
            n++;
1765
          }
1766

    
1767
      if (n > 0)
1768
        {
1769
          ea->next = ra->eattrs;
1770
          ea->flags = EALF_SORTED;
1771
          ea->count = n;
1772
          ra->eattrs = ea;
1773
        }
1774
    }
1775

    
1776
  /*
1777
   * Ideally, now we would send the received route to the rest of kernel code.
1778
   * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we
1779
   * postpone it and merge next hops until the end of the sequence. Note that
1780
   * proper multipath updates are rejected by nl_mergable_route(), so it is
1781
   * always the first case for them.
1782
   */
1783

    
1784
  if (!s->net)
1785
  {
1786
    /* Store the new route */
1787
    s->net = net;
1788
    s->attrs = ra;
1789
    s->proto = p;
1790
    s->new = new;
1791
    s->krt_src = src;
1792
    s->krt_type = i->rtm_type;
1793
    s->krt_proto = i->rtm_protocol;
1794
    s->krt_metric = priority;
1795
  }
1796
  else
1797
  {
1798
    /* Merge next hops with the stored route */
1799
    rta *oa = s->attrs;
1800

    
1801
    struct nexthop *nhs = &oa->nh;
1802
    nexthop_insert(&nhs, &ra->nh);
1803

    
1804
    /* Perhaps new nexthop is inserted at the first position */
1805
    if (nhs == &ra->nh)
1806
    {
1807
      /* Swap rtas */
1808
      s->attrs = ra;
1809

    
1810
      /* Keep old eattrs */
1811
      ra->eattrs = oa->eattrs;
1812
    }
1813
  }
1814
}
1815

    
1816
void
1817
krt_do_scan(struct krt_proto *p UNUSED)        /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
1818
{
1819
  struct nlmsghdr *h;
1820
  struct nl_parse_state s;
1821

    
1822
  nl_parse_begin(&s, 1, 0);
1823
  nl_request_dump(AF_INET, RTM_GETROUTE);
1824
  while (h = nl_get_scan())
1825
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1826
      nl_parse_route(&s, h);
1827
    else
1828
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1829
  nl_parse_end(&s);
1830

    
1831
  nl_parse_begin(&s, 1, 1);
1832
  nl_request_dump(AF_INET6, RTM_GETROUTE);
1833
  while (h = nl_get_scan())
1834
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1835
      nl_parse_route(&s, h);
1836
    else
1837
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1838
  nl_parse_end(&s);
1839

    
1840
#ifdef HAVE_MPLS_KERNEL
1841
  nl_parse_begin(&s, 1, 1);
1842
  nl_request_dump(AF_MPLS, RTM_GETROUTE);
1843
  while (h = nl_get_scan())
1844
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1845
      nl_parse_route(&s, h);
1846
    else
1847
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1848
  nl_parse_end(&s);
1849
#endif
1850
}
1851

    
1852
/*
1853
 *        Asynchronous Netlink interface
1854
 */
1855

    
1856
static sock *nl_async_sk;                /* BIRD socket for asynchronous notifications */
1857
static byte *nl_async_rx_buffer;        /* Receive buffer */
1858

    
1859
static void
1860
nl_async_msg(struct nlmsghdr *h)
1861
{
1862
  struct nl_parse_state s;
1863

    
1864
  switch (h->nlmsg_type)
1865
    {
1866
    case RTM_NEWROUTE:
1867
    case RTM_DELROUTE:
1868
      DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
1869
      nl_parse_begin(&s, 0, 0);
1870
      nl_parse_route(&s, h);
1871
      nl_parse_end(&s);
1872
      break;
1873
    case RTM_NEWLINK:
1874
    case RTM_DELLINK:
1875
      DBG("KRT: Received async link notification (%d)\n", h->nlmsg_type);
1876
      if (kif_proto)
1877
        nl_parse_link(h, 0);
1878
      break;
1879
    case RTM_NEWADDR:
1880
    case RTM_DELADDR:
1881
      DBG("KRT: Received async address notification (%d)\n", h->nlmsg_type);
1882
      if (kif_proto)
1883
        nl_parse_addr(h, 0);
1884
      break;
1885
    default:
1886
      DBG("KRT: Received unknown async notification (%d)\n", h->nlmsg_type);
1887
    }
1888
}
1889

    
1890
static int
1891
nl_async_hook(sock *sk, uint size UNUSED)
1892
{
1893
  struct iovec iov = { nl_async_rx_buffer, NL_RX_SIZE };
1894
  struct sockaddr_nl sa;
1895
  struct msghdr m = {
1896
    .msg_name = &sa,
1897
    .msg_namelen = sizeof(sa),
1898
    .msg_iov = &iov,
1899
    .msg_iovlen = 1,
1900
  };
1901
  struct nlmsghdr *h;
1902
  int x;
1903
  uint len;
1904

    
1905
  x = recvmsg(sk->fd, &m, 0);
1906
  if (x < 0)
1907
    {
1908
      if (errno == ENOBUFS)
1909
        {
1910
          /*
1911
           *  Netlink reports some packets have been thrown away.
1912
           *  One day we might react to it by asking for route table
1913
           *  scan in near future.
1914
           */
1915
          log(L_WARN "Kernel dropped some netlink messages, will resync on next scan.");
1916
          return 1;        /* More data are likely to be ready */
1917
        }
1918
      else if (errno != EWOULDBLOCK)
1919
        log(L_ERR "Netlink recvmsg: %m");
1920
      return 0;
1921
    }
1922
  if (sa.nl_pid)                /* It isn't from the kernel */
1923
    {
1924
      DBG("Non-kernel packet\n");
1925
      return 1;
1926
    }
1927
  h = (void *) nl_async_rx_buffer;
1928
  len = x;
1929
  if (m.msg_flags & MSG_TRUNC)
1930
    {
1931
      log(L_WARN "Netlink got truncated asynchronous message");
1932
      return 1;
1933
    }
1934
  while (NLMSG_OK(h, len))
1935
    {
1936
      nl_async_msg(h);
1937
      h = NLMSG_NEXT(h, len);
1938
    }
1939
  if (len)
1940
    log(L_WARN "nl_async_hook: Found packet remnant of size %d", len);
1941
  return 1;
1942
}
1943

    
1944
static void
1945
nl_async_err_hook(sock *sk, int e UNUSED)
1946
{
1947
  nl_async_hook(sk, 0);
1948
}
1949

    
1950
static void
1951
nl_open_async(void)
1952
{
1953
  sock *sk;
1954
  struct sockaddr_nl sa;
1955
  int fd;
1956

    
1957
  if (nl_async_sk)
1958
    return;
1959

    
1960
  DBG("KRT: Opening async netlink socket\n");
1961

    
1962
  fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
1963
  if (fd < 0)
1964
    {
1965
      log(L_ERR "Unable to open asynchronous rtnetlink socket: %m");
1966
      return;
1967
    }
1968

    
1969
  bzero(&sa, sizeof(sa));
1970
  sa.nl_family = AF_NETLINK;
1971
  sa.nl_groups = RTMGRP_LINK |
1972
    RTMGRP_IPV4_IFADDR | RTMGRP_IPV4_ROUTE |
1973
    RTMGRP_IPV6_IFADDR | RTMGRP_IPV6_ROUTE;
1974

    
1975
  if (bind(fd, (struct sockaddr *) &sa, sizeof(sa)) < 0)
1976
    {
1977
      log(L_ERR "Unable to bind asynchronous rtnetlink socket: %m");
1978
      close(fd);
1979
      return;
1980
    }
1981

    
1982
  nl_async_rx_buffer = xmalloc(NL_RX_SIZE);
1983

    
1984
  sk = nl_async_sk = sk_new(krt_pool);
1985
  sk->type = SK_MAGIC;
1986
  sk->rx_hook = nl_async_hook;
1987
  sk->err_hook = nl_async_err_hook;
1988
  sk->fd = fd;
1989
  if (sk_open(sk) < 0)
1990
    bug("Netlink: sk_open failed");
1991
}
1992

    
1993

    
1994
/*
1995
 *        Interface to the UNIX krt module
1996
 */
1997

    
1998
void
1999
krt_sys_io_init(void)
2000
{
2001
  nl_linpool = lp_new_default(krt_pool);
2002
  HASH_INIT(nl_table_map, krt_pool, 6);
2003
}
2004

    
2005
int
2006
krt_sys_start(struct krt_proto *p)
2007
{
2008
  struct krt_proto *old = HASH_FIND(nl_table_map, RTH, p->af, krt_table_id(p));
2009

    
2010
  if (old)
2011
    {
2012
      log(L_ERR "%s: Kernel table %u already registered by %s",
2013
          p->p.name, krt_table_id(p), old->p.name);
2014
      return 0;
2015
    }
2016

    
2017
  HASH_INSERT2(nl_table_map, RTH, krt_pool, p);
2018

    
2019
  nl_open();
2020
  nl_open_async();
2021

    
2022
  return 1;
2023
}
2024

    
2025
void
2026
krt_sys_shutdown(struct krt_proto *p)
2027
{
2028
  HASH_REMOVE2(nl_table_map, RTH, krt_pool, p);
2029
}
2030

    
2031
int
2032
krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o)
2033
{
2034
  return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric);
2035
}
2036

    
2037
void
2038
krt_sys_init_config(struct krt_config *cf)
2039
{
2040
  cf->sys.table_id = RT_TABLE_MAIN;
2041
  cf->sys.metric = 32;
2042
}
2043

    
2044
void
2045
krt_sys_copy_config(struct krt_config *d, struct krt_config *s)
2046
{
2047
  d->sys.table_id = s->sys.table_id;
2048
  d->sys.metric = s->sys.metric;
2049
}
2050

    
2051
/*
 * Display names of the kernel route metrics, indexed by metric number.
 * Index 0 has no name.
 */
static const char *krt_metrics_names[KRT_METRICS_MAX] = {
  [0]  = NULL,
  [1]  = "lock",
  [2]  = "mtu",
  [3]  = "window",
  [4]  = "rtt",
  [5]  = "rttvar",
  [6]  = "sstresh",
  [7]  = "cwnd",
  [8]  = "advmss",
  [9]  = "reordering",
  [10] = "hoplimit",
  [11] = "initcwnd",
  [12] = "features",
  [13] = "rto_min",
  [14] = "initrwnd",
  [15] = "quickack"
};
2055

    
2056
/*
 * Display names of the bits of the "features" metric, indexed by bit number.
 * Bits without an entry stay NULL.
 */
static const char *krt_features_names[KRT_FEATURES_MAX] = {
  [0] = "ecn",
  [3] = "allfrag"
};
2059

    
2060
int
2061
krt_sys_get_attr(eattr *a, byte *buf, int buflen UNUSED)
2062
{
2063
  switch (a->id)
2064
  {
2065
  case EA_KRT_PREFSRC:
2066
    bsprintf(buf, "prefsrc");
2067
    return GA_NAME;
2068

    
2069
  case EA_KRT_REALM:
2070
    bsprintf(buf, "realm");
2071
    return GA_NAME;
2072

    
2073
  case EA_KRT_SCOPE:
2074
    bsprintf(buf, "scope");
2075
    return GA_NAME;
2076

    
2077
  case EA_KRT_LOCK:
2078
    buf += bsprintf(buf, "lock:");
2079
    ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
2080
    return GA_FULL;
2081

    
2082
  case EA_KRT_FEATURES:
2083
    buf += bsprintf(buf, "features:");
2084
    ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX);
2085
    return GA_FULL;
2086

    
2087
  default:;
2088
    int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET;
2089
    if (id > 0 && id < KRT_METRICS_MAX)
2090
    {
2091
      bsprintf(buf, "%s", krt_metrics_names[id]);
2092
      return GA_NAME;
2093
    }
2094

    
2095
    return GA_UNKNOWN;
2096
  }
2097
}
2098

    
2099

    
2100

    
2101
void
2102
kif_sys_start(struct kif_proto *p UNUSED)
2103
{
2104
  nl_open();
2105
  nl_open_async();
2106
}
2107

    
2108
void
2109
kif_sys_shutdown(struct kif_proto *p UNUSED)
2110
{
2111
}
2112

    
2113
int
2114
kif_update_sysdep_addr(struct iface *i UNUSED)
2115
{
2116
  return 0;
2117
}