Statistics
| Branch: | Revision:

iof-bird / bird-2.0.1 / sysdep / linux / netlink.c @ 6b3f1a54

History | View | Annotate | Download (48.9 KB)

1
/*
2
 *        BIRD -- Linux Netlink Interface
3
 *
4
 *        (c) 1999--2000 Martin Mares <mj@ucw.cz>
5
 *
6
 *        Can be freely distributed and used under the terms of the GNU GPL.
7
 */
8

    
9
#include <alloca.h>
10
#include <stdio.h>
11
#include <unistd.h>
12
#include <fcntl.h>
13
#include <sys/socket.h>
14
#include <sys/uio.h>
15
#include <errno.h>
16

    
17
#undef LOCAL_DEBUG
18

    
19
#include "nest/bird.h"
20
#include "nest/route.h"
21
#include "nest/protocol.h"
22
#include "nest/iface.h"
23
#include "lib/alloca.h"
24
#include "sysdep/unix/unix.h"
25
#include "sysdep/unix/krt.h"
26
#include "lib/socket.h"
27
#include "lib/string.h"
28
#include "lib/hash.h"
29
#include "conf/conf.h"
30

    
31
#include <asm/types.h>
32
#include <linux/if.h>
33
#include <linux/netlink.h>
34
#include <linux/rtnetlink.h>
35

    
36
#ifdef HAVE_MPLS_KERNEL
37
#include <linux/lwtunnel.h>
38
#endif
39

    
40
/*
 * Fallback definitions for constants that older libc / kernel headers may
 * not provide. Each one is defined only if the system headers did not
 * already define it.
 */

#ifndef MSG_TRUNC                        /* Hack: Several versions of glibc miss this one :( */
#define MSG_TRUNC 0x20
#endif

#ifndef IFA_FLAGS
#define IFA_FLAGS 8
#endif

#ifndef IFF_LOWER_UP
#define IFF_LOWER_UP 0x10000
#endif

#ifndef RTA_TABLE
#define RTA_TABLE  15
#endif

#ifndef RTA_VIA
#define RTA_VIA         18
#endif

#ifndef RTA_NEWDST
#define RTA_NEWDST  19
#endif

#ifndef RTA_ENCAP_TYPE
#define RTA_ENCAP_TYPE        21
#endif

#ifndef RTA_ENCAP
#define RTA_ENCAP  22
#endif

/* Nonzero when the `af' member of the object pointed to by p is AF_INET6 */
#define krt_ecmp6(p) ((p)->af == AF_INET6)

/* NOTE(review): presumably the default ECMP next-hop limit; confirm against users */
const int rt_default_ecmp = 16;
75

    
76
/*
 * Structure nl_parse_state keeps state of received route processing. Ideally,
 * we could just independently parse received Netlink messages and immediately
 * propagate received routes to the rest of BIRD, but older Linux kernels (before
 * version 4.11) represent and announce IPv6 ECMP routes not as one route with
 * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of
 * routes with the same prefix. More recent kernels work as with IPv4.
83
 *
84
 * Therefore, BIRD keeps currently processed route in nl_parse_state structure
85
 * and postpones its propagation until we expect it to be final; i.e., when
86
 * non-matching route is received or when the scan ends. When another matching
87
 * route is received, it is merged with the already processed route to form an
88
 * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the
89
 * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6
90
 * routes with RTA_MULTIPATH set are just considered non-matching.
91
 *
92
 * This is ignored for asynchronous notifications (every notification is handled
93
 * as a separate route). It is not an issue for our routes, as we ignore such
94
 * notifications anyways. But importing alien IPv6 ECMP routes does not work
95
 * properly with older kernels.
96
 *
97
 * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes
98
 * for the same prefix.
99
 */
100

    
101
struct nl_parse_state
102
{
103
  struct linpool *pool;
104
  int scan;
105
  int merge;
106

    
107
  net *net;
108
  rta *attrs;
109
  struct krt_proto *proto;
110
  s8 new;
111
  s8 krt_src;
112
  u8 krt_type;
113
  u8 krt_proto;
114
  u32 krt_metric;
115
};
116

    
117
/*
118
 *        Synchronous Netlink interface
119
 */
120

    
121
struct nl_sock
122
{
123
  int fd;
124
  u32 seq;
125
  byte *rx_buffer;                        /* Receive buffer */
126
  struct nlmsghdr *last_hdr;                /* Recently received packet */
127
  uint last_size;
128
};
129

    
130
#define NL_RX_SIZE 8192
131

    
132
#define NL_OP_DELETE        0
133
#define NL_OP_ADD        (NLM_F_CREATE|NLM_F_EXCL)
134
#define NL_OP_REPLACE        (NLM_F_CREATE|NLM_F_REPLACE)
135
#define NL_OP_APPEND        (NLM_F_CREATE|NLM_F_APPEND)
136

    
137
static linpool *nl_linpool;
138

    
139
static struct nl_sock nl_scan = {.fd = -1};        /* Netlink socket for synchronous scan */
140
static struct nl_sock nl_req  = {.fd = -1};        /* Netlink socket for requests */
141

    
142
static void
143
nl_open_sock(struct nl_sock *nl)
144
{
145
  if (nl->fd < 0)
146
    {
147
      nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
148
      if (nl->fd < 0)
149
        die("Unable to open rtnetlink socket: %m");
150
      nl->seq = (u32) (current_time() TO_S); /* Or perhaps random_u32() ? */
151
      nl->rx_buffer = xmalloc(NL_RX_SIZE);
152
      nl->last_hdr = NULL;
153
      nl->last_size = 0;
154
    }
155
}
156

    
157
static void
158
nl_open(void)
159
{
160
  nl_open_sock(&nl_scan);
161
  nl_open_sock(&nl_req);
162
}
163

    
164
static void
165
nl_send(struct nl_sock *nl, struct nlmsghdr *nh)
166
{
167
  struct sockaddr_nl sa;
168

    
169
  memset(&sa, 0, sizeof(sa));
170
  sa.nl_family = AF_NETLINK;
171
  nh->nlmsg_pid = 0;
172
  nh->nlmsg_seq = ++(nl->seq);
173
  if (sendto(nl->fd, nh, nh->nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa)) < 0)
174
    die("rtnetlink sendto: %m");
175
  nl->last_hdr = NULL;
176
}
177

    
178
static void
179
nl_request_dump(int af, int cmd)
180
{
181
  struct {
182
    struct nlmsghdr nh;
183
    struct rtgenmsg g;
184
  } req = {
185
    .nh.nlmsg_type = cmd,
186
    .nh.nlmsg_len = sizeof(req),
187
    .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
188
    .g.rtgen_family = af
189
  };
190
  nl_send(&nl_scan, &req.nh);
191
}
192

    
193
static struct nlmsghdr *
194
nl_get_reply(struct nl_sock *nl)
195
{
196
  for(;;)
197
    {
198
      if (!nl->last_hdr)
199
        {
200
          struct iovec iov = { nl->rx_buffer, NL_RX_SIZE };
201
          struct sockaddr_nl sa;
202
          struct msghdr m = {
203
            .msg_name = &sa,
204
            .msg_namelen = sizeof(sa),
205
            .msg_iov = &iov,
206
            .msg_iovlen = 1,
207
          };
208
          int x = recvmsg(nl->fd, &m, 0);
209
          if (x < 0)
210
            die("nl_get_reply: %m");
211
          if (sa.nl_pid)                /* It isn't from the kernel */
212
            {
213
              DBG("Non-kernel packet\n");
214
              continue;
215
            }
216
          nl->last_size = x;
217
          nl->last_hdr = (void *) nl->rx_buffer;
218
          if (m.msg_flags & MSG_TRUNC)
219
            bug("nl_get_reply: got truncated reply which should be impossible");
220
        }
221
      if (NLMSG_OK(nl->last_hdr, nl->last_size))
222
        {
223
          struct nlmsghdr *h = nl->last_hdr;
224
          nl->last_hdr = NLMSG_NEXT(h, nl->last_size);
225
          if (h->nlmsg_seq != nl->seq)
226
            {
227
              log(L_WARN "nl_get_reply: Ignoring out of sequence netlink packet (%x != %x)",
228
                  h->nlmsg_seq, nl->seq);
229
              continue;
230
            }
231
          return h;
232
        }
233
      if (nl->last_size)
234
        log(L_WARN "nl_get_reply: Found packet remnant of size %d", nl->last_size);
235
      nl->last_hdr = NULL;
236
    }
237
}
238

    
239
static struct tbf rl_netlink_err = TBF_DEFAULT_LOG_LIMITS;
240

    
241
static int
242
nl_error(struct nlmsghdr *h, int ignore_esrch)
243
{
244
  struct nlmsgerr *e;
245
  int ec;
246

    
247
  if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr)))
248
    {
249
      log(L_WARN "Netlink: Truncated error message received");
250
      return ENOBUFS;
251
    }
252
  e = (struct nlmsgerr *) NLMSG_DATA(h);
253
  ec = -e->error;
254
  if (ec && !(ignore_esrch && (ec == ESRCH)))
255
    log_rl(&rl_netlink_err, L_WARN "Netlink: %s", strerror(ec));
256
  return ec;
257
}
258

    
259
static struct nlmsghdr *
260
nl_get_scan(void)
261
{
262
  struct nlmsghdr *h = nl_get_reply(&nl_scan);
263

    
264
  if (h->nlmsg_type == NLMSG_DONE)
265
    return NULL;
266
  if (h->nlmsg_type == NLMSG_ERROR)
267
    {
268
      nl_error(h, 0);
269
      return NULL;
270
    }
271
  return h;
272
}
273

    
274
static int
275
nl_exchange(struct nlmsghdr *pkt, int ignore_esrch)
276
{
277
  struct nlmsghdr *h;
278

    
279
  nl_send(&nl_req, pkt);
280
  for(;;)
281
    {
282
      h = nl_get_reply(&nl_req);
283
      if (h->nlmsg_type == NLMSG_ERROR)
284
        break;
285
      log(L_WARN "nl_exchange: Unexpected reply received");
286
    }
287
  return nl_error(h, ignore_esrch) ? -1 : 0;
288
}
289

    
290
/*
291
 *        Netlink attributes
292
 */
293

    
294
static int nl_attr_len;
295

    
296
static void *
297
nl_checkin(struct nlmsghdr *h, int lsize)
298
{
299
  nl_attr_len = h->nlmsg_len - NLMSG_LENGTH(lsize);
300
  if (nl_attr_len < 0)
301
    {
302
      log(L_ERR "nl_checkin: underrun by %d bytes", -nl_attr_len);
303
      return NULL;
304
    }
305
  return NLMSG_DATA(h);
306
}
307

    
308
struct nl_want_attrs {
309
  u8 defined:1;
310
  u8 checksize:1;
311
  u8 size;
312
};
313

    
314

    
315
#define BIRD_IFLA_MAX (IFLA_WIRELESS+1)
316

    
317
static struct nl_want_attrs ifla_attr_want[BIRD_IFLA_MAX] = {
318
  [IFLA_IFNAME]          = { 1, 0, 0 },
319
  [IFLA_MTU]          = { 1, 1, sizeof(u32) },
320
  [IFLA_MASTER]          = { 1, 1, sizeof(u32) },
321
  [IFLA_WIRELESS] = { 1, 0, 0 },
322
};
323

    
324

    
325
#define BIRD_IFA_MAX  (IFA_FLAGS+1)
326

    
327
static struct nl_want_attrs ifa_attr_want4[BIRD_IFA_MAX] = {
328
  [IFA_ADDRESS]          = { 1, 1, sizeof(ip4_addr) },
329
  [IFA_LOCAL]          = { 1, 1, sizeof(ip4_addr) },
330
  [IFA_BROADCAST] = { 1, 1, sizeof(ip4_addr) },
331
  [IFA_FLAGS]     = { 1, 1, sizeof(u32) },
332
};
333

    
334
static struct nl_want_attrs ifa_attr_want6[BIRD_IFA_MAX] = {
335
  [IFA_ADDRESS]          = { 1, 1, sizeof(ip6_addr) },
336
  [IFA_LOCAL]          = { 1, 1, sizeof(ip6_addr) },
337
  [IFA_FLAGS]          = { 1, 1, sizeof(u32) },
338
};
339

    
340

    
341
#define BIRD_RTA_MAX  (RTA_ENCAP+1)
342

    
343
static struct nl_want_attrs nexthop_attr_want4[BIRD_RTA_MAX] = {
344
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip4_addr) },
345
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
346
  [RTA_ENCAP]          = { 1, 0, 0 },
347
};
348

    
349
static struct nl_want_attrs nexthop_attr_want6[BIRD_RTA_MAX] = {
350
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip6_addr) },
351
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
352
  [RTA_ENCAP]          = { 1, 0, 0 },
353
};
354

    
355
#ifdef HAVE_MPLS_KERNEL
356
static struct nl_want_attrs encap_mpls_want[BIRD_RTA_MAX] = {
357
  [RTA_DST]       = { 1, 0, 0 },
358
};
359
#endif
360

    
361
static struct nl_want_attrs rtm_attr_want4[BIRD_RTA_MAX] = {
362
  [RTA_DST]          = { 1, 1, sizeof(ip4_addr) },
363
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
364
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip4_addr) },
365
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
366
  [RTA_PREFSRC]          = { 1, 1, sizeof(ip4_addr) },
367
  [RTA_METRICS]          = { 1, 0, 0 },
368
  [RTA_MULTIPATH] = { 1, 0, 0 },
369
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
370
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
371
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
372
  [RTA_ENCAP]          = { 1, 0, 0 },
373
};
374

    
375
static struct nl_want_attrs rtm_attr_want6[BIRD_RTA_MAX] = {
376
  [RTA_DST]          = { 1, 1, sizeof(ip6_addr) },
377
  [RTA_IIF]          = { 1, 1, sizeof(u32) },
378
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
379
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip6_addr) },
380
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
381
  [RTA_PREFSRC]          = { 1, 1, sizeof(ip6_addr) },
382
  [RTA_METRICS]          = { 1, 0, 0 },
383
  [RTA_MULTIPATH] = { 1, 0, 0 },
384
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
385
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
386
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
387
  [RTA_ENCAP]          = { 1, 0, 0 },
388
};
389

    
390
#ifdef HAVE_MPLS_KERNEL
391
static struct nl_want_attrs rtm_attr_want_mpls[BIRD_RTA_MAX] = {
392
  [RTA_DST]          = { 1, 1, sizeof(u32) },
393
  [RTA_IIF]          = { 1, 1, sizeof(u32) },
394
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
395
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
396
  [RTA_METRICS]          = { 1, 0, 0 },
397
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
398
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
399
  [RTA_VIA]          = { 1, 0, 0 },
400
  [RTA_NEWDST]          = { 1, 0, 0 },
401
};
402
#endif
403

    
404

    
405
static int
406
nl_parse_attrs(struct rtattr *a, struct nl_want_attrs *want, struct rtattr **k, int ksize)
407
{
408
  int max = ksize / sizeof(struct rtattr *);
409
  bzero(k, ksize);
410

    
411
  for ( ; RTA_OK(a, nl_attr_len); a = RTA_NEXT(a, nl_attr_len))
412
    {
413
      if ((a->rta_type >= max) || !want[a->rta_type].defined)
414
        continue;
415

    
416
      if (want[a->rta_type].checksize && (RTA_PAYLOAD(a) != want[a->rta_type].size))
417
        {
418
          log(L_ERR "nl_parse_attrs: Malformed attribute received");
419
          return 0;
420
        }
421

    
422
      k[a->rta_type] = a;
423
    }
424

    
425
  if (nl_attr_len)
426
    {
427
      log(L_ERR "nl_parse_attrs: remnant of size %d", nl_attr_len);
428
      return 0;
429
    }
430

    
431
  return 1;
432
}
433

    
434
/* Read the payload of attribute @a as a u16 in host byte order (no length check) */
static inline u16 rta_get_u16(struct rtattr *a)
{
  u16 *val = RTA_DATA(a);
  return *val;
}
436

    
437
/* Read the payload of attribute @a as a u32 in host byte order (no length check) */
static inline u32 rta_get_u32(struct rtattr *a)
{
  u32 *val = RTA_DATA(a);
  return *val;
}
439

    
440
/* Read the payload of attribute @a as an IPv4 address, converted by ip4_ntoh() */
static inline ip4_addr rta_get_ip4(struct rtattr *a)
{
  ip4_addr *raw = RTA_DATA(a);
  return ip4_ntoh(*raw);
}
442

    
443
/* Read the payload of attribute @a as an IPv6 address, converted by ip6_ntoh() */
static inline ip6_addr rta_get_ip6(struct rtattr *a)
{
  ip6_addr *raw = RTA_DATA(a);
  return ip6_ntoh(*raw);
}
445

    
446
static inline ip_addr rta_get_ipa(struct rtattr *a)
447
{
448
  if (RTA_PAYLOAD(a) == sizeof(ip4_addr))
449
    return ipa_from_ip4(rta_get_ip4(a));
450
  else
451
    return ipa_from_ip6(rta_get_ip6(a));
452
}
453

    
454
#ifdef HAVE_MPLS_KERNEL
455
static inline ip_addr rta_get_via(struct rtattr *a)
456
{
457
  struct rtvia *v = RTA_DATA(a);
458
  switch(v->rtvia_family) {
459
    case AF_INET:  return ipa_from_ip4(ip4_ntoh(*(ip4_addr *) v->rtvia_addr));
460
    case AF_INET6: return ipa_from_ip6(ip6_ntoh(*(ip6_addr *) v->rtvia_addr));
461
  }
462
  return IPA_NONE;
463
}
464

    
465
static u32 rta_mpls_stack[MPLS_MAX_LABEL_STACK];

/*
 * Decode the MPLS label stack carried in attribute @a into @stack using
 * mpls_get() and return its result. A payload length that is not a
 * multiple of four is logged and rounded down.
 */
static inline int rta_get_mpls(struct rtattr *a, u32 *stack)
{
  int misaligned = RTA_PAYLOAD(a) % 4;

  if (misaligned)
    log(L_WARN "KRT: Strange length of received MPLS stack: %u", RTA_PAYLOAD(a));

  return mpls_get(RTA_DATA(a), RTA_PAYLOAD(a) & ~0x3, stack);
}
473
#endif
474

    
475
struct rtattr *
476
nl_add_attr(struct nlmsghdr *h, uint bufsize, uint code, const void *data, uint dlen)
477
{
478
  uint pos = NLMSG_ALIGN(h->nlmsg_len);
479
  uint len = RTA_LENGTH(dlen);
480

    
481
  if (pos + len > bufsize)
482
    bug("nl_add_attr: packet buffer overflow");
483

    
484
  struct rtattr *a = (struct rtattr *)((char *)h + pos);
485
  a->rta_type = code;
486
  a->rta_len = len;
487
  h->nlmsg_len = pos + len;
488

    
489
  if (dlen > 0)
490
    memcpy(RTA_DATA(a), data, dlen);
491

    
492
  return a;
493
}
494

    
495
static inline struct rtattr *
496
nl_open_attr(struct nlmsghdr *h, uint bufsize, uint code)
497
{
498
  return nl_add_attr(h, bufsize, code, NULL, 0);
499
}
500

    
501
/*
 * Finish a nested attribute opened by nl_open_attr(): its length becomes
 * the distance from its header to the (aligned) current end of message @h.
 */
static inline void
nl_close_attr(struct nlmsghdr *h, struct rtattr *a)
{
  char *msg_end = (char *) h + NLMSG_ALIGN(h->nlmsg_len);
  a->rta_len = msg_end - (char *) a;
}
506

    
507
static inline void
508
nl_add_attr_u16(struct nlmsghdr *h, uint bufsize, int code, u16 data)
509
{
510
  nl_add_attr(h, bufsize, code, &data, 2);
511
}
512

    
513
static inline void
514
nl_add_attr_u32(struct nlmsghdr *h, uint bufsize, int code, u32 data)
515
{
516
  nl_add_attr(h, bufsize, code, &data, 4);
517
}
518

    
519
static inline void
520
nl_add_attr_ip4(struct nlmsghdr *h, uint bufsize, int code, ip4_addr ip4)
521
{
522
  ip4 = ip4_hton(ip4);
523
  nl_add_attr(h, bufsize, code, &ip4, sizeof(ip4));
524
}
525

    
526
static inline void
527
nl_add_attr_ip6(struct nlmsghdr *h, uint bufsize, int code, ip6_addr ip6)
528
{
529
  ip6 = ip6_hton(ip6);
530
  nl_add_attr(h, bufsize, code, &ip6, sizeof(ip6));
531
}
532

    
533
static inline void
534
nl_add_attr_ipa(struct nlmsghdr *h, uint bufsize, int code, ip_addr ipa)
535
{
536
  if (ipa_is_ip4(ipa))
537
    nl_add_attr_ip4(h, bufsize, code, ipa_to_ip4(ipa));
538
  else
539
    nl_add_attr_ip6(h, bufsize, code, ipa_to_ip6(ipa));
540
}
541

    
542
#ifdef HAVE_MPLS_KERNEL
543
static inline void
544
nl_add_attr_mpls(struct nlmsghdr *h, uint bufsize, int code, int len, u32 *stack)
545
{
546
  char buf[len*4];
547
  mpls_put(buf, len, stack);
548
  nl_add_attr(h, bufsize, code, buf, len*4);
549
}
550

    
551
static inline void
552
nl_add_attr_mpls_encap(struct nlmsghdr *h, uint bufsize, int len, u32 *stack)
553
{
554
  nl_add_attr_u16(h, bufsize, RTA_ENCAP_TYPE, LWTUNNEL_ENCAP_MPLS);
555

    
556
  struct rtattr *nest = nl_open_attr(h, bufsize, RTA_ENCAP);
557
  nl_add_attr_mpls(h, bufsize, RTA_DST, len, stack);
558
  nl_close_attr(h, nest);
559
}
560

    
561
static inline void
562
nl_add_attr_via(struct nlmsghdr *h, uint bufsize, ip_addr ipa)
563
{
564
  struct rtvia *via = alloca(sizeof(struct rtvia) + 16);
565

    
566
  if (ipa_is_ip4(ipa))
567
  {
568
    via->rtvia_family = AF_INET;
569
    put_ip4(via->rtvia_addr, ipa_to_ip4(ipa));
570
    nl_add_attr(h, bufsize, RTA_VIA, via, sizeof(struct rtvia) + 4);
571
  }
572
  else
573
  {
574
    via->rtvia_family = AF_INET6;
575
    put_ip6(via->rtvia_addr, ipa_to_ip6(ipa));
576
    nl_add_attr(h, bufsize, RTA_VIA, via, sizeof(struct rtvia) + 16);
577
  }
578
}
579
#endif
580

    
581
static inline struct rtnexthop *
582
nl_open_nexthop(struct nlmsghdr *h, uint bufsize)
583
{
584
  uint pos = NLMSG_ALIGN(h->nlmsg_len);
585
  uint len = RTNH_LENGTH(0);
586

    
587
  if (pos + len > bufsize)
588
    bug("nl_open_nexthop: packet buffer overflow");
589

    
590
  h->nlmsg_len = pos + len;
591

    
592
  return (void *)h + pos;
593
}
594

    
595
/*
 * Finish a next hop opened by nl_open_nexthop(): its length becomes the
 * distance from its header to the (aligned) current end of message @h.
 */
static inline void
nl_close_nexthop(struct nlmsghdr *h, struct rtnexthop *nh)
{
  char *msg_end = (char *) h + NLMSG_ALIGN(h->nlmsg_len);
  nh->rtnh_len = msg_end - (char *) nh;
}
600

    
601
static inline void
602
nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af UNUSED)
603
{
604
#ifdef HAVE_MPLS_KERNEL
605
  if (nh->labels > 0)
606
    if (af == AF_MPLS)
607
      nl_add_attr_mpls(h, bufsize, RTA_NEWDST, nh->labels, nh->label);
608
    else
609
      nl_add_attr_mpls_encap(h, bufsize, nh->labels, nh->label);
610

    
611
  if (ipa_nonzero(nh->gw))
612
    if (af == AF_MPLS)
613
      nl_add_attr_via(h, bufsize, nh->gw);
614
    else
615
      nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw);
616
#else
617

    
618
  if (ipa_nonzero(nh->gw))
619
    nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw);
620
#endif
621
}
622

    
623
static void
624
nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af)
625
{
626
  struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH);
627

    
628
  for (; nh; nh = nh->next)
629
  {
630
    struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize);
631

    
632
    rtnh->rtnh_flags = 0;
633
    rtnh->rtnh_hops = nh->weight;
634
    rtnh->rtnh_ifindex = nh->iface->index;
635

    
636
    nl_add_nexthop(h, bufsize, nh, af);
637

    
638
    if (nh->flags & RNF_ONLINK)
639
      rtnh->rtnh_flags |= RTNH_F_ONLINK;
640

    
641
    nl_close_nexthop(h, rtnh);
642
  }
643

    
644
  nl_close_attr(h, a);
645
}
646

    
647
static struct nexthop *
648
nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, struct rtattr *ra, int af)
649
{
650
  struct rtattr *a[BIRD_RTA_MAX];
651
  struct rtnexthop *nh = RTA_DATA(ra);
652
  struct nexthop *rv, *first, **last;
653
  unsigned len = RTA_PAYLOAD(ra);
654

    
655
  first = NULL;
656
  last = &first;
657

    
658
  while (len)
659
    {
660
      /* Use RTNH_OK(nh,len) ?? */
661
      if ((len < sizeof(*nh)) || (len < nh->rtnh_len))
662
        return NULL;
663

    
664
      *last = rv = lp_allocz(s->pool, NEXTHOP_MAX_SIZE);
665
      last = &(rv->next);
666

    
667
      rv->weight = nh->rtnh_hops;
668
      rv->iface = if_find_by_index(nh->rtnh_ifindex);
669
      if (!rv->iface)
670
        return NULL;
671

    
672
      /* Nonexistent RTNH_PAYLOAD ?? */
673
      nl_attr_len = nh->rtnh_len - RTNH_LENGTH(0);
674
      switch (af)
675
        {
676
        case AF_INET:
677
          if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want4, a, sizeof(a)))
678
            return NULL;
679
          break;
680

    
681
        case AF_INET6:
682
          if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want6, a, sizeof(a)))
683
            return NULL;
684
          break;
685

    
686
        default:
687
          return NULL;
688
        }
689

    
690
      if (a[RTA_GATEWAY])
691
        {
692
          rv->gw = rta_get_ipa(a[RTA_GATEWAY]);
693

    
694
          if (nh->rtnh_flags & RTNH_F_ONLINK)
695
            rv->flags |= RNF_ONLINK;
696

    
697
          neighbor *nbr;
698
          nbr = neigh_find2(&p->p, &rv->gw, rv->iface,
699
                            (rv->flags & RNF_ONLINK) ? NEF_ONLINK : 0);
700
          if (!nbr || (nbr->scope == SCOPE_HOST))
701
            return NULL;
702
        }
703
      else
704
        rv->gw = IPA_NONE;
705

    
706
#ifdef HAVE_MPLS_KERNEL
707
      if (a[RTA_ENCAP_TYPE])
708
      {
709
        if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) {
710
          log(L_WARN "KRT: Unknown encapsulation method %d in multipath", rta_get_u16(a[RTA_ENCAP_TYPE]));
711
          return NULL;
712
        }
713

    
714
        struct rtattr *enca[BIRD_RTA_MAX];
715
        nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
716
        nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
717
        rv->labels = rta_get_mpls(enca[RTA_DST], rv->label);
718
        break;
719
      }
720
#endif
721

    
722

    
723
      len -= NLMSG_ALIGN(nh->rtnh_len);
724
      nh = RTNH_NEXT(nh);
725
    }
726

    
727
  return first;
728
}
729

    
730
static void
731
nl_add_metrics(struct nlmsghdr *h, uint bufsize, u32 *metrics, int max)
732
{
733
  struct rtattr *a = nl_open_attr(h, bufsize, RTA_METRICS);
734
  int t;
735

    
736
  for (t = 1; t < max; t++)
737
    if (metrics[0] & (1 << t))
738
      nl_add_attr_u32(h, bufsize, t, metrics[t]);
739

    
740
  nl_close_attr(h, a);
741
}
742

    
743
static int
744
nl_parse_metrics(struct rtattr *hdr, u32 *metrics, int max)
745
{
746
  struct rtattr *a = RTA_DATA(hdr);
747
  int len = RTA_PAYLOAD(hdr);
748

    
749
  metrics[0] = 0;
750
  for (; RTA_OK(a, len); a = RTA_NEXT(a, len))
751
  {
752
    if (a->rta_type == RTA_UNSPEC)
753
      continue;
754

    
755
    if (a->rta_type >= max)
756
      continue;
757

    
758
    if (RTA_PAYLOAD(a) != 4)
759
      return -1;
760

    
761
    metrics[0] |= 1 << a->rta_type;
762
    metrics[a->rta_type] = rta_get_u32(a);
763
  }
764

    
765
  if (len > 0)
766
    return -1;
767

    
768
  return 0;
769
}
770

    
771

    
772
/*
773
 *        Scanning of interfaces
774
 */
775

    
776
static void
777
nl_parse_link(struct nlmsghdr *h, int scan)
778
{
779
  struct ifinfomsg *i;
780
  struct rtattr *a[BIRD_IFLA_MAX];
781
  int new = h->nlmsg_type == RTM_NEWLINK;
782
  struct iface f = {};
783
  struct iface *ifi;
784
  char *name;
785
  u32 mtu, master = 0;
786
  uint fl;
787

    
788
  if (!(i = nl_checkin(h, sizeof(*i))) || !nl_parse_attrs(IFLA_RTA(i), ifla_attr_want, a, sizeof(a)))
789
    return;
790
  if (!a[IFLA_IFNAME] || (RTA_PAYLOAD(a[IFLA_IFNAME]) < 2) || !a[IFLA_MTU])
791
    {
792
      /*
793
       * IFLA_IFNAME and IFLA_MTU are required, in fact, but there may also come
794
       * a message with IFLA_WIRELESS set, where (e.g.) no IFLA_IFNAME exists.
795
       * We simply ignore all such messages with IFLA_WIRELESS without notice.
796
       */
797

    
798
      if (a[IFLA_WIRELESS])
799
        return;
800

    
801
      log(L_ERR "KIF: Malformed message received");
802
      return;
803
    }
804

    
805
  name = RTA_DATA(a[IFLA_IFNAME]);
806
  mtu = rta_get_u32(a[IFLA_MTU]);
807

    
808
  if (a[IFLA_MASTER])
809
    master = rta_get_u32(a[IFLA_MASTER]);
810

    
811
  ifi = if_find_by_index(i->ifi_index);
812
  if (!new)
813
    {
814
      DBG("KIF: IF%d(%s) goes down\n", i->ifi_index, name);
815
      if (!ifi)
816
        return;
817

    
818
      if_delete(ifi);
819
    }
820
  else
821
    {
822
      DBG("KIF: IF%d(%s) goes up (mtu=%d,flg=%x)\n", i->ifi_index, name, mtu, i->ifi_flags);
823
      if (ifi && strncmp(ifi->name, name, sizeof(ifi->name)-1))
824
        if_delete(ifi);
825

    
826
      strncpy(f.name, name, sizeof(f.name)-1);
827
      f.index = i->ifi_index;
828
      f.mtu = mtu;
829

    
830
      f.master_index = master;
831
      f.master = if_find_by_index(master);
832

    
833
      fl = i->ifi_flags;
834
      if (fl & IFF_UP)
835
        f.flags |= IF_ADMIN_UP;
836
      if (fl & IFF_LOWER_UP)
837
        f.flags |= IF_LINK_UP;
838
      if (fl & IFF_LOOPBACK)                /* Loopback */
839
        f.flags |= IF_MULTIACCESS | IF_LOOPBACK | IF_IGNORE;
840
      else if (fl & IFF_POINTOPOINT)        /* PtP */
841
        f.flags |= IF_MULTICAST;
842
      else if (fl & IFF_BROADCAST)        /* Broadcast */
843
        f.flags |= IF_MULTIACCESS | IF_BROADCAST | IF_MULTICAST;
844
      else
845
        f.flags |= IF_MULTIACCESS;        /* NBMA */
846

    
847
      if (fl & IFF_MULTICAST)
848
        f.flags |= IF_MULTICAST;
849

    
850
      ifi = if_update(&f);
851

    
852
      if (!scan)
853
        if_end_partial_update(ifi);
854
    }
855
}
856

    
857
static void
858
nl_parse_addr4(struct ifaddrmsg *i, int scan, int new)
859
{
860
  struct rtattr *a[BIRD_IFA_MAX];
861
  struct iface *ifi;
862
  u32 ifa_flags;
863
  int scope;
864

    
865
  if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want4, a, sizeof(a)))
866
    return;
867

    
868
  if (!a[IFA_LOCAL])
869
    {
870
      log(L_ERR "KIF: Malformed message received (missing IFA_LOCAL)");
871
      return;
872
    }
873
  if (!a[IFA_ADDRESS])
874
    {
875
      log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
876
      return;
877
    }
878

    
879
  ifi = if_find_by_index(i->ifa_index);
880
  if (!ifi)
881
    {
882
      log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
883
      return;
884
    }
885

    
886
  if (a[IFA_FLAGS])
887
    ifa_flags = rta_get_u32(a[IFA_FLAGS]);
888
  else
889
    ifa_flags = i->ifa_flags;
890

    
891
  struct ifa ifa;
892
  bzero(&ifa, sizeof(ifa));
893
  ifa.iface = ifi;
894
  if (ifa_flags & IFA_F_SECONDARY)
895
    ifa.flags |= IA_SECONDARY;
896

    
897
  ifa.ip = rta_get_ipa(a[IFA_LOCAL]);
898

    
899
  if (i->ifa_prefixlen > IP4_MAX_PREFIX_LENGTH)
900
    {
901
      log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
902
      new = 0;
903
    }
904
  if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH)
905
    {
906
      ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
907
      net_fill_ip4(&ifa.prefix, rta_get_ip4(a[IFA_ADDRESS]), i->ifa_prefixlen);
908

    
909
      /* It is either a host address or a peer address */
910
      if (ipa_equal(ifa.ip, ifa.brd))
911
        ifa.flags |= IA_HOST;
912
      else
913
        {
914
          ifa.flags |= IA_PEER;
915
          ifa.opposite = ifa.brd;
916
        }
917
    }
918
  else
919
    {
920
      net_fill_ip4(&ifa.prefix, ipa_to_ip4(ifa.ip), i->ifa_prefixlen);
921
      net_normalize(&ifa.prefix);
922

    
923
      if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 1)
924
        ifa.opposite = ipa_opposite_m1(ifa.ip);
925

    
926
      if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 2)
927
        ifa.opposite = ipa_opposite_m2(ifa.ip);
928

    
929
      if ((ifi->flags & IF_BROADCAST) && a[IFA_BROADCAST])
930
        {
931
          ip4_addr xbrd = rta_get_ip4(a[IFA_BROADCAST]);
932
          ip4_addr ybrd = ip4_or(ipa_to_ip4(ifa.ip), ip4_not(ip4_mkmask(i->ifa_prefixlen)));
933

    
934
          if (ip4_equal(xbrd, net4_prefix(&ifa.prefix)) || ip4_equal(xbrd, ybrd))
935
            ifa.brd = ipa_from_ip4(xbrd);
936
          else if (ifi->flags & IF_TMP_DOWN) /* Complain only during the first scan */
937
            {
938
              log(L_ERR "KIF: Invalid broadcast address %I4 for %s", xbrd, ifi->name);
939
              ifa.brd = ipa_from_ip4(ybrd);
940
            }
941
        }
942
    }
943

    
944
  scope = ipa_classify(ifa.ip);
945
  if (scope < 0)
946
    {
947
      log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
948
      return;
949
    }
950
  ifa.scope = scope & IADDR_SCOPE_MASK;
951

    
952
  DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
953
      ifi->index, ifi->name,
954
      new ? "added" : "removed",
955
      ifa.ip, ifa.flags, ifa.prefix, ifa.brd, ifa.opposite);
956

    
957
  if (new)
958
    ifa_update(&ifa);
959
  else
960
    ifa_delete(&ifa);
961

    
962
  if (!scan)
963
    if_end_partial_update(ifi);
964
}
965

    
966
static void
967
nl_parse_addr6(struct ifaddrmsg *i, int scan, int new)
968
{
969
  struct rtattr *a[BIRD_IFA_MAX];
970
  struct iface *ifi;
971
  u32 ifa_flags;
972
  int scope;
973

    
974
  if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want6, a, sizeof(a)))
975
    return;
976

    
977
  if (!a[IFA_ADDRESS])
978
    {
979
      log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
980
      return;
981
    }
982

    
983
  ifi = if_find_by_index(i->ifa_index);
984
  if (!ifi)
985
    {
986
      log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
987
      return;
988
    }
989

    
990
  if (a[IFA_FLAGS])
991
    ifa_flags = rta_get_u32(a[IFA_FLAGS]);
992
  else
993
    ifa_flags = i->ifa_flags;
994

    
995
  struct ifa ifa;
996
  bzero(&ifa, sizeof(ifa));
997
  ifa.iface = ifi;
998
  if (ifa_flags & IFA_F_SECONDARY)
999
    ifa.flags |= IA_SECONDARY;
1000

    
1001
  /* Ignore tentative addresses silently */
1002
  if (ifa_flags & IFA_F_TENTATIVE)
1003
    return;
1004

    
1005
  /* IFA_LOCAL can be unset for IPv6 interfaces */
1006
  ifa.ip = rta_get_ipa(a[IFA_LOCAL] ? : a[IFA_ADDRESS]);
1007

    
1008
  if (i->ifa_prefixlen > IP6_MAX_PREFIX_LENGTH)
1009
    {
1010
      log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
1011
      new = 0;
1012
    }
1013
  if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH)
1014
    {
1015
      ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
1016
      net_fill_ip6(&ifa.prefix, rta_get_ip6(a[IFA_ADDRESS]), i->ifa_prefixlen);
1017

    
1018
      /* It is either a host address or a peer address */
1019
      if (ipa_equal(ifa.ip, ifa.brd))
1020
        ifa.flags |= IA_HOST;
1021
      else
1022
        {
1023
          ifa.flags |= IA_PEER;
1024
          ifa.opposite = ifa.brd;
1025
        }
1026
    }
1027
  else
1028
    {
1029
      net_fill_ip6(&ifa.prefix, ipa_to_ip6(ifa.ip), i->ifa_prefixlen);
1030
      net_normalize(&ifa.prefix);
1031

    
1032
      if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH - 1)
1033
        ifa.opposite = ipa_opposite_m1(ifa.ip);
1034
    }
1035

    
1036
  scope = ipa_classify(ifa.ip);
1037
  if (scope < 0)
1038
    {
1039
      log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
1040
      return;
1041
    }
1042
  ifa.scope = scope & IADDR_SCOPE_MASK;
1043

    
1044
  DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
1045
      ifi->index, ifi->name,
1046
      new ? "added" : "removed",
1047
      ifa.ip, ifa.flags, ifa.prefix, ifa.brd, ifa.opposite);
1048

    
1049
  if (new)
1050
    ifa_update(&ifa);
1051
  else
1052
    ifa_delete(&ifa);
1053

    
1054
  if (!scan)
1055
    if_end_partial_update(ifi);
1056
}
1057

    
1058
static void
1059
nl_parse_addr(struct nlmsghdr *h, int scan)
1060
{
1061
  struct ifaddrmsg *i;
1062

    
1063
  if (!(i = nl_checkin(h, sizeof(*i))))
1064
    return;
1065

    
1066
  int new = (h->nlmsg_type == RTM_NEWADDR);
1067

    
1068
  switch (i->ifa_family)
1069
    {
1070
      case AF_INET:
1071
        return nl_parse_addr4(i, scan, new);
1072

    
1073
      case AF_INET6:
1074
        return nl_parse_addr6(i, scan, new);
1075
    }
1076
}
1077

    
1078
void
1079
kif_do_scan(struct kif_proto *p UNUSED)
1080
{
1081
  struct nlmsghdr *h;
1082

    
1083
  if_start_update();
1084

    
1085
  nl_request_dump(AF_UNSPEC, RTM_GETLINK);
1086
  while (h = nl_get_scan())
1087
    if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK)
1088
      nl_parse_link(h, 1);
1089
    else
1090
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1091

    
1092
  /* Re-resolve master interface for slaves */
1093
  struct iface *i;
1094
  WALK_LIST(i, iface_list)
1095
    if (i->master_index)
1096
    {
1097
      struct iface f = {
1098
        .flags = i->flags,
1099
        .mtu = i->mtu,
1100
        .index = i->index,
1101
        .master_index = i->master_index,
1102
        .master = if_find_by_index(i->master_index)
1103
      };
1104

    
1105
      if (f.master != i->master)
1106
      {
1107
        memcpy(f.name, i->name, sizeof(f.name));
1108
        if_update(&f);
1109
      }
1110
    }
1111

    
1112
  nl_request_dump(AF_INET, RTM_GETADDR);
1113
  while (h = nl_get_scan())
1114
    if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1115
      nl_parse_addr(h, 1);
1116
    else
1117
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1118

    
1119
  nl_request_dump(AF_INET6, RTM_GETADDR);
1120
  while (h = nl_get_scan())
1121
    if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1122
      nl_parse_addr(h, 1);
1123
    else
1124
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1125

    
1126
  if_end_update();
1127
}
1128

    
1129
/*
1130
 *        Routes
1131
 */
1132

    
1133
static inline u32
1134
krt_table_id(struct krt_proto *p)
1135
{
1136
  return KRT_CF->sys.table_id;
1137
}
1138

    
1139
/*
 * Hash table of running krt protocol instances, used to find the instance
 * responsible for a given (address family, kernel table ID) pair.
 *
 * The RTH_* macros below describe the table to the generic HASH_* macros:
 *   RTH_KEY    - key of an entry: its address family and its kernel table ID
 *   RTH_NEXT   - chain link, stored in p->sys.hash_next
 *   RTH_EQ     - two keys are equal when both components are equal
 *   RTH_FN     - hash value: address family XOR u32_hash(table ID)
 *   RTH_REHASH - name used for the rehash function
 *   RTH_PARAMS - resize tuning values passed to the hash framework
 *                NOTE(review): their exact meaning is defined in lib/hash.h,
 *                not visible here - confirm before changing them.
 * HASH_DEFINE_REHASH_FN() presumably emits the rth_rehash() function named by
 * RTH_REHASH - confirm against lib/hash.h.
 */
static HASH(struct krt_proto) nl_table_map;
1140

    
1141
#define RTH_KEY(p)                p->af, krt_table_id(p)
1142
#define RTH_NEXT(p)                p->sys.hash_next
1143
#define RTH_EQ(a1,i1,a2,i2)        a1 == a2 && i1 == i2
1144
#define RTH_FN(a,i)                a ^ u32_hash(i)
1145

    
1146
#define RTH_REHASH                rth_rehash
1147
#define RTH_PARAMS                /8, *2, 2, 2, 6, 20
1148

    
1149
HASH_DEFINE_REHASH_FN(RTH, struct krt_proto)
1150

    
1151
int
1152
krt_capable(rte *e)
1153
{
1154
  rta *a = e->attrs;
1155

    
1156
  switch (a->dest)
1157
  {
1158
    case RTD_UNICAST:
1159
    case RTD_BLACKHOLE:
1160
    case RTD_UNREACHABLE:
1161
    case RTD_PROHIBIT:
1162
      return 1;
1163

    
1164
    default:
1165
      return 0;
1166
  }
1167
}
1168

    
1169
static inline int
1170
nh_bufsize(struct nexthop *nh)
1171
{
1172
  int rv = 0;
1173
  for (; nh != NULL; nh = nh->next)
1174
    rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr)));
1175
  return rv;
1176
}
1177

    
1178
static int
1179
nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int dest, struct nexthop *nh)
1180
{
1181
  eattr *ea;
1182
  net *net = e->net;
1183
  rta *a = e->attrs;
1184
  int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh));
1185
  u32 priority = 0;
1186

    
1187
  struct {
1188
    struct nlmsghdr h;
1189
    struct rtmsg r;
1190
    char buf[0];
1191
  } *r;
1192

    
1193
  int rsize = sizeof(*r) + bufsize;
1194
  r = alloca(rsize);
1195

    
1196
  DBG("nl_send_route(%N,op=%x)\n", net->n.addr, op);
1197

    
1198
  bzero(&r->h, sizeof(r->h));
1199
  bzero(&r->r, sizeof(r->r));
1200
  r->h.nlmsg_type = op ? RTM_NEWROUTE : RTM_DELROUTE;
1201
  r->h.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
1202
  r->h.nlmsg_flags = op | NLM_F_REQUEST | NLM_F_ACK;
1203

    
1204
  r->r.rtm_family = p->af;
1205
  r->r.rtm_dst_len = net_pxlen(net->n.addr);
1206
  r->r.rtm_protocol = RTPROT_BIRD;
1207
  r->r.rtm_scope = RT_SCOPE_NOWHERE;
1208
#ifdef HAVE_MPLS_KERNEL
1209
  if (p->af == AF_MPLS)
1210
  {
1211
    /*
1212
     * Kernel MPLS code is a bit picky. We must:
1213
     * 1) Always set RT_SCOPE_UNIVERSE and RTN_UNICAST (even for RTM_DELROUTE)
1214
     * 2) Never use RTA_PRIORITY
1215
     */
1216

    
1217
    u32 label = net_mpls(net->n.addr);
1218
    nl_add_attr_mpls(&r->h, rsize, RTA_DST, 1, &label);
1219
    r->r.rtm_scope = RT_SCOPE_UNIVERSE;
1220
    r->r.rtm_type = RTN_UNICAST;
1221
  }
1222
  else
1223
#endif
1224
    nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(net->n.addr));
1225

    
1226
  /*
1227
   * Strange behavior for RTM_DELROUTE:
1228
   * 1) rtm_family is ignored in IPv6, works for IPv4
1229
   * 2) not setting RTA_PRIORITY is different from setting default value (on IPv6)
1230
   * 3) not setting RTA_PRIORITY is equivalent to setting 0, which is wildcard
1231
   */
1232

    
1233
  if (krt_table_id(p) < 256)
1234
    r->r.rtm_table = krt_table_id(p);
1235
  else
1236
    nl_add_attr_u32(&r->h, rsize, RTA_TABLE, krt_table_id(p));
1237

    
1238
  if (p->af == AF_MPLS)
1239
    priority = 0;
1240
  else if (a->source == RTS_DUMMY)
1241
    priority = e->u.krt.metric;
1242
  else if (KRT_CF->sys.metric)
1243
    priority = KRT_CF->sys.metric;
1244
  else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC)))
1245
    priority = ea->u.data;
1246

    
1247
  if (priority)
1248
    nl_add_attr_u32(&r->h, rsize, RTA_PRIORITY, priority);
1249

    
1250
  /* For route delete, we do not specify remaining route attributes */
1251
  if (op == NL_OP_DELETE)
1252
    goto dest;
1253

    
1254
  /* Default scope is LINK for device routes, UNIVERSE otherwise */
1255
  if (p->af == AF_MPLS)
1256
    r->r.rtm_scope = RT_SCOPE_UNIVERSE;
1257
  else if (ea = ea_find(eattrs, EA_KRT_SCOPE))
1258
    r->r.rtm_scope = ea->u.data;
1259
  else
1260
    r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1261

    
1262
  if (ea = ea_find(eattrs, EA_KRT_PREFSRC))
1263
    nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data);
1264

    
1265
  if (ea = ea_find(eattrs, EA_KRT_REALM))
1266
    nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data);
1267

    
1268

    
1269
  u32 metrics[KRT_METRICS_MAX];
1270
  metrics[0] = 0;
1271

    
1272
  struct ea_walk_state ews = { .eattrs = eattrs };
1273
  while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX))
1274
  {
1275
    int id = ea->id - EA_KRT_METRICS;
1276
    metrics[0] |= 1 << id;
1277
    metrics[id] = ea->u.data;
1278
  }
1279

    
1280
  if (metrics[0])
1281
    nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX);
1282

    
1283

    
1284
dest:
1285
  switch (dest)
1286
    {
1287
    case RTD_UNICAST:
1288
      r->r.rtm_type = RTN_UNICAST;
1289
      if (nh->next && !krt_ecmp6(p))
1290
        nl_add_multipath(&r->h, rsize, nh, p->af);
1291
      else
1292
      {
1293
        nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index);
1294
        nl_add_nexthop(&r->h, rsize, nh, p->af);
1295

    
1296
        if (nh->flags & RNF_ONLINK)
1297
          r->r.rtm_flags |= RTNH_F_ONLINK;
1298
      }
1299
      break;
1300
    case RTD_BLACKHOLE:
1301
      r->r.rtm_type = RTN_BLACKHOLE;
1302
      break;
1303
    case RTD_UNREACHABLE:
1304
      r->r.rtm_type = RTN_UNREACHABLE;
1305
      break;
1306
    case RTD_PROHIBIT:
1307
      r->r.rtm_type = RTN_PROHIBIT;
1308
      break;
1309
    case RTD_NONE:
1310
      break;
1311
    default:
1312
      bug("krt_capable inconsistent with nl_send_route");
1313
    }
1314

    
1315
  /* Ignore missing for DELETE */
1316
  return nl_exchange(&r->h, (op == NL_OP_DELETE));
1317
}
1318

    
1319
static inline int
1320
nl_add_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs)
1321
{
1322
  rta *a = e->attrs;
1323
  int err = 0;
1324

    
1325
  if (krt_ecmp6(p) && a->nh.next)
1326
  {
1327
    struct nexthop *nh = &(a->nh);
1328

    
1329
    err = nl_send_route(p, e, eattrs, NL_OP_ADD, RTD_UNICAST, nh);
1330
    if (err < 0)
1331
      return err;
1332

    
1333
    for (nh = nh->next; nh; nh = nh->next)
1334
      err += nl_send_route(p, e, eattrs, NL_OP_APPEND, RTD_UNICAST, nh);
1335

    
1336
    return err;
1337
  }
1338

    
1339
  return nl_send_route(p, e, eattrs, NL_OP_ADD, a->dest, &(a->nh));
1340
}
1341

    
1342
static inline int
1343
nl_delete_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs)
1344
{
1345
  int err = 0;
1346

    
1347
  /* For IPv6, we just repeatedly request DELETE until we get error */
1348
  do
1349
    err = nl_send_route(p, e, eattrs, NL_OP_DELETE, RTD_NONE, NULL);
1350
  while (krt_ecmp6(p) && !err);
1351

    
1352
  return err;
1353
}
1354

    
1355
void
1356
krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list *eattrs)
1357
{
1358
  int err = 0;
1359

    
1360
  /*
1361
   * We could use NL_OP_REPLACE, but route replace on Linux has some problems:
1362
   *
1363
   * 1) Does not check for matching rtm_protocol
1364
   * 2) Has broken semantics for IPv6 ECMP
1365
   * 3) Crashes some kernel version when used for IPv6 ECMP
1366
   *
1367
   * So we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the old
1368
   * route value, so we do not try to optimize IPv6 ECMP reconfigurations.
1369
   */
1370

    
1371
  if (old)
1372
    nl_delete_rte(p, old, eattrs);
1373

    
1374
  if (new)
1375
    err = nl_add_rte(p, new, eattrs);
1376

    
1377
  if (err < 0)
1378
    n->n.flags |= KRF_SYNC_ERROR;
1379
  else
1380
    n->n.flags &= ~KRF_SYNC_ERROR;
1381
}
1382

    
1383
static int
1384
nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type)
1385
{
1386
  /* Route merging must be active */
1387
  if (!s->merge)
1388
    return 0;
1389

    
1390
  /* Saved and new route must have same network, proto/table, and priority */
1391
  if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority))
1392
    return 0;
1393

    
1394
  /* Both must be regular unicast routes */
1395
  if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST))
1396
    return 0;
1397

    
1398
  return 1;
1399
}
1400

    
1401
static void
1402
nl_announce_route(struct nl_parse_state *s)
1403
{
1404
  rte *e = rte_get_temp(s->attrs);
1405
  e->net = s->net;
1406
  e->u.krt.src = s->krt_src;
1407
  e->u.krt.proto = s->krt_proto;
1408
  e->u.krt.seen = 0;
1409
  e->u.krt.best = 0;
1410
  e->u.krt.metric = s->krt_metric;
1411

    
1412
  if (s->scan)
1413
    krt_got_route(s->proto, e);
1414
  else
1415
    krt_got_route_async(s->proto, e, s->new);
1416

    
1417
  s->net = NULL;
1418
  s->attrs = NULL;
1419
  s->proto = NULL;
1420
  lp_flush(s->pool);
1421
}
1422

    
1423
static inline void
1424
nl_parse_begin(struct nl_parse_state *s, int scan, int merge)
1425
{
1426
  memset(s, 0, sizeof (struct nl_parse_state));
1427
  s->pool = nl_linpool;
1428
  s->scan = scan;
1429
  s->merge = merge;
1430
}
1431

    
1432
static inline void
1433
nl_parse_end(struct nl_parse_state *s)
1434
{
1435
  if (s->net)
1436
    nl_announce_route(s);
1437
}
1438

    
1439

    
1440
/*
 * Give up on the route being parsed: emit a debug message prefixed with
 * "KRT: Ignoring route - " and return from the enclosing function. The macro
 * uses a bare `return;`, so it is only usable in functions returning void.
 */
#define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0)
1441

    
1442
static void
1443
nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
1444
{
1445
  struct krt_proto *p;
1446
  struct rtmsg *i;
1447
  struct rtattr *a[BIRD_RTA_MAX];
1448
  int new = h->nlmsg_type == RTM_NEWROUTE;
1449

    
1450
  net_addr dst;
1451
  u32 oif = ~0;
1452
  u32 table_id;
1453
  u32 priority = 0;
1454
  u32 def_scope = RT_SCOPE_UNIVERSE;
1455
  int src;
1456

    
1457
  if (!(i = nl_checkin(h, sizeof(*i))))
1458
    return;
1459

    
1460
  switch (i->rtm_family)
1461
    {
1462
    case AF_INET:
1463
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want4, a, sizeof(a)))
1464
        return;
1465

    
1466
      if (a[RTA_DST])
1467
        net_fill_ip4(&dst, rta_get_ip4(a[RTA_DST]), i->rtm_dst_len);
1468
      else
1469
        net_fill_ip4(&dst, IP4_NONE, 0);
1470
      break;
1471

    
1472
    case AF_INET6:
1473
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want6, a, sizeof(a)))
1474
        return;
1475

    
1476
      if (a[RTA_DST])
1477
        net_fill_ip6(&dst, rta_get_ip6(a[RTA_DST]), i->rtm_dst_len);
1478
      else
1479
        net_fill_ip6(&dst, IP6_NONE, 0);
1480
      break;
1481

    
1482
#ifdef HAVE_MPLS_KERNEL
1483
    case AF_MPLS:
1484
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want_mpls, a, sizeof(a)))
1485
        return;
1486

    
1487
      if (!a[RTA_DST])
1488
        SKIP("MPLS route without RTA_DST");
1489

    
1490
      if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) != 1)
1491
        SKIP("MPLS route with multi-label RTA_DST");
1492

    
1493
      net_fill_mpls(&dst, rta_mpls_stack[0]);
1494
      break;
1495
#endif
1496

    
1497
    default:
1498
      return;
1499
    }
1500

    
1501
  if (a[RTA_OIF])
1502
    oif = rta_get_u32(a[RTA_OIF]);
1503

    
1504
  if (a[RTA_TABLE])
1505
    table_id = rta_get_u32(a[RTA_TABLE]);
1506
  else
1507
    table_id = i->rtm_table;
1508

    
1509
  /* Do we know this table? */
1510
  p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id);
1511
  if (!p)
1512
    SKIP("unknown table %d\n", table);
1513

    
1514
  if (a[RTA_IIF])
1515
    SKIP("IIF set\n");
1516

    
1517
  if (i->rtm_tos != 0)                        /* We don't support TOS */
1518
    SKIP("TOS %02x\n", i->rtm_tos);
1519

    
1520
  if (s->scan && !new)
1521
    SKIP("RTM_DELROUTE in scan\n");
1522

    
1523
  if (a[RTA_PRIORITY])
1524
    priority = rta_get_u32(a[RTA_PRIORITY]);
1525

    
1526
  int c = net_classify(&dst);
1527
  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
1528
    SKIP("strange class/scope\n");
1529

    
1530
  switch (i->rtm_protocol)
1531
    {
1532
    case RTPROT_UNSPEC:
1533
      SKIP("proto unspec\n");
1534

    
1535
    case RTPROT_REDIRECT:
1536
      src = KRT_SRC_REDIRECT;
1537
      break;
1538

    
1539
    case RTPROT_KERNEL:
1540
      src = KRT_SRC_KERNEL;
1541
      return;
1542

    
1543
    case RTPROT_BIRD:
1544
      if (!s->scan)
1545
        SKIP("echo\n");
1546
      src = KRT_SRC_BIRD;
1547
      break;
1548

    
1549
    case RTPROT_BOOT:
1550
    default:
1551
      src = KRT_SRC_ALIEN;
1552
    }
1553

    
1554
  net *net = net_get(p->p.main_channel->table, &dst);
1555

    
1556
  if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type))
1557
    nl_announce_route(s);
1558

    
1559
  rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
1560
  ra->src = p->p.main_source;
1561
  ra->source = RTS_INHERIT;
1562
  ra->scope = SCOPE_UNIVERSE;
1563

    
1564
  switch (i->rtm_type)
1565
    {
1566
    case RTN_UNICAST:
1567
      ra->dest = RTD_UNICAST;
1568

    
1569
      if (a[RTA_MULTIPATH])
1570
        {
1571
          struct nexthop *nh = nl_parse_multipath(s, p, a[RTA_MULTIPATH], i->rtm_family);
1572
          if (!nh)
1573
            {
1574
              log(L_ERR "KRT: Received strange multipath route %N", net->n.addr);
1575
              return;
1576
            }
1577

    
1578
          ra->nh = *nh;
1579
          break;
1580
        }
1581

    
1582
      ra->nh.iface = if_find_by_index(oif);
1583
      if (!ra->nh.iface)
1584
        {
1585
          log(L_ERR "KRT: Received route %N with unknown ifindex %u", net->n.addr, oif);
1586
          return;
1587
        }
1588

    
1589
      if ((i->rtm_family != AF_MPLS) && a[RTA_GATEWAY]
1590
#ifdef HAVE_MPLS_KERNEL
1591
          || (i->rtm_family == AF_MPLS) && a[RTA_VIA]
1592
#endif
1593
          )
1594
        {
1595
#ifdef HAVE_MPLS_KERNEL
1596
          if (i->rtm_family == AF_MPLS)
1597
            ra->nh.gw = rta_get_via(a[RTA_VIA]);
1598
          else
1599
#endif
1600
            ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]);
1601

    
1602
          /* Silently skip strange 6to4 routes */
1603
          const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96);
1604
          if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit))
1605
            return;
1606

    
1607
          if (i->rtm_flags & RTNH_F_ONLINK)
1608
            ra->nh.flags |= RNF_ONLINK;
1609

    
1610
          neighbor *nbr;
1611
          nbr = neigh_find2(&p->p, &(ra->nh.gw), ra->nh.iface,
1612
                            (ra->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0);
1613
          if (!nbr || (nbr->scope == SCOPE_HOST))
1614
            {
1615
              log(L_ERR "KRT: Received route %N with strange next-hop %I", net->n.addr,
1616
                  ra->nh.gw);
1617
              return;
1618
            }
1619
        }
1620

    
1621
      break;
1622
    case RTN_BLACKHOLE:
1623
      ra->dest = RTD_BLACKHOLE;
1624
      break;
1625
    case RTN_UNREACHABLE:
1626
      ra->dest = RTD_UNREACHABLE;
1627
      break;
1628
    case RTN_PROHIBIT:
1629
      ra->dest = RTD_PROHIBIT;
1630
      break;
1631
    /* FIXME: What about RTN_THROW? */
1632
    default:
1633
      SKIP("type %d\n", i->rtm_type);
1634
      return;
1635
    }
1636

    
1637
#ifdef HAVE_MPLS_KERNEL
1638
  int labels = 0;
1639
  if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next)
1640
    labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label);
1641

    
1642
  if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next)
1643
    {
1644
      switch (rta_get_u16(a[RTA_ENCAP_TYPE]))
1645
        {
1646
          case LWTUNNEL_ENCAP_MPLS:
1647
            {
1648
              struct rtattr *enca[BIRD_RTA_MAX];
1649
              nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
1650
              nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
1651
              labels = rta_get_mpls(enca[RTA_DST], ra->nh.label);
1652
              break;
1653
            }
1654
          default:
1655
            SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE]));
1656
            break;
1657
        }
1658
    }
1659

    
1660
  if (labels < 0)
1661
  {
1662
    log(L_WARN "KRT: Too long MPLS stack received, ignoring.");
1663
    ra->nh.labels = 0;
1664
  }
1665
  else
1666
    ra->nh.labels = labels;
1667
#endif
1668

    
1669
  if (i->rtm_scope != def_scope)
1670
    {
1671
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1672
      ea->next = ra->eattrs;
1673
      ra->eattrs = ea;
1674
      ea->flags = EALF_SORTED;
1675
      ea->count = 1;
1676
      ea->attrs[0].id = EA_KRT_SCOPE;
1677
      ea->attrs[0].flags = 0;
1678
      ea->attrs[0].type = EAF_TYPE_INT;
1679
      ea->attrs[0].u.data = i->rtm_scope;
1680
    }
1681

    
1682
  if (a[RTA_PREFSRC])
1683
    {
1684
      ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
1685

    
1686
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1687
      ea->next = ra->eattrs;
1688
      ra->eattrs = ea;
1689
      ea->flags = EALF_SORTED;
1690
      ea->count = 1;
1691
      ea->attrs[0].id = EA_KRT_PREFSRC;
1692
      ea->attrs[0].flags = 0;
1693
      ea->attrs[0].type = EAF_TYPE_IP_ADDRESS;
1694
      ea->attrs[0].u.ptr = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps));
1695
      ea->attrs[0].u.ptr->length = sizeof(ps);
1696
      memcpy(ea->attrs[0].u.ptr->data, &ps, sizeof(ps));
1697
    }
1698

    
1699
  if (a[RTA_FLOW])
1700
    {
1701
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1702
      ea->next = ra->eattrs;
1703
      ra->eattrs = ea;
1704
      ea->flags = EALF_SORTED;
1705
      ea->count = 1;
1706
      ea->attrs[0].id = EA_KRT_REALM;
1707
      ea->attrs[0].flags = 0;
1708
      ea->attrs[0].type = EAF_TYPE_INT;
1709
      ea->attrs[0].u.data = rta_get_u32(a[RTA_FLOW]);
1710
    }
1711

    
1712
  if (a[RTA_METRICS])
1713
    {
1714
      u32 metrics[KRT_METRICS_MAX];
1715
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
1716
      int t, n = 0;
1717

    
1718
      if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0)
1719
        {
1720
          log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net->n.addr);
1721
          return;
1722
        }
1723

    
1724
      for (t = 1; t < KRT_METRICS_MAX; t++)
1725
        if (metrics[0] & (1 << t))
1726
          {
1727
            ea->attrs[n].id = EA_CODE(EAP_KRT, KRT_METRICS_OFFSET + t);
1728
            ea->attrs[n].flags = 0;
1729
            ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */
1730
            ea->attrs[n].u.data = metrics[t];
1731
            n++;
1732
          }
1733

    
1734
      if (n > 0)
1735
        {
1736
          ea->next = ra->eattrs;
1737
          ea->flags = EALF_SORTED;
1738
          ea->count = n;
1739
          ra->eattrs = ea;
1740
        }
1741
    }
1742

    
1743
  /*
1744
   * Ideally, now we would send the received route to the rest of kernel code.
1745
   * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we
1746
   * postpone it and merge next hops until the end of the sequence. Note that
1747
   * when doing merging of next hops, we expect the new route to be unipath.
1748
   * Otherwise, we ignore additional next hops in nexthop_insert().
1749
   */
1750

    
1751
  if (!s->net)
1752
  {
1753
    /* Store the new route */
1754
    s->net = net;
1755
    s->attrs = ra;
1756
    s->proto = p;
1757
    s->new = new;
1758
    s->krt_src = src;
1759
    s->krt_type = i->rtm_type;
1760
    s->krt_proto = i->rtm_protocol;
1761
    s->krt_metric = priority;
1762
  }
1763
  else
1764
  {
1765
    /* Merge next hops with the stored route */
1766
    rta *oa = s->attrs;
1767

    
1768
    struct nexthop *nhs = &oa->nh;
1769
    nexthop_insert(&nhs, &ra->nh);
1770

    
1771
    /* Perhaps new nexthop is inserted at the first position */
1772
    if (nhs == &ra->nh)
1773
    {
1774
      /* Swap rtas */
1775
      s->attrs = ra;
1776

    
1777
      /* Keep old eattrs */
1778
      ra->eattrs = oa->eattrs;
1779
    }
1780
  }
1781
}
1782

    
1783
void
1784
krt_do_scan(struct krt_proto *p UNUSED)        /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
1785
{
1786
  struct nlmsghdr *h;
1787
  struct nl_parse_state s;
1788

    
1789
  nl_parse_begin(&s, 1, 0);
1790
  nl_request_dump(AF_INET, RTM_GETROUTE);
1791
  while (h = nl_get_scan())
1792
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1793
      nl_parse_route(&s, h);
1794
    else
1795
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1796
  nl_parse_end(&s);
1797

    
1798
  nl_parse_begin(&s, 1, 1);
1799
  nl_request_dump(AF_INET6, RTM_GETROUTE);
1800
  while (h = nl_get_scan())
1801
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1802
      nl_parse_route(&s, h);
1803
    else
1804
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1805
  nl_parse_end(&s);
1806

    
1807
#ifdef HAVE_MPLS_KERNEL
1808
  nl_parse_begin(&s, 1, 1);
1809
  nl_request_dump(AF_MPLS, RTM_GETROUTE);
1810
  while (h = nl_get_scan())
1811
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1812
      nl_parse_route(&s, h);
1813
    else
1814
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1815
  nl_parse_end(&s);
1816
#endif
1817
}
1818

    
1819
/*
1820
 *        Asynchronous Netlink interface
1821
 */
1822

    
1823
static sock *nl_async_sk;                /* BIRD socket for asynchronous notifications; created once by nl_open_async() */
1824
static byte *nl_async_rx_buffer;        /* Receive buffer of NL_RX_SIZE bytes, allocated by nl_open_async() */
1825

    
1826
static void
1827
nl_async_msg(struct nlmsghdr *h)
1828
{
1829
  struct nl_parse_state s;
1830

    
1831
  switch (h->nlmsg_type)
1832
    {
1833
    case RTM_NEWROUTE:
1834
    case RTM_DELROUTE:
1835
      DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
1836
      nl_parse_begin(&s, 0, 0);
1837
      nl_parse_route(&s, h);
1838
      nl_parse_end(&s);
1839
      break;
1840
    case RTM_NEWLINK:
1841
    case RTM_DELLINK:
1842
      DBG("KRT: Received async link notification (%d)\n", h->nlmsg_type);
1843
      if (kif_proto)
1844
        nl_parse_link(h, 0);
1845
      break;
1846
    case RTM_NEWADDR:
1847
    case RTM_DELADDR:
1848
      DBG("KRT: Received async address notification (%d)\n", h->nlmsg_type);
1849
      if (kif_proto)
1850
        nl_parse_addr(h, 0);
1851
      break;
1852
    default:
1853
      DBG("KRT: Received unknown async notification (%d)\n", h->nlmsg_type);
1854
    }
1855
}
1856

    
1857
static int
1858
nl_async_hook(sock *sk, uint size UNUSED)
1859
{
1860
  struct iovec iov = { nl_async_rx_buffer, NL_RX_SIZE };
1861
  struct sockaddr_nl sa;
1862
  struct msghdr m = {
1863
    .msg_name = &sa,
1864
    .msg_namelen = sizeof(sa),
1865
    .msg_iov = &iov,
1866
    .msg_iovlen = 1,
1867
  };
1868
  struct nlmsghdr *h;
1869
  int x;
1870
  uint len;
1871

    
1872
  x = recvmsg(sk->fd, &m, 0);
1873
  if (x < 0)
1874
    {
1875
      if (errno == ENOBUFS)
1876
        {
1877
          /*
1878
           *  Netlink reports some packets have been thrown away.
1879
           *  One day we might react to it by asking for route table
1880
           *  scan in near future.
1881
           */
1882
          log(L_WARN "Kernel dropped some netlink messages, will resync on next scan.");
1883
          return 1;        /* More data are likely to be ready */
1884
        }
1885
      else if (errno != EWOULDBLOCK)
1886
        log(L_ERR "Netlink recvmsg: %m");
1887
      return 0;
1888
    }
1889
  if (sa.nl_pid)                /* It isn't from the kernel */
1890
    {
1891
      DBG("Non-kernel packet\n");
1892
      return 1;
1893
    }
1894
  h = (void *) nl_async_rx_buffer;
1895
  len = x;
1896
  if (m.msg_flags & MSG_TRUNC)
1897
    {
1898
      log(L_WARN "Netlink got truncated asynchronous message");
1899
      return 1;
1900
    }
1901
  while (NLMSG_OK(h, len))
1902
    {
1903
      nl_async_msg(h);
1904
      h = NLMSG_NEXT(h, len);
1905
    }
1906
  if (len)
1907
    log(L_WARN "nl_async_hook: Found packet remnant of size %d", len);
1908
  return 1;
1909
}
1910

    
1911
static void
1912
nl_async_err_hook(sock *sk, int e UNUSED)
1913
{
1914
  nl_async_hook(sk, 0);
1915
}
1916

    
1917
static void
1918
nl_open_async(void)
1919
{
1920
  sock *sk;
1921
  struct sockaddr_nl sa;
1922
  int fd;
1923

    
1924
  if (nl_async_sk)
1925
    return;
1926

    
1927
  DBG("KRT: Opening async netlink socket\n");
1928

    
1929
  fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
1930
  if (fd < 0)
1931
    {
1932
      log(L_ERR "Unable to open asynchronous rtnetlink socket: %m");
1933
      return;
1934
    }
1935

    
1936
  bzero(&sa, sizeof(sa));
1937
  sa.nl_family = AF_NETLINK;
1938
  sa.nl_groups = RTMGRP_LINK |
1939
    RTMGRP_IPV4_IFADDR | RTMGRP_IPV4_ROUTE |
1940
    RTMGRP_IPV6_IFADDR | RTMGRP_IPV6_ROUTE;
1941

    
1942
  if (bind(fd, (struct sockaddr *) &sa, sizeof(sa)) < 0)
1943
    {
1944
      log(L_ERR "Unable to bind asynchronous rtnetlink socket: %m");
1945
      close(fd);
1946
      return;
1947
    }
1948

    
1949
  nl_async_rx_buffer = xmalloc(NL_RX_SIZE);
1950

    
1951
  sk = nl_async_sk = sk_new(krt_pool);
1952
  sk->type = SK_MAGIC;
1953
  sk->rx_hook = nl_async_hook;
1954
  sk->err_hook = nl_async_err_hook;
1955
  sk->fd = fd;
1956
  if (sk_open(sk) < 0)
1957
    bug("Netlink: sk_open failed");
1958
}
1959

    
1960

    
1961
/*
1962
 *        Interface to the UNIX krt module
1963
 */
1964

    
1965
void
1966
krt_sys_io_init(void)
1967
{
1968
  nl_linpool = lp_new_default(krt_pool);
1969
  HASH_INIT(nl_table_map, krt_pool, 6);
1970
}
1971

    
1972
int
1973
krt_sys_start(struct krt_proto *p)
1974
{
1975
  struct krt_proto *old = HASH_FIND(nl_table_map, RTH, p->af, krt_table_id(p));
1976

    
1977
  if (old)
1978
    {
1979
      log(L_ERR "%s: Kernel table %u already registered by %s",
1980
          p->p.name, krt_table_id(p), old->p.name);
1981
      return 0;
1982
    }
1983

    
1984
  HASH_INSERT2(nl_table_map, RTH, krt_pool, p);
1985

    
1986
  nl_open();
1987
  nl_open_async();
1988

    
1989
  return 1;
1990
}
1991

    
1992
void
1993
krt_sys_shutdown(struct krt_proto *p)
1994
{
1995
  HASH_REMOVE2(nl_table_map, RTH, krt_pool, p);
1996
}
1997

    
1998
int
1999
krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o)
2000
{
2001
  return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric);
2002
}
2003

    
2004
void
2005
krt_sys_init_config(struct krt_config *cf)
2006
{
2007
  cf->sys.table_id = RT_TABLE_MAIN;
2008
  cf->sys.metric = 32;
2009
}
2010

    
2011
void
2012
krt_sys_copy_config(struct krt_config *d, struct krt_config *s)
2013
{
2014
  d->sys.table_id = s->sys.table_id;
2015
  d->sys.metric = s->sys.metric;
2016
}
2017

    
2018
/*
 * Names of the kernel route metrics, indexed by metric number. Slot 0 has no
 * name. Used by krt_sys_get_attr() both for attribute names and for naming
 * the bits of the "lock" bitfield.
 */
static const char *krt_metrics_names[KRT_METRICS_MAX] = {
  [0]  = NULL,
  [1]  = "lock",
  [2]  = "mtu",
  [3]  = "window",
  [4]  = "rtt",
  [5]  = "rttvar",
  [6]  = "sstresh",
  [7]  = "cwnd",
  [8]  = "advmss",
  [9]  = "reordering",
  [10] = "hoplimit",
  [11] = "initcwnd",
  [12] = "features",
  [13] = "rto_min",
  [14] = "initrwnd",
  [15] = "quickack"
};
2022

    
2023
/*
 * Names of the bits of the "features" bitfield, indexed by bit number.
 * Bits 1 and 2 have no name.
 */
static const char *krt_features_names[KRT_FEATURES_MAX] = {
  [0] = "ecn",
  [1] = NULL,
  [2] = NULL,
  [3] = "allfrag"
};
2026

    
2027
int
2028
krt_sys_get_attr(eattr *a, byte *buf, int buflen UNUSED)
2029
{
2030
  switch (a->id)
2031
  {
2032
  case EA_KRT_PREFSRC:
2033
    bsprintf(buf, "prefsrc");
2034
    return GA_NAME;
2035

    
2036
  case EA_KRT_REALM:
2037
    bsprintf(buf, "realm");
2038
    return GA_NAME;
2039

    
2040
  case EA_KRT_SCOPE:
2041
    bsprintf(buf, "scope");
2042
    return GA_NAME;
2043

    
2044
  case EA_KRT_LOCK:
2045
    buf += bsprintf(buf, "lock:");
2046
    ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
2047
    return GA_FULL;
2048

    
2049
  case EA_KRT_FEATURES:
2050
    buf += bsprintf(buf, "features:");
2051
    ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX);
2052
    return GA_FULL;
2053

    
2054
  default:;
2055
    int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET;
2056
    if (id > 0 && id < KRT_METRICS_MAX)
2057
    {
2058
      bsprintf(buf, "%s", krt_metrics_names[id]);
2059
      return GA_NAME;
2060
    }
2061

    
2062
    return GA_UNKNOWN;
2063
  }
2064
}
2065

    
2066

    
2067

    
2068
void
2069
kif_sys_start(struct kif_proto *p UNUSED)
2070
{
2071
  nl_open();
2072
  nl_open_async();
2073
}
2074

    
2075
void
2076
kif_sys_shutdown(struct kif_proto *p UNUSED)
2077
{
2078
}
2079

    
2080
int
2081
kif_update_sysdep_addr(struct iface *i UNUSED)
2082
{
2083
  return 0;
2084
}