Statistics
| Branch: | Revision:

iof-bird-daemon / sysdep / linux / netlink.c @ 517d05df

History | View | Annotate | Download (49.1 KB)

1
/*
2
 *        BIRD -- Linux Netlink Interface
3
 *
4
 *        (c) 1999--2000 Martin Mares <mj@ucw.cz>
5
 *
6
 *        Can be freely distributed and used under the terms of the GNU GPL.
7
 */
8

    
9
#include <alloca.h>
10
#include <stdio.h>
11
#include <unistd.h>
12
#include <fcntl.h>
13
#include <sys/socket.h>
14
#include <sys/uio.h>
15
#include <errno.h>
16

    
17
#undef LOCAL_DEBUG
18

    
19
#include "nest/bird.h"
20
#include "nest/route.h"
21
#include "nest/protocol.h"
22
#include "nest/iface.h"
23
#include "lib/alloca.h"
24
#include "sysdep/unix/unix.h"
25
#include "sysdep/unix/krt.h"
26
#include "lib/socket.h"
27
#include "lib/string.h"
28
#include "lib/hash.h"
29
#include "conf/conf.h"
30

    
31
#include <asm/types.h>
32
#include <linux/if.h>
33
#ifdef HAVE_LWTUNNEL
34
#include <linux/lwtunnel.h>
35
#else
36
#include "sysdep/linux/lwtunnel.h"
37
#endif
38
#include <linux/netlink.h>
39
#include <linux/rtnetlink.h>
40

    
41

    
42
#ifndef MSG_TRUNC                        /* Hack: Several versions of glibc miss this one :( */
43
#define MSG_TRUNC 0x20
44
#endif
45

    
46
#ifndef IFA_FLAGS
47
#define IFA_FLAGS 8
48
#endif
49

    
50
#ifndef IFF_LOWER_UP
51
#define IFF_LOWER_UP 0x10000
52
#endif
53

    
54
#ifndef RTA_TABLE
55
#define RTA_TABLE  15
56
#endif
57

    
58
#ifndef RTA_VIA
59
#define RTA_VIA         18
60
#endif
61

    
62
#ifndef HAVE_STRUCT_RTVIA
63
struct rtvia {
64
        unsigned short        rtvia_family;
65
        u8                rtvia_addr[0];
66
};
67
#endif
68

    
69
#ifndef RTA_NEWDST
70
#define RTA_NEWDST  19
71
#endif
72

    
73
#ifndef RTA_ENCAP_TYPE
74
#define RTA_ENCAP_TYPE        21
75
#endif
76

    
77
#ifndef RTA_ENCAP
78
#define RTA_ENCAP  22
79
#endif
80

    
81
#define krt_ecmp6(p) ((p)->af == AF_INET6)
82

    
83
const int rt_default_ecmp = 16;
84

    
85
/*
86
 * Structure nl_parse_state keeps state of received route processing. Ideally,
87
 * we could just independently parse received Netlink messages and immediately
88
 * propagate received routes to the rest of BIRD, but older Linux kernels (before
89
 * version 4.11) represent and announce IPv6 ECMP routes not as one route with
90
 * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a sequence of
91
 * routes with the same prefix. More recent kernels work as with IPv4.
92
 *
93
 * Therefore, BIRD keeps currently processed route in nl_parse_state structure
94
 * and postpones its propagation until we expect it to be final; i.e., when
95
 * non-matching route is received or when the scan ends. When another matching
96
 * route is received, it is merged with the already processed route to form an
97
 * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the
98
 * postponing is done in both cases (for simplicity). All IPv4 routes or IPv6
99
 * routes with RTA_MULTIPATH set are just considered non-matching.
100
 *
101
 * This is ignored for asynchronous notifications (every notification is handled
102
 * as a separate route). It is not an issue for our routes, as we ignore such
103
 * notifications anyways. But importing alien IPv6 ECMP routes does not work
104
 * properly with older kernels.
105
 *
106
 * Whatever the kernel version is, IPv6 ECMP routes are sent as multiple routes
107
 * for the same prefix.
108
 */
109

    
110
struct nl_parse_state
111
{
112
  struct linpool *pool;
113
  int scan;
114
  int merge;
115

    
116
  net *net;
117
  rta *attrs;
118
  struct krt_proto *proto;
119
  s8 new;
120
  s8 krt_src;
121
  u8 krt_type;
122
  u8 krt_proto;
123
  u32 krt_metric;
124
};
125

    
126
/*
127
 *        Synchronous Netlink interface
128
 */
129

    
130
struct nl_sock
131
{
132
  int fd;
133
  u32 seq;
134
  byte *rx_buffer;                        /* Receive buffer */
135
  struct nlmsghdr *last_hdr;                /* Recently received packet */
136
  uint last_size;
137
};
138

    
139
#define NL_RX_SIZE 8192
140

    
141
#define NL_OP_DELETE        0
142
#define NL_OP_ADD        (NLM_F_CREATE|NLM_F_EXCL)
143
#define NL_OP_REPLACE        (NLM_F_CREATE|NLM_F_REPLACE)
144
#define NL_OP_APPEND        (NLM_F_CREATE|NLM_F_APPEND)
145

    
146
static linpool *nl_linpool;
147

    
148
static struct nl_sock nl_scan = {.fd = -1};        /* Netlink socket for synchronous scan */
149
static struct nl_sock nl_req  = {.fd = -1};        /* Netlink socket for requests */
150

    
151
static void
152
nl_open_sock(struct nl_sock *nl)
153
{
154
  if (nl->fd < 0)
155
    {
156
      nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
157
      if (nl->fd < 0)
158
        die("Unable to open rtnetlink socket: %m");
159
      nl->seq = (u32) (current_time() TO_S); /* Or perhaps random_u32() ? */
160
      nl->rx_buffer = xmalloc(NL_RX_SIZE);
161
      nl->last_hdr = NULL;
162
      nl->last_size = 0;
163
    }
164
}
165

    
166
static void
167
nl_open(void)
168
{
169
  nl_open_sock(&nl_scan);
170
  nl_open_sock(&nl_req);
171
}
172

    
173
static void
174
nl_send(struct nl_sock *nl, struct nlmsghdr *nh)
175
{
176
  struct sockaddr_nl sa;
177

    
178
  memset(&sa, 0, sizeof(sa));
179
  sa.nl_family = AF_NETLINK;
180
  nh->nlmsg_pid = 0;
181
  nh->nlmsg_seq = ++(nl->seq);
182
  if (sendto(nl->fd, nh, nh->nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa)) < 0)
183
    die("rtnetlink sendto: %m");
184
  nl->last_hdr = NULL;
185
}
186

    
187
static void
188
nl_request_dump(int af, int cmd)
189
{
190
  struct {
191
    struct nlmsghdr nh;
192
    struct rtgenmsg g;
193
  } req = {
194
    .nh.nlmsg_type = cmd,
195
    .nh.nlmsg_len = sizeof(req),
196
    .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
197
    .g.rtgen_family = af
198
  };
199
  nl_send(&nl_scan, &req.nh);
200
}
201

    
202
static struct nlmsghdr *
203
nl_get_reply(struct nl_sock *nl)
204
{
205
  for(;;)
206
    {
207
      if (!nl->last_hdr)
208
        {
209
          struct iovec iov = { nl->rx_buffer, NL_RX_SIZE };
210
          struct sockaddr_nl sa;
211
          struct msghdr m = {
212
            .msg_name = &sa,
213
            .msg_namelen = sizeof(sa),
214
            .msg_iov = &iov,
215
            .msg_iovlen = 1,
216
          };
217
          int x = recvmsg(nl->fd, &m, 0);
218
          if (x < 0)
219
            die("nl_get_reply: %m");
220
          if (sa.nl_pid)                /* It isn't from the kernel */
221
            {
222
              DBG("Non-kernel packet\n");
223
              continue;
224
            }
225
          nl->last_size = x;
226
          nl->last_hdr = (void *) nl->rx_buffer;
227
          if (m.msg_flags & MSG_TRUNC)
228
            bug("nl_get_reply: got truncated reply which should be impossible");
229
        }
230
      if (NLMSG_OK(nl->last_hdr, nl->last_size))
231
        {
232
          struct nlmsghdr *h = nl->last_hdr;
233
          nl->last_hdr = NLMSG_NEXT(h, nl->last_size);
234
          if (h->nlmsg_seq != nl->seq)
235
            {
236
              log(L_WARN "nl_get_reply: Ignoring out of sequence netlink packet (%x != %x)",
237
                  h->nlmsg_seq, nl->seq);
238
              continue;
239
            }
240
          return h;
241
        }
242
      if (nl->last_size)
243
        log(L_WARN "nl_get_reply: Found packet remnant of size %d", nl->last_size);
244
      nl->last_hdr = NULL;
245
    }
246
}
247

    
248
static struct tbf rl_netlink_err = TBF_DEFAULT_LOG_LIMITS;
249

    
250
static int
251
nl_error(struct nlmsghdr *h, int ignore_esrch)
252
{
253
  struct nlmsgerr *e;
254
  int ec;
255

    
256
  if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr)))
257
    {
258
      log(L_WARN "Netlink: Truncated error message received");
259
      return ENOBUFS;
260
    }
261
  e = (struct nlmsgerr *) NLMSG_DATA(h);
262
  ec = -e->error;
263
  if (ec && !(ignore_esrch && (ec == ESRCH)))
264
    log_rl(&rl_netlink_err, L_WARN "Netlink: %s", strerror(ec));
265
  return ec;
266
}
267

    
268
static struct nlmsghdr *
269
nl_get_scan(void)
270
{
271
  struct nlmsghdr *h = nl_get_reply(&nl_scan);
272

    
273
  if (h->nlmsg_type == NLMSG_DONE)
274
    return NULL;
275
  if (h->nlmsg_type == NLMSG_ERROR)
276
    {
277
      nl_error(h, 0);
278
      return NULL;
279
    }
280
  return h;
281
}
282

    
283
static int
284
nl_exchange(struct nlmsghdr *pkt, int ignore_esrch)
285
{
286
  struct nlmsghdr *h;
287

    
288
  nl_send(&nl_req, pkt);
289
  for(;;)
290
    {
291
      h = nl_get_reply(&nl_req);
292
      if (h->nlmsg_type == NLMSG_ERROR)
293
        break;
294
      log(L_WARN "nl_exchange: Unexpected reply received");
295
    }
296
  return nl_error(h, ignore_esrch) ? -1 : 0;
297
}
298

    
299
/*
300
 *        Netlink attributes
301
 */
302

    
303
static int nl_attr_len;
304

    
305
static void *
306
nl_checkin(struct nlmsghdr *h, int lsize)
307
{
308
  nl_attr_len = h->nlmsg_len - NLMSG_LENGTH(lsize);
309
  if (nl_attr_len < 0)
310
    {
311
      log(L_ERR "nl_checkin: underrun by %d bytes", -nl_attr_len);
312
      return NULL;
313
    }
314
  return NLMSG_DATA(h);
315
}
316

    
317
struct nl_want_attrs {
318
  u8 defined:1;
319
  u8 checksize:1;
320
  u8 size;
321
};
322

    
323

    
324
#define BIRD_IFLA_MAX (IFLA_WIRELESS+1)
325

    
326
static struct nl_want_attrs ifla_attr_want[BIRD_IFLA_MAX] = {
327
  [IFLA_IFNAME]          = { 1, 0, 0 },
328
  [IFLA_MTU]          = { 1, 1, sizeof(u32) },
329
  [IFLA_MASTER]          = { 1, 1, sizeof(u32) },
330
  [IFLA_WIRELESS] = { 1, 0, 0 },
331
};
332

    
333

    
334
#define BIRD_IFA_MAX  (IFA_FLAGS+1)
335

    
336
static struct nl_want_attrs ifa_attr_want4[BIRD_IFA_MAX] = {
337
  [IFA_ADDRESS]          = { 1, 1, sizeof(ip4_addr) },
338
  [IFA_LOCAL]          = { 1, 1, sizeof(ip4_addr) },
339
  [IFA_BROADCAST] = { 1, 1, sizeof(ip4_addr) },
340
  [IFA_FLAGS]     = { 1, 1, sizeof(u32) },
341
};
342

    
343
static struct nl_want_attrs ifa_attr_want6[BIRD_IFA_MAX] = {
344
  [IFA_ADDRESS]          = { 1, 1, sizeof(ip6_addr) },
345
  [IFA_LOCAL]          = { 1, 1, sizeof(ip6_addr) },
346
  [IFA_FLAGS]          = { 1, 1, sizeof(u32) },
347
};
348

    
349

    
350
#define BIRD_RTA_MAX  (RTA_ENCAP+1)
351

    
352
static struct nl_want_attrs nexthop_attr_want4[BIRD_RTA_MAX] = {
353
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip4_addr) },
354
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
355
  [RTA_ENCAP]          = { 1, 0, 0 },
356
};
357

    
358
static struct nl_want_attrs nexthop_attr_want6[BIRD_RTA_MAX] = {
359
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip6_addr) },
360
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
361
  [RTA_ENCAP]          = { 1, 0, 0 },
362
};
363

    
364
static struct nl_want_attrs encap_mpls_want[BIRD_RTA_MAX] = {
365
  [RTA_DST]       = { 1, 0, 0 },
366
};
367

    
368
static struct nl_want_attrs rtm_attr_want4[BIRD_RTA_MAX] = {
369
  [RTA_DST]          = { 1, 1, sizeof(ip4_addr) },
370
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
371
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip4_addr) },
372
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
373
  [RTA_PREFSRC]          = { 1, 1, sizeof(ip4_addr) },
374
  [RTA_METRICS]          = { 1, 0, 0 },
375
  [RTA_MULTIPATH] = { 1, 0, 0 },
376
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
377
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
378
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
379
  [RTA_ENCAP]          = { 1, 0, 0 },
380
};
381

    
382
static struct nl_want_attrs rtm_attr_want6[BIRD_RTA_MAX] = {
383
  [RTA_DST]          = { 1, 1, sizeof(ip6_addr) },
384
  [RTA_IIF]          = { 1, 1, sizeof(u32) },
385
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
386
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip6_addr) },
387
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
388
  [RTA_PREFSRC]          = { 1, 1, sizeof(ip6_addr) },
389
  [RTA_METRICS]          = { 1, 0, 0 },
390
  [RTA_MULTIPATH] = { 1, 0, 0 },
391
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
392
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
393
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
394
  [RTA_ENCAP]          = { 1, 0, 0 },
395
};
396

    
397
static struct nl_want_attrs rtm_attr_want_mpls[BIRD_RTA_MAX] = {
398
  [RTA_DST]          = { 1, 1, sizeof(u32) },
399
  [RTA_IIF]          = { 1, 1, sizeof(u32) },
400
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
401
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
402
  [RTA_METRICS]          = { 1, 0, 0 },
403
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
404
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
405
  [RTA_VIA]          = { 1, 0, 0 },
406
  [RTA_NEWDST]          = { 1, 0, 0 },
407
};
408

    
409

    
410
static int
411
nl_parse_attrs(struct rtattr *a, struct nl_want_attrs *want, struct rtattr **k, int ksize)
412
{
413
  int max = ksize / sizeof(struct rtattr *);
414
  bzero(k, ksize);
415

    
416
  for ( ; RTA_OK(a, nl_attr_len); a = RTA_NEXT(a, nl_attr_len))
417
    {
418
      if ((a->rta_type >= max) || !want[a->rta_type].defined)
419
        continue;
420

    
421
      if (want[a->rta_type].checksize && (RTA_PAYLOAD(a) != want[a->rta_type].size))
422
        {
423
          log(L_ERR "nl_parse_attrs: Malformed attribute received");
424
          return 0;
425
        }
426

    
427
      k[a->rta_type] = a;
428
    }
429

    
430
  if (nl_attr_len)
431
    {
432
      log(L_ERR "nl_parse_attrs: remnant of size %d", nl_attr_len);
433
      return 0;
434
    }
435

    
436
  return 1;
437
}
438

    
439
/* Read the payload of @a as a 16-bit value without byte-order conversion */
static inline u16 rta_get_u16(struct rtattr *a)
{
  u16 *payload = RTA_DATA(a);
  return *payload;
}
441

    
442
/* Read the payload of @a as a 32-bit value without byte-order conversion */
static inline u32 rta_get_u32(struct rtattr *a)
{
  u32 *payload = RTA_DATA(a);
  return *payload;
}
444

    
445
/* Read the payload of @a as an IPv4 address and pass it through ip4_ntoh() */
static inline ip4_addr rta_get_ip4(struct rtattr *a)
{
  ip4_addr *raw = RTA_DATA(a);
  return ip4_ntoh(*raw);
}
447

    
448
/* Read the payload of @a as an IPv6 address and pass it through ip6_ntoh() */
static inline ip6_addr rta_get_ip6(struct rtattr *a)
{
  ip6_addr *raw = RTA_DATA(a);
  return ip6_ntoh(*raw);
}
450

    
451
static inline ip_addr rta_get_ipa(struct rtattr *a)
452
{
453
  if (RTA_PAYLOAD(a) == sizeof(ip4_addr))
454
    return ipa_from_ip4(rta_get_ip4(a));
455
  else
456
    return ipa_from_ip6(rta_get_ip6(a));
457
}
458

    
459
static inline ip_addr rta_get_via(struct rtattr *a)
460
{
461
  struct rtvia *v = RTA_DATA(a);
462
  switch(v->rtvia_family) {
463
    case AF_INET:  return ipa_from_ip4(ip4_ntoh(*(ip4_addr *) v->rtvia_addr));
464
    case AF_INET6: return ipa_from_ip6(ip6_ntoh(*(ip6_addr *) v->rtvia_addr));
465
  }
466
  return IPA_NONE;
467
}
468

    
469
static u32 rta_mpls_stack[MPLS_MAX_LABEL_STACK];
470
/*
 * Decode the MPLS label stack carried by @a into @stack using mpls_get()
 * and return its result.  A payload that is not a multiple of four bytes is
 * reported and the trailing bytes are ignored.
 */
static inline int rta_get_mpls(struct rtattr *a, u32 *stack)
{
  int odd = RTA_PAYLOAD(a) % 4;

  if (odd)
    log(L_WARN "KRT: Strange length of received MPLS stack: %u", RTA_PAYLOAD(a));

  /* Only whole 32-bit label entries are handed over */
  return mpls_get(RTA_DATA(a), RTA_PAYLOAD(a) & ~0x3, stack);
}
477

    
478
struct rtattr *
479
nl_add_attr(struct nlmsghdr *h, uint bufsize, uint code, const void *data, uint dlen)
480
{
481
  uint pos = NLMSG_ALIGN(h->nlmsg_len);
482
  uint len = RTA_LENGTH(dlen);
483

    
484
  if (pos + len > bufsize)
485
    bug("nl_add_attr: packet buffer overflow");
486

    
487
  struct rtattr *a = (struct rtattr *)((char *)h + pos);
488
  a->rta_type = code;
489
  a->rta_len = len;
490
  h->nlmsg_len = pos + len;
491

    
492
  if (dlen > 0)
493
    memcpy(RTA_DATA(a), data, dlen);
494

    
495
  return a;
496
}
497

    
498
static inline struct rtattr *
499
nl_open_attr(struct nlmsghdr *h, uint bufsize, uint code)
500
{
501
  return nl_add_attr(h, bufsize, code, NULL, 0);
502
}
503

    
504
/*
 * Finish nested attribute @a: its length becomes the distance from @a to
 * the aligned end of message @h.
 */
static inline void
nl_close_attr(struct nlmsghdr *h, struct rtattr *a)
{
  char *msg_end = (char *) h + NLMSG_ALIGN(h->nlmsg_len);

  a->rta_len = msg_end - (char *) a;
}
509

    
510
static inline void
511
nl_add_attr_u16(struct nlmsghdr *h, uint bufsize, int code, u16 data)
512
{
513
  nl_add_attr(h, bufsize, code, &data, 2);
514
}
515

    
516
static inline void
517
nl_add_attr_u32(struct nlmsghdr *h, uint bufsize, int code, u32 data)
518
{
519
  nl_add_attr(h, bufsize, code, &data, 4);
520
}
521

    
522
static inline void
523
nl_add_attr_ip4(struct nlmsghdr *h, uint bufsize, int code, ip4_addr ip4)
524
{
525
  ip4 = ip4_hton(ip4);
526
  nl_add_attr(h, bufsize, code, &ip4, sizeof(ip4));
527
}
528

    
529
static inline void
530
nl_add_attr_ip6(struct nlmsghdr *h, uint bufsize, int code, ip6_addr ip6)
531
{
532
  ip6 = ip6_hton(ip6);
533
  nl_add_attr(h, bufsize, code, &ip6, sizeof(ip6));
534
}
535

    
536
static inline void
537
nl_add_attr_ipa(struct nlmsghdr *h, uint bufsize, int code, ip_addr ipa)
538
{
539
  if (ipa_is_ip4(ipa))
540
    nl_add_attr_ip4(h, bufsize, code, ipa_to_ip4(ipa));
541
  else
542
    nl_add_attr_ip6(h, bufsize, code, ipa_to_ip6(ipa));
543
}
544

    
545
static inline void
546
nl_add_attr_mpls(struct nlmsghdr *h, uint bufsize, int code, int len, u32 *stack)
547
{
548
  char buf[len*4];
549
  mpls_put(buf, len, stack);
550
  nl_add_attr(h, bufsize, code, buf, len*4);
551
}
552

    
553
static inline void
554
nl_add_attr_mpls_encap(struct nlmsghdr *h, uint bufsize, int len, u32 *stack)
555
{
556
  nl_add_attr_u16(h, bufsize, RTA_ENCAP_TYPE, LWTUNNEL_ENCAP_MPLS);
557

    
558
  struct rtattr *nest = nl_open_attr(h, bufsize, RTA_ENCAP);
559
  nl_add_attr_mpls(h, bufsize, RTA_DST, len, stack);
560
  nl_close_attr(h, nest);
561
}
562

    
563
static inline void
564
nl_add_attr_via(struct nlmsghdr *h, uint bufsize, ip_addr ipa)
565
{
566
  struct rtattr *nest = nl_open_attr(h, bufsize, RTA_VIA);
567
  struct rtvia *via = RTA_DATA(nest);
568

    
569
  h->nlmsg_len += sizeof(*via);
570

    
571
  if (ipa_is_ip4(ipa))
572
  {
573
    via->rtvia_family = AF_INET;
574
    put_ip4(via->rtvia_addr, ipa_to_ip4(ipa));
575
    h->nlmsg_len += sizeof(ip4_addr);
576
  }
577
  else
578
  {
579
    via->rtvia_family = AF_INET6;
580
    put_ip6(via->rtvia_addr, ipa_to_ip6(ipa));
581
    h->nlmsg_len += sizeof(ip6_addr);
582
  }
583

    
584
  nl_close_attr(h, nest);
585
}
586

    
587
static inline struct rtnexthop *
588
nl_open_nexthop(struct nlmsghdr *h, uint bufsize)
589
{
590
  uint pos = NLMSG_ALIGN(h->nlmsg_len);
591
  uint len = RTNH_LENGTH(0);
592

    
593
  if (pos + len > bufsize)
594
    bug("nl_open_nexthop: packet buffer overflow");
595

    
596
  h->nlmsg_len = pos + len;
597

    
598
  return (void *)h + pos;
599
}
600

    
601
/*
 * Finish next hop @nh: its length becomes the distance from @nh to the
 * aligned end of message @h.
 */
static inline void
nl_close_nexthop(struct nlmsghdr *h, struct rtnexthop *nh)
{
  char *msg_end = (char *) h + NLMSG_ALIGN(h->nlmsg_len);

  nh->rtnh_len = msg_end - (char *) nh;
}
606

    
607
static inline void
608
nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af)
609
{
610
  if (nh->labels > 0)
611
    if (af == AF_MPLS)
612
      nl_add_attr_mpls(h, bufsize, RTA_NEWDST, nh->labels, nh->label);
613
    else
614
      nl_add_attr_mpls_encap(h, bufsize, nh->labels, nh->label);
615

    
616
  if (ipa_nonzero(nh->gw))
617
    if (af == AF_MPLS)
618
      nl_add_attr_via(h, bufsize, nh->gw);
619
    else
620
      nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw);
621
}
622

    
623
static void
624
nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af)
625
{
626
  struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH);
627

    
628
  for (; nh; nh = nh->next)
629
  {
630
    struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize);
631

    
632
    rtnh->rtnh_flags = 0;
633
    rtnh->rtnh_hops = nh->weight;
634
    rtnh->rtnh_ifindex = nh->iface->index;
635

    
636
    nl_add_nexthop(h, bufsize, nh, af);
637

    
638
    if (nh->flags & RNF_ONLINK)
639
      rtnh->rtnh_flags |= RTNH_F_ONLINK;
640

    
641
    nl_close_nexthop(h, rtnh);
642
  }
643

    
644
  nl_close_attr(h, a);
645
}
646

    
647
static struct nexthop *
648
nl_parse_multipath(struct krt_proto *p, struct rtattr *ra, int af)
649
{
650
  /* Temporary buffer for multicast nexthops */
651
  static struct nexthop *nh_buffer;
652
  static int nh_buf_size;        /* in number of structures */
653
  static int nh_buf_used;
654

    
655
  struct rtattr *a[BIRD_RTA_MAX];
656
  struct rtnexthop *nh = RTA_DATA(ra);
657
  struct nexthop *rv, *first, **last;
658
  unsigned len = RTA_PAYLOAD(ra);
659

    
660
  first = NULL;
661
  last = &first;
662
  nh_buf_used = 0;
663

    
664
  while (len)
665
    {
666
      /* Use RTNH_OK(nh,len) ?? */
667
      if ((len < sizeof(*nh)) || (len < nh->rtnh_len))
668
        return NULL;
669

    
670
      if (nh_buf_used == nh_buf_size)
671
      {
672
        nh_buf_size = nh_buf_size ? (nh_buf_size * 2) : 4;
673
        nh_buffer = xrealloc(nh_buffer, nh_buf_size * NEXTHOP_MAX_SIZE);
674
      }
675
      *last = rv = nh_buffer + nh_buf_used++;
676
      rv->next = NULL;
677
      last = &(rv->next);
678

    
679
      rv->flags = 0;
680
      rv->weight = nh->rtnh_hops;
681
      rv->iface = if_find_by_index(nh->rtnh_ifindex);
682
      if (!rv->iface)
683
        return NULL;
684

    
685
      /* Nonexistent RTNH_PAYLOAD ?? */
686
      nl_attr_len = nh->rtnh_len - RTNH_LENGTH(0);
687
      switch (af)
688
        {
689
        case AF_INET:
690
          if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want4, a, sizeof(a)))
691
            return NULL;
692
          break;
693

    
694
        case AF_INET6:
695
          if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want6, a, sizeof(a)))
696
            return NULL;
697
          break;
698

    
699
        default:
700
          return NULL;
701
        }
702

    
703
      if (a[RTA_GATEWAY])
704
        {
705
          rv->gw = rta_get_ipa(a[RTA_GATEWAY]);
706

    
707
          if (nh->rtnh_flags & RTNH_F_ONLINK)
708
            rv->flags |= RNF_ONLINK;
709

    
710
          neighbor *nbr;
711
          nbr = neigh_find2(&p->p, &rv->gw, rv->iface,
712
                            (rv->flags & RNF_ONLINK) ? NEF_ONLINK : 0);
713
          if (!nbr || (nbr->scope == SCOPE_HOST))
714
            return NULL;
715
        }
716
      else
717
        rv->gw = IPA_NONE;
718

    
719
      if (a[RTA_ENCAP_TYPE])
720
        {
721
          if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) {
722
            log(L_WARN "KRT: Unknown encapsulation method %d in multipath", rta_get_u16(a[RTA_ENCAP_TYPE]));
723
            return NULL;
724
          }
725

    
726
          struct rtattr *enca[BIRD_RTA_MAX];
727
          nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
728
          nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
729
          rv->labels = rta_get_mpls(enca[RTA_DST], rv->label);
730
          break;
731
        }
732

    
733

    
734
      len -= NLMSG_ALIGN(nh->rtnh_len);
735
      nh = RTNH_NEXT(nh);
736
    }
737

    
738
  return first;
739
}
740

    
741
static void
742
nl_add_metrics(struct nlmsghdr *h, uint bufsize, u32 *metrics, int max)
743
{
744
  struct rtattr *a = nl_open_attr(h, bufsize, RTA_METRICS);
745
  int t;
746

    
747
  for (t = 1; t < max; t++)
748
    if (metrics[0] & (1 << t))
749
      nl_add_attr_u32(h, bufsize, t, metrics[t]);
750

    
751
  nl_close_attr(h, a);
752
}
753

    
754
static int
755
nl_parse_metrics(struct rtattr *hdr, u32 *metrics, int max)
756
{
757
  struct rtattr *a = RTA_DATA(hdr);
758
  int len = RTA_PAYLOAD(hdr);
759

    
760
  metrics[0] = 0;
761
  for (; RTA_OK(a, len); a = RTA_NEXT(a, len))
762
  {
763
    if (a->rta_type == RTA_UNSPEC)
764
      continue;
765

    
766
    if (a->rta_type >= max)
767
      continue;
768

    
769
    if (RTA_PAYLOAD(a) != 4)
770
      return -1;
771

    
772
    metrics[0] |= 1 << a->rta_type;
773
    metrics[a->rta_type] = rta_get_u32(a);
774
  }
775

    
776
  if (len > 0)
777
    return -1;
778

    
779
  return 0;
780
}
781

    
782

    
783
/*
784
 *        Scanning of interfaces
785
 */
786

    
787
static void
788
nl_parse_link(struct nlmsghdr *h, int scan)
789
{
790
  struct ifinfomsg *i;
791
  struct rtattr *a[BIRD_IFLA_MAX];
792
  int new = h->nlmsg_type == RTM_NEWLINK;
793
  struct iface f = {};
794
  struct iface *ifi;
795
  char *name;
796
  u32 mtu, master = 0;
797
  uint fl;
798

    
799
  if (!(i = nl_checkin(h, sizeof(*i))) || !nl_parse_attrs(IFLA_RTA(i), ifla_attr_want, a, sizeof(a)))
800
    return;
801
  if (!a[IFLA_IFNAME] || (RTA_PAYLOAD(a[IFLA_IFNAME]) < 2) || !a[IFLA_MTU])
802
    {
803
      /*
804
       * IFLA_IFNAME and IFLA_MTU are required, in fact, but there may also come
805
       * a message with IFLA_WIRELESS set, where (e.g.) no IFLA_IFNAME exists.
806
       * We simply ignore all such messages with IFLA_WIRELESS without notice.
807
       */
808

    
809
      if (a[IFLA_WIRELESS])
810
        return;
811

    
812
      log(L_ERR "KIF: Malformed message received");
813
      return;
814
    }
815

    
816
  name = RTA_DATA(a[IFLA_IFNAME]);
817
  mtu = rta_get_u32(a[IFLA_MTU]);
818

    
819
  if (a[IFLA_MASTER])
820
    master = rta_get_u32(a[IFLA_MASTER]);
821

    
822
  ifi = if_find_by_index(i->ifi_index);
823
  if (!new)
824
    {
825
      DBG("KIF: IF%d(%s) goes down\n", i->ifi_index, name);
826
      if (!ifi)
827
        return;
828

    
829
      if_delete(ifi);
830
    }
831
  else
832
    {
833
      DBG("KIF: IF%d(%s) goes up (mtu=%d,flg=%x)\n", i->ifi_index, name, mtu, i->ifi_flags);
834
      if (ifi && strncmp(ifi->name, name, sizeof(ifi->name)-1))
835
        if_delete(ifi);
836

    
837
      strncpy(f.name, name, sizeof(f.name)-1);
838
      f.index = i->ifi_index;
839
      f.mtu = mtu;
840

    
841
      f.master_index = master;
842
      f.master = if_find_by_index(master);
843

    
844
      fl = i->ifi_flags;
845
      if (fl & IFF_UP)
846
        f.flags |= IF_ADMIN_UP;
847
      if (fl & IFF_LOWER_UP)
848
        f.flags |= IF_LINK_UP;
849
      if (fl & IFF_LOOPBACK)                /* Loopback */
850
        f.flags |= IF_MULTIACCESS | IF_LOOPBACK | IF_IGNORE;
851
      else if (fl & IFF_POINTOPOINT)        /* PtP */
852
        f.flags |= IF_MULTICAST;
853
      else if (fl & IFF_BROADCAST)        /* Broadcast */
854
        f.flags |= IF_MULTIACCESS | IF_BROADCAST | IF_MULTICAST;
855
      else
856
        f.flags |= IF_MULTIACCESS;        /* NBMA */
857

    
858
      if (fl & IFF_MULTICAST)
859
        f.flags |= IF_MULTICAST;
860

    
861
      ifi = if_update(&f);
862

    
863
      if (!scan)
864
        if_end_partial_update(ifi);
865
    }
866
}
867

    
868
static void
869
nl_parse_addr4(struct ifaddrmsg *i, int scan, int new)
870
{
871
  struct rtattr *a[BIRD_IFA_MAX];
872
  struct iface *ifi;
873
  u32 ifa_flags;
874
  int scope;
875

    
876
  if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want4, a, sizeof(a)))
877
    return;
878

    
879
  if (!a[IFA_LOCAL])
880
    {
881
      log(L_ERR "KIF: Malformed message received (missing IFA_LOCAL)");
882
      return;
883
    }
884
  if (!a[IFA_ADDRESS])
885
    {
886
      log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
887
      return;
888
    }
889

    
890
  ifi = if_find_by_index(i->ifa_index);
891
  if (!ifi)
892
    {
893
      log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
894
      return;
895
    }
896

    
897
  if (a[IFA_FLAGS])
898
    ifa_flags = rta_get_u32(a[IFA_FLAGS]);
899
  else
900
    ifa_flags = i->ifa_flags;
901

    
902
  struct ifa ifa;
903
  bzero(&ifa, sizeof(ifa));
904
  ifa.iface = ifi;
905
  if (ifa_flags & IFA_F_SECONDARY)
906
    ifa.flags |= IA_SECONDARY;
907

    
908
  ifa.ip = rta_get_ipa(a[IFA_LOCAL]);
909

    
910
  if (i->ifa_prefixlen > IP4_MAX_PREFIX_LENGTH)
911
    {
912
      log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
913
      new = 0;
914
    }
915
  if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH)
916
    {
917
      ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
918
      net_fill_ip4(&ifa.prefix, rta_get_ip4(a[IFA_ADDRESS]), i->ifa_prefixlen);
919

    
920
      /* It is either a host address or a peer address */
921
      if (ipa_equal(ifa.ip, ifa.brd))
922
        ifa.flags |= IA_HOST;
923
      else
924
        {
925
          ifa.flags |= IA_PEER;
926
          ifa.opposite = ifa.brd;
927
        }
928
    }
929
  else
930
    {
931
      net_fill_ip4(&ifa.prefix, ipa_to_ip4(ifa.ip), i->ifa_prefixlen);
932
      net_normalize(&ifa.prefix);
933

    
934
      if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 1)
935
        ifa.opposite = ipa_opposite_m1(ifa.ip);
936

    
937
      if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 2)
938
        ifa.opposite = ipa_opposite_m2(ifa.ip);
939

    
940
      if ((ifi->flags & IF_BROADCAST) && a[IFA_BROADCAST])
941
        {
942
          ip4_addr xbrd = rta_get_ip4(a[IFA_BROADCAST]);
943
          ip4_addr ybrd = ip4_or(ipa_to_ip4(ifa.ip), ip4_not(ip4_mkmask(i->ifa_prefixlen)));
944

    
945
          if (ip4_equal(xbrd, net4_prefix(&ifa.prefix)) || ip4_equal(xbrd, ybrd))
946
            ifa.brd = ipa_from_ip4(xbrd);
947
          else if (ifi->flags & IF_TMP_DOWN) /* Complain only during the first scan */
948
            {
949
              log(L_ERR "KIF: Invalid broadcast address %I4 for %s", xbrd, ifi->name);
950
              ifa.brd = ipa_from_ip4(ybrd);
951
            }
952
        }
953
    }
954

    
955
  scope = ipa_classify(ifa.ip);
956
  if (scope < 0)
957
    {
958
      log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
959
      return;
960
    }
961
  ifa.scope = scope & IADDR_SCOPE_MASK;
962

    
963
  DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
964
      ifi->index, ifi->name,
965
      new ? "added" : "removed",
966
      ifa.ip, ifa.flags, ifa.prefix, ifa.brd, ifa.opposite);
967

    
968
  if (new)
969
    ifa_update(&ifa);
970
  else
971
    ifa_delete(&ifa);
972

    
973
  if (!scan)
974
    if_end_partial_update(ifi);
975
}
976

    
977
static void
978
nl_parse_addr6(struct ifaddrmsg *i, int scan, int new)
979
{
980
  struct rtattr *a[BIRD_IFA_MAX];
981
  struct iface *ifi;
982
  u32 ifa_flags;
983
  int scope;
984

    
985
  if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want6, a, sizeof(a)))
986
    return;
987

    
988
  if (!a[IFA_ADDRESS])
989
    {
990
      log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
991
      return;
992
    }
993

    
994
  ifi = if_find_by_index(i->ifa_index);
995
  if (!ifi)
996
    {
997
      log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
998
      return;
999
    }
1000

    
1001
  if (a[IFA_FLAGS])
1002
    ifa_flags = rta_get_u32(a[IFA_FLAGS]);
1003
  else
1004
    ifa_flags = i->ifa_flags;
1005

    
1006
  struct ifa ifa;
1007
  bzero(&ifa, sizeof(ifa));
1008
  ifa.iface = ifi;
1009
  if (ifa_flags & IFA_F_SECONDARY)
1010
    ifa.flags |= IA_SECONDARY;
1011

    
1012
  /* Ignore tentative addresses silently */
1013
  if (ifa_flags & IFA_F_TENTATIVE)
1014
    return;
1015

    
1016
  /* IFA_LOCAL can be unset for IPv6 interfaces */
1017
  ifa.ip = rta_get_ipa(a[IFA_LOCAL] ? : a[IFA_ADDRESS]);
1018

    
1019
  if (i->ifa_prefixlen > IP6_MAX_PREFIX_LENGTH)
1020
    {
1021
      log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
1022
      new = 0;
1023
    }
1024
  if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH)
1025
    {
1026
      ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
1027
      net_fill_ip6(&ifa.prefix, rta_get_ip6(a[IFA_ADDRESS]), i->ifa_prefixlen);
1028

    
1029
      /* It is either a host address or a peer address */
1030
      if (ipa_equal(ifa.ip, ifa.brd))
1031
        ifa.flags |= IA_HOST;
1032
      else
1033
        {
1034
          ifa.flags |= IA_PEER;
1035
          ifa.opposite = ifa.brd;
1036
        }
1037
    }
1038
  else
1039
    {
1040
      net_fill_ip6(&ifa.prefix, ipa_to_ip6(ifa.ip), i->ifa_prefixlen);
1041
      net_normalize(&ifa.prefix);
1042

    
1043
      if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH - 1)
1044
        ifa.opposite = ipa_opposite_m1(ifa.ip);
1045
    }
1046

    
1047
  scope = ipa_classify(ifa.ip);
1048
  if (scope < 0)
1049
    {
1050
      log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
1051
      return;
1052
    }
1053
  ifa.scope = scope & IADDR_SCOPE_MASK;
1054

    
1055
  DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
1056
      ifi->index, ifi->name,
1057
      new ? "added" : "removed",
1058
      ifa.ip, ifa.flags, ifa.prefix, ifa.brd, ifa.opposite);
1059

    
1060
  if (new)
1061
    ifa_update(&ifa);
1062
  else
1063
    ifa_delete(&ifa);
1064

    
1065
  if (!scan)
1066
    if_end_partial_update(ifi);
1067
}
1068

    
1069
static void
1070
nl_parse_addr(struct nlmsghdr *h, int scan)
1071
{
1072
  struct ifaddrmsg *i;
1073

    
1074
  if (!(i = nl_checkin(h, sizeof(*i))))
1075
    return;
1076

    
1077
  int new = (h->nlmsg_type == RTM_NEWADDR);
1078

    
1079
  switch (i->ifa_family)
1080
    {
1081
      case AF_INET:
1082
        return nl_parse_addr4(i, scan, new);
1083

    
1084
      case AF_INET6:
1085
        return nl_parse_addr6(i, scan, new);
1086
    }
1087
}
1088

    
1089
void
1090
kif_do_scan(struct kif_proto *p UNUSED)
1091
{
1092
  struct nlmsghdr *h;
1093

    
1094
  if_start_update();
1095

    
1096
  nl_request_dump(AF_UNSPEC, RTM_GETLINK);
1097
  while (h = nl_get_scan())
1098
    if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK)
1099
      nl_parse_link(h, 1);
1100
    else
1101
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1102

    
1103
  /* Re-resolve master interface for slaves */
1104
  struct iface *i;
1105
  WALK_LIST(i, iface_list)
1106
    if (i->master_index)
1107
    {
1108
      struct iface f = {
1109
        .flags = i->flags,
1110
        .mtu = i->mtu,
1111
        .index = i->index,
1112
        .master_index = i->master_index,
1113
        .master = if_find_by_index(i->master_index)
1114
      };
1115

    
1116
      if (f.master != i->master)
1117
      {
1118
        memcpy(f.name, i->name, sizeof(f.name));
1119
        if_update(&f);
1120
      }
1121
    }
1122

    
1123
  nl_request_dump(AF_INET, RTM_GETADDR);
1124
  while (h = nl_get_scan())
1125
    if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1126
      nl_parse_addr(h, 1);
1127
    else
1128
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1129

    
1130
  nl_request_dump(AF_INET6, RTM_GETADDR);
1131
  while (h = nl_get_scan())
1132
    if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1133
      nl_parse_addr(h, 1);
1134
    else
1135
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1136

    
1137
  if_end_update();
1138
}
1139

    
1140
/*
1141
 *        Routes
1142
 */
1143

    
1144
static inline u32
1145
krt_table_id(struct krt_proto *p)
1146
{
1147
  return KRT_CF->sys.table_id;
1148
}
1149

    
1150
static HASH(struct krt_proto) nl_table_map;
1151

    
1152
#define RTH_KEY(p)                p->af, krt_table_id(p)
1153
#define RTH_NEXT(p)                p->sys.hash_next
1154
#define RTH_EQ(a1,i1,a2,i2)        a1 == a2 && i1 == i2
1155
#define RTH_FN(a,i)                a ^ u32_hash(i)
1156

    
1157
#define RTH_REHASH                rth_rehash
1158
#define RTH_PARAMS                /8, *2, 2, 2, 6, 20
1159

    
1160
HASH_DEFINE_REHASH_FN(RTH, struct krt_proto)
1161

    
1162
int
1163
krt_capable(rte *e)
1164
{
1165
  rta *a = e->attrs;
1166

    
1167
  switch (a->dest)
1168
  {
1169
    case RTD_UNICAST:
1170
    case RTD_BLACKHOLE:
1171
    case RTD_UNREACHABLE:
1172
    case RTD_PROHIBIT:
1173
      return 1;
1174

    
1175
    default:
1176
      return 0;
1177
  }
1178
}
1179

    
1180
static inline int
1181
nh_bufsize(struct nexthop *nh)
1182
{
1183
  int rv = 0;
1184
  for (; nh != NULL; nh = nh->next)
1185
    rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr)));
1186
  return rv;
1187
}
1188

    
1189
static int
1190
nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int dest, struct nexthop *nh)
1191
{
1192
  eattr *ea;
1193
  net *net = e->net;
1194
  rta *a = e->attrs;
1195
  int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh));
1196
  u32 priority = 0;
1197

    
1198
  struct {
1199
    struct nlmsghdr h;
1200
    struct rtmsg r;
1201
    char buf[0];
1202
  } *r;
1203

    
1204
  int rsize = sizeof(*r) + bufsize;
1205
  r = alloca(rsize);
1206

    
1207
  DBG("nl_send_route(%N,op=%x)\n", net->n.addr, op);
1208

    
1209
  bzero(&r->h, sizeof(r->h));
1210
  bzero(&r->r, sizeof(r->r));
1211
  r->h.nlmsg_type = op ? RTM_NEWROUTE : RTM_DELROUTE;
1212
  r->h.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
1213
  r->h.nlmsg_flags = op | NLM_F_REQUEST | NLM_F_ACK;
1214

    
1215
  r->r.rtm_family = p->af;
1216
  r->r.rtm_dst_len = net_pxlen(net->n.addr);
1217
  r->r.rtm_protocol = RTPROT_BIRD;
1218
  r->r.rtm_scope = RT_SCOPE_NOWHERE;
1219
  if (p->af == AF_MPLS)
1220
  {
1221
    u32 label = net_mpls(net->n.addr);
1222
    nl_add_attr_mpls(&r->h, rsize, RTA_DST, 1, &label);
1223
  }
1224
  else
1225
    nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(net->n.addr));
1226

    
1227
  /*
1228
   * Strange behavior for RTM_DELROUTE:
1229
   * 1) rtm_family is ignored in IPv6, works for IPv4
1230
   * 2) not setting RTA_PRIORITY is different from setting default value (on IPv6)
1231
   * 3) not setting RTA_PRIORITY is equivalent to setting 0, which is wildcard
1232
   */
1233

    
1234
  if (krt_table_id(p) < 256)
1235
    r->r.rtm_table = krt_table_id(p);
1236
  else
1237
    nl_add_attr_u32(&r->h, rsize, RTA_TABLE, krt_table_id(p));
1238

    
1239
  if (a->source == RTS_DUMMY)
1240
    priority = e->u.krt.metric;
1241
  else if (KRT_CF->sys.metric)
1242
    priority = KRT_CF->sys.metric;
1243
  else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC)))
1244
    priority = ea->u.data;
1245

    
1246
  if (priority)
1247
    nl_add_attr_u32(&r->h, rsize, RTA_PRIORITY, priority);
1248

    
1249
  /* For route delete, we do not specify remaining route attributes */
1250
  if (op == NL_OP_DELETE)
1251
    goto dest;
1252

    
1253
  /* Default scope is LINK for device routes, UNIVERSE otherwise */
1254
  if (ea = ea_find(eattrs, EA_KRT_SCOPE))
1255
    r->r.rtm_scope = ea->u.data;
1256
  else
1257
    r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1258

    
1259
  if (ea = ea_find(eattrs, EA_KRT_PREFSRC))
1260
    nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data);
1261

    
1262
  if (ea = ea_find(eattrs, EA_KRT_REALM))
1263
    nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data);
1264

    
1265

    
1266
  u32 metrics[KRT_METRICS_MAX];
1267
  metrics[0] = 0;
1268

    
1269
  struct ea_walk_state ews = { .eattrs = eattrs };
1270
  while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX))
1271
  {
1272
    int id = ea->id - EA_KRT_METRICS;
1273
    metrics[0] |= 1 << id;
1274
    metrics[id] = ea->u.data;
1275
  }
1276

    
1277
  if (metrics[0])
1278
    nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX);
1279

    
1280

    
1281
dest:
1282
  switch (dest)
1283
    {
1284
    case RTD_UNICAST:
1285
      r->r.rtm_type = RTN_UNICAST;
1286
      if (nh->next && !krt_ecmp6(p))
1287
        nl_add_multipath(&r->h, rsize, nh, p->af);
1288
      else
1289
      {
1290
        nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index);
1291
        nl_add_nexthop(&r->h, rsize, nh, p->af);
1292

    
1293
        if (nh->flags & RNF_ONLINK)
1294
          r->r.rtm_flags |= RTNH_F_ONLINK;
1295
      }
1296
      break;
1297
    case RTD_BLACKHOLE:
1298
      r->r.rtm_type = RTN_BLACKHOLE;
1299
      break;
1300
    case RTD_UNREACHABLE:
1301
      r->r.rtm_type = RTN_UNREACHABLE;
1302
      break;
1303
    case RTD_PROHIBIT:
1304
      r->r.rtm_type = RTN_PROHIBIT;
1305
      break;
1306
    case RTD_NONE:
1307
      break;
1308
    default:
1309
      bug("krt_capable inconsistent with nl_send_route");
1310
    }
1311

    
1312
  /* Ignore missing for DELETE */
1313
  return nl_exchange(&r->h, (op == NL_OP_DELETE));
1314
}
1315

    
1316
static inline int
1317
nl_add_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs)
1318
{
1319
  rta *a = e->attrs;
1320
  int err = 0;
1321

    
1322
  if (krt_ecmp6(p) && a->nh.next)
1323
  {
1324
    struct nexthop *nh = &(a->nh);
1325

    
1326
    err = nl_send_route(p, e, eattrs, NL_OP_ADD, RTD_UNICAST, nh);
1327
    if (err < 0)
1328
      return err;
1329

    
1330
    for (nh = nh->next; nh; nh = nh->next)
1331
      err += nl_send_route(p, e, eattrs, NL_OP_APPEND, RTD_UNICAST, nh);
1332

    
1333
    return err;
1334
  }
1335

    
1336
  return nl_send_route(p, e, eattrs, NL_OP_ADD, a->dest, &(a->nh));
1337
}
1338

    
1339
static inline int
1340
nl_delete_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs)
1341
{
1342
  int err = 0;
1343

    
1344
  /* For IPv6, we just repeatedly request DELETE until we get error */
1345
  do
1346
    err = nl_send_route(p, e, eattrs, NL_OP_DELETE, RTD_NONE, NULL);
1347
  while (krt_ecmp6(p) && !err);
1348

    
1349
  return err;
1350
}
1351

    
1352
void
1353
krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list *eattrs)
1354
{
1355
  int err = 0;
1356

    
1357
  /*
1358
   * We could use NL_OP_REPLACE, but route replace on Linux has some problems:
1359
   *
1360
   * 1) Does not check for matching rtm_protocol
1361
   * 2) Has broken semantics for IPv6 ECMP
1362
   * 3) Crashes some kernel version when used for IPv6 ECMP
1363
   *
1364
   * So we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the old
1365
   * route value, so we do not try to optimize IPv6 ECMP reconfigurations.
1366
   */
1367

    
1368
  if (old)
1369
    nl_delete_rte(p, old, eattrs);
1370

    
1371
  if (new)
1372
    err = nl_add_rte(p, new, eattrs);
1373

    
1374
  if (err < 0)
1375
    n->n.flags |= KRF_SYNC_ERROR;
1376
  else
1377
    n->n.flags &= ~KRF_SYNC_ERROR;
1378
}
1379

    
1380

    
1381
static inline struct nexthop *
1382
nl_alloc_nexthop(struct nl_parse_state *s, ip_addr gw, struct iface *iface, byte weight)
1383
{
1384
  struct nexthop *nh = lp_alloc(s->pool, sizeof(struct nexthop));
1385

    
1386
  nh->gw = gw;
1387
  nh->iface = iface;
1388
  nh->next = NULL;
1389
  nh->weight = weight;
1390

    
1391
  return nh;
1392
}
1393

    
1394
static int
1395
nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type)
1396
{
1397
  /* Route merging must be active */
1398
  if (!s->merge)
1399
    return 0;
1400

    
1401
  /* Saved and new route must have same network, proto/table, and priority */
1402
  if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority))
1403
    return 0;
1404

    
1405
  /* Both must be regular unicast routes */
1406
  if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST))
1407
    return 0;
1408

    
1409
  return 1;
1410
}
1411

    
1412
static void
1413
nl_announce_route(struct nl_parse_state *s)
1414
{
1415
  rte *e = rte_get_temp(s->attrs);
1416
  e->net = s->net;
1417
  e->u.krt.src = s->krt_src;
1418
  e->u.krt.proto = s->krt_proto;
1419
  e->u.krt.seen = 0;
1420
  e->u.krt.best = 0;
1421
  e->u.krt.metric = s->krt_metric;
1422

    
1423
  if (s->scan)
1424
    krt_got_route(s->proto, e);
1425
  else
1426
    krt_got_route_async(s->proto, e, s->new);
1427

    
1428
  s->net = NULL;
1429
  s->attrs = NULL;
1430
  s->proto = NULL;
1431
  lp_flush(s->pool);
1432
}
1433

    
1434
static inline void
1435
nl_parse_begin(struct nl_parse_state *s, int scan, int merge)
1436
{
1437
  memset(s, 0, sizeof (struct nl_parse_state));
1438
  s->pool = nl_linpool;
1439
  s->scan = scan;
1440
  s->merge = merge;
1441
}
1442

    
1443
static inline void
1444
nl_parse_end(struct nl_parse_state *s)
1445
{
1446
  if (s->net)
1447
    nl_announce_route(s);
1448
}
1449

    
1450

    
1451
#define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0)
1452

    
1453
static void
1454
nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
1455
{
1456
  struct krt_proto *p;
1457
  struct rtmsg *i;
1458
  struct rtattr *a[BIRD_RTA_MAX];
1459
  int new = h->nlmsg_type == RTM_NEWROUTE;
1460

    
1461
  net_addr dst;
1462
  u32 oif = ~0;
1463
  u32 table_id;
1464
  u32 priority = 0;
1465
  u32 def_scope = RT_SCOPE_UNIVERSE;
1466
  int src;
1467

    
1468
  if (!(i = nl_checkin(h, sizeof(*i))))
1469
    return;
1470

    
1471
  switch (i->rtm_family)
1472
    {
1473
    case AF_INET:
1474
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want4, a, sizeof(a)))
1475
        return;
1476

    
1477
      if (a[RTA_DST])
1478
        net_fill_ip4(&dst, rta_get_ip4(a[RTA_DST]), i->rtm_dst_len);
1479
      else
1480
        net_fill_ip4(&dst, IP4_NONE, 0);
1481
      break;
1482

    
1483
    case AF_INET6:
1484
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want6, a, sizeof(a)))
1485
        return;
1486

    
1487
      if (a[RTA_DST])
1488
        net_fill_ip6(&dst, rta_get_ip6(a[RTA_DST]), i->rtm_dst_len);
1489
      else
1490
        net_fill_ip6(&dst, IP6_NONE, 0);
1491
      break;
1492

    
1493
    case AF_MPLS:
1494
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want_mpls, a, sizeof(a)))
1495
        return;
1496

    
1497
      if (!a[RTA_DST])
1498
        SKIP("MPLS route without RTA_DST");
1499

    
1500
      if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) != 1)
1501
        SKIP("MPLS route with multi-label RTA_DST");
1502

    
1503
      net_fill_mpls(&dst, rta_mpls_stack[0]);
1504
      break;
1505

    
1506
    default:
1507
      return;
1508
    }
1509

    
1510
  if (a[RTA_OIF])
1511
    oif = rta_get_u32(a[RTA_OIF]);
1512

    
1513
  if (a[RTA_TABLE])
1514
    table_id = rta_get_u32(a[RTA_TABLE]);
1515
  else
1516
    table_id = i->rtm_table;
1517

    
1518
  /* Do we know this table? */
1519
  p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id);
1520
  if (!p)
1521
    SKIP("unknown table %d\n", table);
1522

    
1523
  if (a[RTA_IIF])
1524
    SKIP("IIF set\n");
1525

    
1526
  if (i->rtm_tos != 0)                        /* We don't support TOS */
1527
    SKIP("TOS %02x\n", i->rtm_tos);
1528

    
1529
  if (s->scan && !new)
1530
    SKIP("RTM_DELROUTE in scan\n");
1531

    
1532
  if (a[RTA_PRIORITY])
1533
    priority = rta_get_u32(a[RTA_PRIORITY]);
1534

    
1535
  int c = net_classify(&dst);
1536
  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
1537
    SKIP("strange class/scope\n");
1538

    
1539
  switch (i->rtm_protocol)
1540
    {
1541
    case RTPROT_UNSPEC:
1542
      SKIP("proto unspec\n");
1543

    
1544
    case RTPROT_REDIRECT:
1545
      src = KRT_SRC_REDIRECT;
1546
      break;
1547

    
1548
    case RTPROT_KERNEL:
1549
      src = KRT_SRC_KERNEL;
1550
      return;
1551

    
1552
    case RTPROT_BIRD:
1553
      if (!s->scan)
1554
        SKIP("echo\n");
1555
      src = KRT_SRC_BIRD;
1556
      break;
1557

    
1558
    case RTPROT_BOOT:
1559
    default:
1560
      src = KRT_SRC_ALIEN;
1561
    }
1562

    
1563
  net *net = net_get(p->p.main_channel->table, &dst);
1564

    
1565
  if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type))
1566
    nl_announce_route(s);
1567

    
1568
  rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
1569
  ra->src = p->p.main_source;
1570
  ra->source = RTS_INHERIT;
1571
  ra->scope = SCOPE_UNIVERSE;
1572

    
1573
  switch (i->rtm_type)
1574
    {
1575
    case RTN_UNICAST:
1576
      ra->dest = RTD_UNICAST;
1577

    
1578
      if (a[RTA_MULTIPATH])
1579
        {
1580
          struct nexthop *nh = nl_parse_multipath(p, a[RTA_MULTIPATH], i->rtm_family);
1581
          if (!nh)
1582
            {
1583
              log(L_ERR "KRT: Received strange multipath route %N", net->n.addr);
1584
              return;
1585
            }
1586

    
1587
          ra->nh = *nh;
1588
          break;
1589
        }
1590

    
1591
      ra->nh.iface = if_find_by_index(oif);
1592
      if (!ra->nh.iface)
1593
        {
1594
          log(L_ERR "KRT: Received route %N with unknown ifindex %u", net->n.addr, oif);
1595
          return;
1596
        }
1597

    
1598
      if ((i->rtm_family != AF_MPLS) && a[RTA_GATEWAY] || (i->rtm_family == AF_MPLS) && a[RTA_VIA])
1599
        {
1600
          if (i->rtm_family == AF_MPLS)
1601
            ra->nh.gw = rta_get_via(a[RTA_VIA]);
1602
          else
1603
            ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]);
1604

    
1605
          /* Silently skip strange 6to4 routes */
1606
          const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96);
1607
          if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit))
1608
            return;
1609

    
1610
          if (i->rtm_flags & RTNH_F_ONLINK)
1611
            ra->nh.flags |= RNF_ONLINK;
1612

    
1613
          neighbor *nbr;
1614
          nbr = neigh_find2(&p->p, &(ra->nh.gw), ra->nh.iface,
1615
                            (ra->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0);
1616
          if (!nbr || (nbr->scope == SCOPE_HOST))
1617
            {
1618
              log(L_ERR "KRT: Received route %N with strange next-hop %I", net->n.addr,
1619
                  ra->nh.gw);
1620
              return;
1621
            }
1622
        }
1623

    
1624
      break;
1625
    case RTN_BLACKHOLE:
1626
      ra->dest = RTD_BLACKHOLE;
1627
      break;
1628
    case RTN_UNREACHABLE:
1629
      ra->dest = RTD_UNREACHABLE;
1630
      break;
1631
    case RTN_PROHIBIT:
1632
      ra->dest = RTD_PROHIBIT;
1633
      break;
1634
    /* FIXME: What about RTN_THROW? */
1635
    default:
1636
      SKIP("type %d\n", i->rtm_type);
1637
      return;
1638
    }
1639

    
1640
  int labels = 0;
1641
  if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next)
1642
    labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label);
1643

    
1644
  if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next)
1645
    {
1646
      switch (rta_get_u16(a[RTA_ENCAP_TYPE]))
1647
        {
1648
          case LWTUNNEL_ENCAP_MPLS:
1649
            {
1650
              struct rtattr *enca[BIRD_RTA_MAX];
1651
              nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
1652
              nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
1653
              labels = rta_get_mpls(enca[RTA_DST], ra->nh.label);
1654
              break;
1655
            }
1656
          default:
1657
            SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE]));
1658
            break;
1659
        }
1660
    }
1661

    
1662
  if (labels < 0)
1663
  {
1664
    log(L_WARN "KRT: Too long MPLS stack received, ignoring.");
1665
    ra->nh.labels = 0;
1666
  }
1667
  else
1668
    ra->nh.labels = labels;
1669

    
1670
  rte *e = rte_get_temp(ra);
1671
  e->net = net;
1672
  e->u.krt.src = src;
1673
  e->u.krt.proto = i->rtm_protocol;
1674
  e->u.krt.seen = 0;
1675
  e->u.krt.best = 0;
1676
  e->u.krt.metric = 0;
1677

    
1678
  if (i->rtm_scope != def_scope)
1679
    {
1680
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1681
      ea->next = ra->eattrs;
1682
      ra->eattrs = ea;
1683
      ea->flags = EALF_SORTED;
1684
      ea->count = 1;
1685
      ea->attrs[0].id = EA_KRT_SCOPE;
1686
      ea->attrs[0].flags = 0;
1687
      ea->attrs[0].type = EAF_TYPE_INT;
1688
      ea->attrs[0].u.data = i->rtm_scope;
1689
    }
1690

    
1691
  if (a[RTA_PRIORITY])
1692
    e->u.krt.metric = rta_get_u32(a[RTA_PRIORITY]);
1693

    
1694
  if (a[RTA_PREFSRC])
1695
    {
1696
      ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
1697

    
1698
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1699
      ea->next = ra->eattrs;
1700
      ra->eattrs = ea;
1701
      ea->flags = EALF_SORTED;
1702
      ea->count = 1;
1703
      ea->attrs[0].id = EA_KRT_PREFSRC;
1704
      ea->attrs[0].flags = 0;
1705
      ea->attrs[0].type = EAF_TYPE_IP_ADDRESS;
1706
      ea->attrs[0].u.ptr = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps));
1707
      ea->attrs[0].u.ptr->length = sizeof(ps);
1708
      memcpy(ea->attrs[0].u.ptr->data, &ps, sizeof(ps));
1709
    }
1710

    
1711
  if (a[RTA_FLOW])
1712
    {
1713
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1714
      ea->next = ra->eattrs;
1715
      ra->eattrs = ea;
1716
      ea->flags = EALF_SORTED;
1717
      ea->count = 1;
1718
      ea->attrs[0].id = EA_KRT_REALM;
1719
      ea->attrs[0].flags = 0;
1720
      ea->attrs[0].type = EAF_TYPE_INT;
1721
      ea->attrs[0].u.data = rta_get_u32(a[RTA_FLOW]);
1722
    }
1723

    
1724
  if (a[RTA_METRICS])
1725
    {
1726
      u32 metrics[KRT_METRICS_MAX];
1727
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
1728
      int t, n = 0;
1729

    
1730
      if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0)
1731
        {
1732
          log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net->n.addr);
1733
          return;
1734
        }
1735

    
1736
      for (t = 1; t < KRT_METRICS_MAX; t++)
1737
        if (metrics[0] & (1 << t))
1738
          {
1739
            ea->attrs[n].id = EA_CODE(EAP_KRT, KRT_METRICS_OFFSET + t);
1740
            ea->attrs[n].flags = 0;
1741
            ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */
1742
            ea->attrs[n].u.data = metrics[t];
1743
            n++;
1744
          }
1745

    
1746
      if (n > 0)
1747
        {
1748
          ea->next = ra->eattrs;
1749
          ea->flags = EALF_SORTED;
1750
          ea->count = n;
1751
          ra->eattrs = ea;
1752
        }
1753
    }
1754

    
1755
  /*
1756
   * Ideally, now we would send the received route to the rest of kernel code.
1757
   * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we
1758
   * postpone it and merge next hops until the end of the sequence. Note that
1759
   * proper multipath updates are rejected by nl_mergable_route(), so it is
1760
   * always the first case for them.
1761
   */
1762

    
1763
  if (!s->net)
1764
  {
1765
    /* Store the new route */
1766
    s->net = net;
1767
    s->attrs = ra;
1768
    s->proto = p;
1769
    s->new = new;
1770
    s->krt_src = src;
1771
    s->krt_type = i->rtm_type;
1772
    s->krt_proto = i->rtm_protocol;
1773
    s->krt_metric = priority;
1774
  }
1775
  else
1776
  {
1777
    /* Merge next hops with the stored route */
1778
    rta *oa = s->attrs;
1779

    
1780
    struct nexthop *nhs = &oa->nh;
1781
    nexthop_insert(&nhs, &ra->nh);
1782

    
1783
    /* Perhaps new nexthop is inserted at the first position */
1784
    if (nhs == &ra->nh)
1785
    {
1786
      /* Swap rtas */
1787
      s->attrs = ra;
1788

    
1789
      /* Keep old eattrs */
1790
      ra->eattrs = oa->eattrs;
1791
    }
1792
  }
1793
}
1794

    
1795
void
1796
krt_do_scan(struct krt_proto *p UNUSED)        /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
1797
{
1798
  struct nlmsghdr *h;
1799
  struct nl_parse_state s;
1800

    
1801
  nl_parse_begin(&s, 1, 0);
1802
  nl_request_dump(AF_INET, RTM_GETROUTE);
1803
  while (h = nl_get_scan())
1804
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1805
      nl_parse_route(&s, h);
1806
    else
1807
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1808
  nl_parse_end(&s);
1809

    
1810
  nl_parse_begin(&s, 1, 1);
1811
  nl_request_dump(AF_INET6, RTM_GETROUTE);
1812
  while (h = nl_get_scan())
1813
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1814
      nl_parse_route(&s, h);
1815
    else
1816
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1817
  nl_parse_end(&s);
1818

    
1819
  nl_parse_begin(&s, 1, 1);
1820
  nl_request_dump(AF_MPLS, RTM_GETROUTE);
1821
  while (h = nl_get_scan())
1822
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1823
      nl_parse_route(&s, h);
1824
    else
1825
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1826
  nl_parse_end(&s);
1827
}
1828

    
1829
/*
1830
 *        Asynchronous Netlink interface
1831
 */
1832

    
1833
static sock *nl_async_sk;                /* BIRD socket for asynchronous notifications */
1834
static byte *nl_async_rx_buffer;        /* Receive buffer */
1835

    
1836
static void
1837
nl_async_msg(struct nlmsghdr *h)
1838
{
1839
  struct nl_parse_state s;
1840

    
1841
  switch (h->nlmsg_type)
1842
    {
1843
    case RTM_NEWROUTE:
1844
    case RTM_DELROUTE:
1845
      DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
1846
      nl_parse_begin(&s, 0, 0);
1847
      nl_parse_route(&s, h);
1848
      nl_parse_end(&s);
1849
      break;
1850
    case RTM_NEWLINK:
1851
    case RTM_DELLINK:
1852
      DBG("KRT: Received async link notification (%d)\n", h->nlmsg_type);
1853
      if (kif_proto)
1854
        nl_parse_link(h, 0);
1855
      break;
1856
    case RTM_NEWADDR:
1857
    case RTM_DELADDR:
1858
      DBG("KRT: Received async address notification (%d)\n", h->nlmsg_type);
1859
      if (kif_proto)
1860
        nl_parse_addr(h, 0);
1861
      break;
1862
    default:
1863
      DBG("KRT: Received unknown async notification (%d)\n", h->nlmsg_type);
1864
    }
1865
}
1866

    
1867
static int
1868
nl_async_hook(sock *sk, uint size UNUSED)
1869
{
1870
  struct iovec iov = { nl_async_rx_buffer, NL_RX_SIZE };
1871
  struct sockaddr_nl sa;
1872
  struct msghdr m = {
1873
    .msg_name = &sa,
1874
    .msg_namelen = sizeof(sa),
1875
    .msg_iov = &iov,
1876
    .msg_iovlen = 1,
1877
  };
1878
  struct nlmsghdr *h;
1879
  int x;
1880
  uint len;
1881

    
1882
  x = recvmsg(sk->fd, &m, 0);
1883
  if (x < 0)
1884
    {
1885
      if (errno == ENOBUFS)
1886
        {
1887
          /*
1888
           *  Netlink reports some packets have been thrown away.
1889
           *  One day we might react to it by asking for route table
1890
           *  scan in near future.
1891
           */
1892
          log(L_WARN "Kernel dropped some netlink messages, will resync on next scan.");
1893
          return 1;        /* More data are likely to be ready */
1894
        }
1895
      else if (errno != EWOULDBLOCK)
1896
        log(L_ERR "Netlink recvmsg: %m");
1897
      return 0;
1898
    }
1899
  if (sa.nl_pid)                /* It isn't from the kernel */
1900
    {
1901
      DBG("Non-kernel packet\n");
1902
      return 1;
1903
    }
1904
  h = (void *) nl_async_rx_buffer;
1905
  len = x;
1906
  if (m.msg_flags & MSG_TRUNC)
1907
    {
1908
      log(L_WARN "Netlink got truncated asynchronous message");
1909
      return 1;
1910
    }
1911
  while (NLMSG_OK(h, len))
1912
    {
1913
      nl_async_msg(h);
1914
      h = NLMSG_NEXT(h, len);
1915
    }
1916
  if (len)
1917
    log(L_WARN "nl_async_hook: Found packet remnant of size %d", len);
1918
  return 1;
1919
}
1920

    
1921
static void
1922
nl_async_err_hook(sock *sk, int e UNUSED)
1923
{
1924
  nl_async_hook(sk, 0);
1925
}
1926

    
1927
static void
1928
nl_open_async(void)
1929
{
1930
  sock *sk;
1931
  struct sockaddr_nl sa;
1932
  int fd;
1933

    
1934
  if (nl_async_sk)
1935
    return;
1936

    
1937
  DBG("KRT: Opening async netlink socket\n");
1938

    
1939
  fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
1940
  if (fd < 0)
1941
    {
1942
      log(L_ERR "Unable to open asynchronous rtnetlink socket: %m");
1943
      return;
1944
    }
1945

    
1946
  bzero(&sa, sizeof(sa));
1947
  sa.nl_family = AF_NETLINK;
1948
  sa.nl_groups = RTMGRP_LINK |
1949
    RTMGRP_IPV4_IFADDR | RTMGRP_IPV4_ROUTE |
1950
    RTMGRP_IPV6_IFADDR | RTMGRP_IPV6_ROUTE;
1951

    
1952
  if (bind(fd, (struct sockaddr *) &sa, sizeof(sa)) < 0)
1953
    {
1954
      log(L_ERR "Unable to bind asynchronous rtnetlink socket: %m");
1955
      close(fd);
1956
      return;
1957
    }
1958

    
1959
  nl_async_rx_buffer = xmalloc(NL_RX_SIZE);
1960

    
1961
  sk = nl_async_sk = sk_new(krt_pool);
1962
  sk->type = SK_MAGIC;
1963
  sk->rx_hook = nl_async_hook;
1964
  sk->err_hook = nl_async_err_hook;
1965
  sk->fd = fd;
1966
  if (sk_open(sk) < 0)
1967
    bug("Netlink: sk_open failed");
1968
}
1969

    
1970

    
1971
/*
1972
 *        Interface to the UNIX krt module
1973
 */
1974

    
1975
void
1976
krt_sys_io_init(void)
1977
{
1978
  nl_linpool = lp_new_default(krt_pool);
1979
  HASH_INIT(nl_table_map, krt_pool, 6);
1980
}
1981

    
1982
int
1983
krt_sys_start(struct krt_proto *p)
1984
{
1985
  struct krt_proto *old = HASH_FIND(nl_table_map, RTH, p->af, krt_table_id(p));
1986

    
1987
  if (old)
1988
    {
1989
      log(L_ERR "%s: Kernel table %u already registered by %s",
1990
          p->p.name, krt_table_id(p), old->p.name);
1991
      return 0;
1992
    }
1993

    
1994
  HASH_INSERT2(nl_table_map, RTH, krt_pool, p);
1995

    
1996
  nl_open();
1997
  nl_open_async();
1998

    
1999
  return 1;
2000
}
2001

    
2002
void
2003
krt_sys_shutdown(struct krt_proto *p)
2004
{
2005
  HASH_REMOVE2(nl_table_map, RTH, krt_pool, p);
2006
}
2007

    
2008
int
2009
krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o)
2010
{
2011
  return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric);
2012
}
2013

    
2014
void
2015
krt_sys_init_config(struct krt_config *cf)
2016
{
2017
  cf->sys.table_id = RT_TABLE_MAIN;
2018
  cf->sys.metric = 32;
2019
}
2020

    
2021
void
2022
krt_sys_copy_config(struct krt_config *d, struct krt_config *s)
2023
{
2024
  d->sys.table_id = s->sys.table_id;
2025
  d->sys.metric = s->sys.metric;
2026
}
2027

    
2028
static const char *krt_metrics_names[KRT_METRICS_MAX] = {
2029
  NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss",
2030
  "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack"
2031
};
2032

    
2033
static const char *krt_features_names[KRT_FEATURES_MAX] = {
2034
  "ecn", NULL, NULL, "allfrag"
2035
};
2036

    
2037
int
2038
krt_sys_get_attr(eattr *a, byte *buf, int buflen UNUSED)
2039
{
2040
  switch (a->id)
2041
  {
2042
  case EA_KRT_PREFSRC:
2043
    bsprintf(buf, "prefsrc");
2044
    return GA_NAME;
2045

    
2046
  case EA_KRT_REALM:
2047
    bsprintf(buf, "realm");
2048
    return GA_NAME;
2049

    
2050
  case EA_KRT_SCOPE:
2051
    bsprintf(buf, "scope");
2052
    return GA_NAME;
2053

    
2054
  case EA_KRT_LOCK:
2055
    buf += bsprintf(buf, "lock:");
2056
    ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
2057
    return GA_FULL;
2058

    
2059
  case EA_KRT_FEATURES:
2060
    buf += bsprintf(buf, "features:");
2061
    ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX);
2062
    return GA_FULL;
2063

    
2064
  default:;
2065
    int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET;
2066
    if (id > 0 && id < KRT_METRICS_MAX)
2067
    {
2068
      bsprintf(buf, "%s", krt_metrics_names[id]);
2069
      return GA_NAME;
2070
    }
2071

    
2072
    return GA_UNKNOWN;
2073
  }
2074
}
2075

    
2076

    
2077

    
2078
void
2079
kif_sys_start(struct kif_proto *p UNUSED)
2080
{
2081
  nl_open();
2082
  nl_open_async();
2083
}
2084

    
2085
void
2086
kif_sys_shutdown(struct kif_proto *p UNUSED)
2087
{
2088
}
2089

    
2090
int
2091
kif_update_sysdep_addr(struct iface *i UNUSED)
2092
{
2093
  return 0;
2094
}