Statistics
| Branch: | Revision:

iof-bird-daemon / sysdep / linux / netlink.c @ d14f8c3c

History | View | Annotate | Download (47.1 KB)

1
/*
2
 *        BIRD -- Linux Netlink Interface
3
 *
4
 *        (c) 1999--2000 Martin Mares <mj@ucw.cz>
5
 *
6
 *        Can be freely distributed and used under the terms of the GNU GPL.
7
 */
8

    
9
#include <alloca.h>
10
#include <stdio.h>
11
#include <unistd.h>
12
#include <fcntl.h>
13
#include <sys/socket.h>
14
#include <sys/uio.h>
15
#include <errno.h>
16

    
17
#undef LOCAL_DEBUG
18

    
19
#include "nest/bird.h"
20
#include "nest/route.h"
21
#include "nest/protocol.h"
22
#include "nest/iface.h"
23
#include "lib/alloca.h"
24
#include "sysdep/unix/timer.h"
25
#include "sysdep/unix/unix.h"
26
#include "sysdep/unix/krt.h"
27
#include "lib/socket.h"
28
#include "lib/string.h"
29
#include "lib/hash.h"
30
#include "conf/conf.h"
31

    
32
#include <asm/types.h>
33
#include <linux/if.h>
34
#include <linux/lwtunnel.h>
35
#include <linux/netlink.h>
36
#include <linux/rtnetlink.h>
37

    
38

    
39
#ifndef MSG_TRUNC                        /* Hack: Several versions of glibc miss this one :( */
40
#define MSG_TRUNC 0x20
41
#endif
42

    
43
#ifndef IFA_FLAGS
44
#define IFA_FLAGS 8
45
#endif
46

    
47
#ifndef IFF_LOWER_UP
48
#define IFF_LOWER_UP 0x10000
49
#endif
50

    
51
#ifndef RTA_TABLE
52
#define RTA_TABLE  15
53
#endif
54

    
55
#ifndef RTA_VIA
56
#define RTA_VIA         18
57
#endif
58

    
59
#ifndef RTA_NEWDST
60
#define RTA_NEWDST  19
61
#endif
62

    
63
#ifndef RTA_ENCAP_TYPE
64
#define RTA_ENCAP_TYPE        21
65
#endif
66

    
67
#ifndef RTA_ENCAP
68
#define RTA_ENCAP  22
69
#endif
70

    
71
#define krt_ecmp6(p) ((p)->af == AF_INET6)
72

    
73
/*
 * Structure nl_parse_state keeps the state of received route processing.
 * Ideally, we could just independently parse received Netlink messages and
 * immediately propagate received routes to the rest of BIRD, but the Linux
 * kernel represents and announces IPv6 ECMP routes not as one route with
 * multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a set of routes
 * with the same prefix.
 *
 * Therefore, BIRD keeps the currently processed route in the nl_parse_state
 * structure and postpones its propagation until we expect it to be final;
 * i.e., when a non-matching route is received or when the scan ends. When
 * another matching route is received, it is merged with the already processed
 * route to form an ECMP route. Note that merging is done only for IPv6
 * (merge == 1), but the postponing is done in both cases (for simplicity).
 * All IPv4 routes are just considered non-matching.
 *
 * This is ignored for asynchronous notifications (every notification is
 * handled as a separate route). It is not an issue for our routes, as we
 * ignore such notifications anyway. But importing alien IPv6 ECMP routes does
 * not work properly.
 */
93

    
94
struct nl_parse_state
95
{
96
  struct linpool *pool;
97
  int scan;
98
  int merge;
99

    
100
  net *net;
101
  rta *attrs;
102
  struct krt_proto *proto;
103
  s8 new;
104
  s8 krt_src;
105
  u8 krt_type;
106
  u8 krt_proto;
107
  u32 krt_metric;
108
};
109

    
110
/*
111
 *        Synchronous Netlink interface
112
 */
113

    
114
struct nl_sock
115
{
116
  int fd;
117
  u32 seq;
118
  byte *rx_buffer;                        /* Receive buffer */
119
  struct nlmsghdr *last_hdr;                /* Recently received packet */
120
  uint last_size;
121
};
122

    
123
#define NL_RX_SIZE 8192
124

    
125
#define NL_OP_DELETE        0
126
#define NL_OP_ADD        (NLM_F_CREATE|NLM_F_EXCL)
127
#define NL_OP_REPLACE        (NLM_F_CREATE|NLM_F_REPLACE)
128
#define NL_OP_APPEND        (NLM_F_CREATE|NLM_F_APPEND)
129

    
130
static linpool *nl_linpool;
131

    
132
static struct nl_sock nl_scan = {.fd = -1};        /* Netlink socket for synchronous scan */
133
static struct nl_sock nl_req  = {.fd = -1};        /* Netlink socket for requests */
134

    
135
static void
136
nl_open_sock(struct nl_sock *nl)
137
{
138
  if (nl->fd < 0)
139
    {
140
      nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
141
      if (nl->fd < 0)
142
        die("Unable to open rtnetlink socket: %m");
143
      nl->seq = now;
144
      nl->rx_buffer = xmalloc(NL_RX_SIZE);
145
      nl->last_hdr = NULL;
146
      nl->last_size = 0;
147
    }
148
}
149

    
150
static void
151
nl_open(void)
152
{
153
  nl_open_sock(&nl_scan);
154
  nl_open_sock(&nl_req);
155
}
156

    
157
static void
158
nl_send(struct nl_sock *nl, struct nlmsghdr *nh)
159
{
160
  struct sockaddr_nl sa;
161

    
162
  memset(&sa, 0, sizeof(sa));
163
  sa.nl_family = AF_NETLINK;
164
  nh->nlmsg_pid = 0;
165
  nh->nlmsg_seq = ++(nl->seq);
166
  if (sendto(nl->fd, nh, nh->nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa)) < 0)
167
    die("rtnetlink sendto: %m");
168
  nl->last_hdr = NULL;
169
}
170

    
171
static void
172
nl_request_dump(int af, int cmd)
173
{
174
  struct {
175
    struct nlmsghdr nh;
176
    struct rtgenmsg g;
177
  } req = {
178
    .nh.nlmsg_type = cmd,
179
    .nh.nlmsg_len = sizeof(req),
180
    .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
181
    .g.rtgen_family = af
182
  };
183
  nl_send(&nl_scan, &req.nh);
184
}
185

    
186
static struct nlmsghdr *
187
nl_get_reply(struct nl_sock *nl)
188
{
189
  for(;;)
190
    {
191
      if (!nl->last_hdr)
192
        {
193
          struct iovec iov = { nl->rx_buffer, NL_RX_SIZE };
194
          struct sockaddr_nl sa;
195
          struct msghdr m = {
196
            .msg_name = &sa,
197
            .msg_namelen = sizeof(sa),
198
            .msg_iov = &iov,
199
            .msg_iovlen = 1,
200
          };
201
          int x = recvmsg(nl->fd, &m, 0);
202
          if (x < 0)
203
            die("nl_get_reply: %m");
204
          if (sa.nl_pid)                /* It isn't from the kernel */
205
            {
206
              DBG("Non-kernel packet\n");
207
              continue;
208
            }
209
          nl->last_size = x;
210
          nl->last_hdr = (void *) nl->rx_buffer;
211
          if (m.msg_flags & MSG_TRUNC)
212
            bug("nl_get_reply: got truncated reply which should be impossible");
213
        }
214
      if (NLMSG_OK(nl->last_hdr, nl->last_size))
215
        {
216
          struct nlmsghdr *h = nl->last_hdr;
217
          nl->last_hdr = NLMSG_NEXT(h, nl->last_size);
218
          if (h->nlmsg_seq != nl->seq)
219
            {
220
              log(L_WARN "nl_get_reply: Ignoring out of sequence netlink packet (%x != %x)",
221
                  h->nlmsg_seq, nl->seq);
222
              continue;
223
            }
224
          return h;
225
        }
226
      if (nl->last_size)
227
        log(L_WARN "nl_get_reply: Found packet remnant of size %d", nl->last_size);
228
      nl->last_hdr = NULL;
229
    }
230
}
231

    
232
static struct tbf rl_netlink_err = TBF_DEFAULT_LOG_LIMITS;
233

    
234
static int
235
nl_error(struct nlmsghdr *h, int ignore_esrch)
236
{
237
  struct nlmsgerr *e;
238
  int ec;
239

    
240
  if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr)))
241
    {
242
      log(L_WARN "Netlink: Truncated error message received");
243
      return ENOBUFS;
244
    }
245
  e = (struct nlmsgerr *) NLMSG_DATA(h);
246
  ec = -e->error;
247
  if (ec && !(ignore_esrch && (ec == ESRCH)))
248
    log_rl(&rl_netlink_err, L_WARN "Netlink: %s", strerror(ec));
249
  return ec;
250
}
251

    
252
static struct nlmsghdr *
253
nl_get_scan(void)
254
{
255
  struct nlmsghdr *h = nl_get_reply(&nl_scan);
256

    
257
  if (h->nlmsg_type == NLMSG_DONE)
258
    return NULL;
259
  if (h->nlmsg_type == NLMSG_ERROR)
260
    {
261
      nl_error(h, 0);
262
      return NULL;
263
    }
264
  return h;
265
}
266

    
267
static int
268
nl_exchange(struct nlmsghdr *pkt, int ignore_esrch)
269
{
270
  struct nlmsghdr *h;
271

    
272
  nl_send(&nl_req, pkt);
273
  for(;;)
274
    {
275
      h = nl_get_reply(&nl_req);
276
      if (h->nlmsg_type == NLMSG_ERROR)
277
        break;
278
      log(L_WARN "nl_exchange: Unexpected reply received");
279
    }
280
  return nl_error(h, ignore_esrch) ? -1 : 0;
281
}
282

    
283
/*
284
 *        Netlink attributes
285
 */
286

    
287
static int nl_attr_len;
288

    
289
static void *
290
nl_checkin(struct nlmsghdr *h, int lsize)
291
{
292
  nl_attr_len = h->nlmsg_len - NLMSG_LENGTH(lsize);
293
  if (nl_attr_len < 0)
294
    {
295
      log(L_ERR "nl_checkin: underrun by %d bytes", -nl_attr_len);
296
      return NULL;
297
    }
298
  return NLMSG_DATA(h);
299
}
300

    
301
struct nl_want_attrs {
302
  u8 defined:1;
303
  u8 checksize:1;
304
  u8 size;
305
};
306

    
307

    
308
#define BIRD_IFLA_MAX (IFLA_WIRELESS+1)
309

    
310
static struct nl_want_attrs ifla_attr_want[BIRD_IFLA_MAX] = {
311
  [IFLA_IFNAME]          = { 1, 0, 0 },
312
  [IFLA_MTU]          = { 1, 1, sizeof(u32) },
313
  [IFLA_WIRELESS] = { 1, 0, 0 },
314
};
315

    
316

    
317
#define BIRD_IFA_MAX  (IFA_FLAGS+1)
318

    
319
static struct nl_want_attrs ifa_attr_want4[BIRD_IFA_MAX] = {
320
  [IFA_ADDRESS]          = { 1, 1, sizeof(ip4_addr) },
321
  [IFA_LOCAL]          = { 1, 1, sizeof(ip4_addr) },
322
  [IFA_BROADCAST] = { 1, 1, sizeof(ip4_addr) },
323
};
324

    
325
static struct nl_want_attrs ifa_attr_want6[BIRD_IFA_MAX] = {
326
  [IFA_ADDRESS]          = { 1, 1, sizeof(ip6_addr) },
327
  [IFA_LOCAL]          = { 1, 1, sizeof(ip6_addr) },
328
  [IFA_FLAGS]          = { 1, 1, sizeof(u32) },
329
};
330

    
331

    
332
#define BIRD_RTA_MAX  (RTA_ENCAP+1)
333

    
334
static struct nl_want_attrs nexthop_attr_want4[BIRD_RTA_MAX] = {
335
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip4_addr) },
336
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
337
  [RTA_ENCAP]          = { 1, 0, 0 },
338
};
339

    
340
/* Attributes extracted from the nested payload of an MPLS RTA_ENCAP */
static struct nl_want_attrs encap_mpls_want[BIRD_RTA_MAX] = {
  [RTA_DST] = { .defined = 1 },
};
343

    
344
static struct nl_want_attrs rtm_attr_want4[BIRD_RTA_MAX] = {
345
  [RTA_DST]          = { 1, 1, sizeof(ip4_addr) },
346
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
347
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip4_addr) },
348
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
349
  [RTA_PREFSRC]          = { 1, 1, sizeof(ip4_addr) },
350
  [RTA_METRICS]          = { 1, 0, 0 },
351
  [RTA_MULTIPATH] = { 1, 0, 0 },
352
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
353
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
354
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
355
  [RTA_ENCAP]          = { 1, 0, 0 },
356
};
357

    
358
static struct nl_want_attrs rtm_attr_want6[BIRD_RTA_MAX] = {
359
  [RTA_DST]          = { 1, 1, sizeof(ip6_addr) },
360
  [RTA_IIF]          = { 1, 1, sizeof(u32) },
361
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
362
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip6_addr) },
363
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
364
  [RTA_PREFSRC]          = { 1, 1, sizeof(ip6_addr) },
365
  [RTA_METRICS]          = { 1, 0, 0 },
366
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
367
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
368
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
369
  [RTA_ENCAP]          = { 1, 0, 0 },
370
};
371

    
372
static struct nl_want_attrs rtm_attr_want_mpls[BIRD_RTA_MAX] = {
373
  [RTA_DST]          = { 1, 1, sizeof(u32) },
374
  [RTA_IIF]          = { 1, 1, sizeof(u32) },
375
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
376
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
377
  [RTA_METRICS]          = { 1, 0, 0 },
378
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
379
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
380
  [RTA_VIA]          = { 1, 0, 0 },
381
  [RTA_NEWDST]          = { 1, 0, 0 },
382
};
383

    
384

    
385
static int
386
nl_parse_attrs(struct rtattr *a, struct nl_want_attrs *want, struct rtattr **k, int ksize)
387
{
388
  int max = ksize / sizeof(struct rtattr *);
389
  bzero(k, ksize);
390

    
391
  for ( ; RTA_OK(a, nl_attr_len); a = RTA_NEXT(a, nl_attr_len))
392
    {
393
      if ((a->rta_type >= max) || !want[a->rta_type].defined)
394
        continue;
395

    
396
      if (want[a->rta_type].checksize && (RTA_PAYLOAD(a) != want[a->rta_type].size))
397
        {
398
          log(L_ERR "nl_parse_attrs: Malformed attribute received");
399
          return 0;
400
        }
401

    
402
      k[a->rta_type] = a;
403
    }
404

    
405
  if (nl_attr_len)
406
    {
407
      log(L_ERR "nl_parse_attrs: remnant of size %d", nl_attr_len);
408
      return 0;
409
    }
410

    
411
  return 1;
412
}
413

    
414
/* Read the payload of attribute @a as a u16 (no length check, no byte swap) */
static inline u16 rta_get_u16(struct rtattr *a)
{
  const u16 *val = RTA_DATA(a);
  return *val;
}
416

    
417
/* Read the payload of attribute @a as a u32 (no length check, no byte swap) */
static inline u32 rta_get_u32(struct rtattr *a)
{
  const u32 *val = RTA_DATA(a);
  return *val;
}
419

    
420
/* Read the payload of attribute @a as an IPv4 address, converted by ip4_ntoh() */
static inline ip4_addr rta_get_ip4(struct rtattr *a)
{
  const ip4_addr *raw = RTA_DATA(a);
  return ip4_ntoh(*raw);
}
422

    
423
/* Read the payload of attribute @a as an IPv6 address, converted by ip6_ntoh() */
static inline ip6_addr rta_get_ip6(struct rtattr *a)
{
  const ip6_addr *raw = RTA_DATA(a);
  return ip6_ntoh(*raw);
}
425

    
426
static inline ip_addr rta_get_ipa(struct rtattr *a)
427
{
428
  if (RTA_PAYLOAD(a) == sizeof(ip4_addr))
429
    return ipa_from_ip4(rta_get_ip4(a));
430
  else
431
    return ipa_from_ip6(rta_get_ip6(a));
432
}
433

    
434
static inline ip_addr rta_get_via(struct rtattr *a)
435
{
436
  struct rtvia *v = RTA_DATA(a);
437
  switch(v->rtvia_family) {
438
    case AF_INET:  return ipa_from_ip4(ip4_ntoh(*(ip4_addr *) v->rtvia_addr));
439
    case AF_INET6: return ipa_from_ip6(ip6_ntoh(*(ip6_addr *) v->rtvia_addr));
440
  }
441
  return IPA_NONE;
442
}
443

    
444
static u32 rta_mpls_stack[MPLS_MAX_LABEL_STACK];
445
/*
 * Decode the MPLS label stack held in attribute @a into @stack by means of
 * mpls_get() and return its result. A payload length that is not a multiple
 * of four is reported and the odd trailing bytes are left out.
 */
static inline int rta_get_mpls(struct rtattr *a, u32 *stack)
{
  if ((RTA_PAYLOAD(a) % 4) != 0)
    log(L_WARN "KRT: Strange length of received MPLS stack: %u", RTA_PAYLOAD(a));

  /* Hand over whole 4-byte label entries only */
  return mpls_get(RTA_DATA(a), RTA_PAYLOAD(a) & ~0x3, stack);
}
452

    
453
struct rtattr *
454
nl_add_attr(struct nlmsghdr *h, uint bufsize, uint code, const void *data, uint dlen)
455
{
456
  uint pos = NLMSG_ALIGN(h->nlmsg_len);
457
  uint len = RTA_LENGTH(dlen);
458

    
459
  if (pos + len > bufsize)
460
    bug("nl_add_attr: packet buffer overflow");
461

    
462
  struct rtattr *a = (struct rtattr *)((char *)h + pos);
463
  a->rta_type = code;
464
  a->rta_len = len;
465
  h->nlmsg_len = pos + len;
466

    
467
  if (dlen > 0)
468
    memcpy(RTA_DATA(a), data, dlen);
469

    
470
  return a;
471
}
472

    
473
static inline struct rtattr *
474
nl_open_attr(struct nlmsghdr *h, uint bufsize, uint code)
475
{
476
  return nl_add_attr(h, bufsize, code, NULL, 0);
477
}
478

    
479
/*
 * Finish attribute @a opened by nl_open_attr(): its length becomes the
 * distance from its header to the aligned current end of message @h.
 */
static inline void
nl_close_attr(struct nlmsghdr *h, struct rtattr *a)
{
  char *msg_end = (char *) h + NLMSG_ALIGN(h->nlmsg_len);
  a->rta_len = msg_end - (char *) a;
}
484

    
485
static inline void
486
nl_add_attr_u16(struct nlmsghdr *h, uint bufsize, int code, u16 data)
487
{
488
  nl_add_attr(h, bufsize, code, &data, 2);
489
}
490

    
491
static inline void
492
nl_add_attr_u32(struct nlmsghdr *h, uint bufsize, int code, u32 data)
493
{
494
  nl_add_attr(h, bufsize, code, &data, 4);
495
}
496

    
497
static inline void
498
nl_add_attr_ip4(struct nlmsghdr *h, uint bufsize, int code, ip4_addr ip4)
499
{
500
  ip4 = ip4_hton(ip4);
501
  nl_add_attr(h, bufsize, code, &ip4, sizeof(ip4));
502
}
503

    
504
static inline void
505
nl_add_attr_ip6(struct nlmsghdr *h, uint bufsize, int code, ip6_addr ip6)
506
{
507
  ip6 = ip6_hton(ip6);
508
  nl_add_attr(h, bufsize, code, &ip6, sizeof(ip6));
509
}
510

    
511
static inline void
512
nl_add_attr_ipa(struct nlmsghdr *h, uint bufsize, int code, ip_addr ipa)
513
{
514
  if (ipa_is_ip4(ipa))
515
    nl_add_attr_ip4(h, bufsize, code, ipa_to_ip4(ipa));
516
  else
517
    nl_add_attr_ip6(h, bufsize, code, ipa_to_ip6(ipa));
518
}
519

    
520
static inline void
521
nl_add_attr_mpls(struct nlmsghdr *h, uint bufsize, int code, int len, u32 *stack)
522
{
523
  char buf[len*4];
524
  mpls_put(buf, len, stack);
525
  nl_add_attr(h, bufsize, code, buf, len*4);
526
}
527

    
528
static inline void
529
nl_add_attr_mpls_encap(struct nlmsghdr *h, uint bufsize, int len, u32 *stack)
530
{
531
  nl_add_attr_u16(h, bufsize, RTA_ENCAP_TYPE, LWTUNNEL_ENCAP_MPLS);
532

    
533
  struct rtattr *nest = nl_open_attr(h, bufsize, RTA_ENCAP);
534
  nl_add_attr_mpls(h, bufsize, RTA_DST, len, stack);
535
  nl_close_attr(h, nest);
536
}
537

    
538
static inline void
539
nl_add_attr_via(struct nlmsghdr *h, uint bufsize, ip_addr ipa)
540
{
541
  struct rtattr *nest = nl_open_attr(h, bufsize, RTA_VIA);
542
  struct rtvia *via = RTA_DATA(nest);
543

    
544
  h->nlmsg_len += sizeof(*via);
545

    
546
  if (ipa_is_ip4(ipa)) {
547
    ip4_addr ip4 = ipa_to_ip4(ipa);
548
    ip4 = ip4_hton(ip4);
549
    via->rtvia_family = AF_INET;
550
    memcpy(via->rtvia_addr, &ip4, sizeof(ip4));
551
    h->nlmsg_len += sizeof(ip4);
552
  } else {
553
    ip6_addr ip6 = ipa_to_ip6(ipa);
554
    ip6 = ip6_hton(ip6);
555
    via->rtvia_family = AF_INET6;
556
    memcpy(via->rtvia_addr, &ip6, sizeof(ip6));
557
    h->nlmsg_len += sizeof(ip6);
558
  }
559

    
560
  nl_close_attr(h, nest);
561
}
562

    
563
static inline struct rtnexthop *
564
nl_open_nexthop(struct nlmsghdr *h, uint bufsize)
565
{
566
  uint pos = NLMSG_ALIGN(h->nlmsg_len);
567
  uint len = RTNH_LENGTH(0);
568

    
569
  if (pos + len > bufsize)
570
    bug("nl_open_nexthop: packet buffer overflow");
571

    
572
  h->nlmsg_len = pos + len;
573

    
574
  return (void *)h + pos;
575
}
576

    
577
/*
 * Finish nexthop record @nh opened by nl_open_nexthop(): its length becomes
 * the distance from its header to the aligned current end of message @h.
 */
static inline void
nl_close_nexthop(struct nlmsghdr *h, struct rtnexthop *nh)
{
  char *msg_end = (char *) h + NLMSG_ALIGN(h->nlmsg_len);
  nh->rtnh_len = msg_end - (char *) nh;
}
582

    
583
static inline void
584
nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af)
585
{
586
  if (nh->labels > 0)
587
    if (af == AF_MPLS)
588
      nl_add_attr_mpls(h, bufsize, RTA_NEWDST, nh->labels, nh->label);
589
    else
590
      nl_add_attr_mpls_encap(h, bufsize, nh->labels, nh->label);
591

    
592
  if (ipa_nonzero(nh->gw))
593
    if (af == AF_MPLS)
594
      nl_add_attr_via(h, bufsize, nh->gw);
595
    else
596
      nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw);
597
}
598

    
599
static void
600
nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af)
601
{
602
  struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH);
603

    
604
  for (; nh; nh = nh->next)
605
  {
606
    struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize);
607

    
608
    rtnh->rtnh_flags = 0;
609
    rtnh->rtnh_hops = nh->weight;
610
    rtnh->rtnh_ifindex = nh->iface->index;
611

    
612
    nl_add_nexthop(h, bufsize, nh, af);
613

    
614
    nl_close_nexthop(h, rtnh);
615
  }
616

    
617
  nl_close_attr(h, a);
618
}
619

    
620
static struct nexthop *
621
nl_parse_multipath(struct krt_proto *p, struct rtattr *ra)
622
{
623
  /* Temporary buffer for multicast nexthops */
624
  static struct nexthop *nh_buffer;
625
  static int nh_buf_size;        /* in number of structures */
626
  static int nh_buf_used;
627

    
628
  struct rtattr *a[BIRD_RTA_MAX];
629
  struct rtnexthop *nh = RTA_DATA(ra);
630
  struct nexthop *rv, *first, **last;
631
  unsigned len = RTA_PAYLOAD(ra);
632

    
633
  first = NULL;
634
  last = &first;
635
  nh_buf_used = 0;
636

    
637
  while (len)
638
    {
639
      /* Use RTNH_OK(nh,len) ?? */
640
      if ((len < sizeof(*nh)) || (len < nh->rtnh_len))
641
        return NULL;
642

    
643
      if (nh_buf_used == nh_buf_size)
644
      {
645
        nh_buf_size = nh_buf_size ? (nh_buf_size * 2) : 4;
646
        nh_buffer = xrealloc(nh_buffer, nh_buf_size * NEXTHOP_MAX_SIZE);
647
      }
648
      *last = rv = nh_buffer + nh_buf_used++;
649
      rv->next = NULL;
650
      last = &(rv->next);
651

    
652
      rv->weight = nh->rtnh_hops;
653
      rv->iface = if_find_by_index(nh->rtnh_ifindex);
654
      if (!rv->iface)
655
        return NULL;
656

    
657
      /* Nonexistent RTNH_PAYLOAD ?? */
658
      nl_attr_len = nh->rtnh_len - RTNH_LENGTH(0);
659
      nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want4, a, sizeof(a));
660
      if (a[RTA_GATEWAY])
661
        {
662
          rv->gw = rta_get_ipa(a[RTA_GATEWAY]);
663

    
664
          neighbor *nbr;
665
          nbr = neigh_find2(&p->p, &rv->gw, rv->iface,
666
                            (nh->rtnh_flags & RTNH_F_ONLINK) ? NEF_ONLINK : 0);
667
          if (!nbr || (nbr->scope == SCOPE_HOST))
668
            return NULL;
669
        }
670
      else
671
        rv->gw = IPA_NONE;
672
      if (a[RTA_ENCAP_TYPE])
673
        {
674
          if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) {
675
            log(L_WARN "KRT: Unknown encapsulation method %d in multipath", rta_get_u16(a[RTA_ENCAP_TYPE]));
676
            return NULL;
677
          }
678

    
679
          struct rtattr *enca[BIRD_RTA_MAX];
680
          nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
681
          nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
682
          rv->labels = rta_get_mpls(enca[RTA_DST], rv->label);
683
          break;
684
        }
685

    
686

    
687
      len -= NLMSG_ALIGN(nh->rtnh_len);
688
      nh = RTNH_NEXT(nh);
689
    }
690

    
691
  return first;
692
}
693

    
694
static void
695
nl_add_metrics(struct nlmsghdr *h, uint bufsize, u32 *metrics, int max)
696
{
697
  struct rtattr *a = nl_open_attr(h, bufsize, RTA_METRICS);
698
  int t;
699

    
700
  for (t = 1; t < max; t++)
701
    if (metrics[0] & (1 << t))
702
      nl_add_attr_u32(h, bufsize, t, metrics[t]);
703

    
704
  nl_close_attr(h, a);
705
}
706

    
707
static int
708
nl_parse_metrics(struct rtattr *hdr, u32 *metrics, int max)
709
{
710
  struct rtattr *a = RTA_DATA(hdr);
711
  int len = RTA_PAYLOAD(hdr);
712

    
713
  metrics[0] = 0;
714
  for (; RTA_OK(a, len); a = RTA_NEXT(a, len))
715
  {
716
    if (a->rta_type == RTA_UNSPEC)
717
      continue;
718

    
719
    if (a->rta_type >= max)
720
      continue;
721

    
722
    if (RTA_PAYLOAD(a) != 4)
723
      return -1;
724

    
725
    metrics[0] |= 1 << a->rta_type;
726
    metrics[a->rta_type] = rta_get_u32(a);
727
  }
728

    
729
  if (len > 0)
730
    return -1;
731

    
732
  return 0;
733
}
734

    
735

    
736
/*
737
 *        Scanning of interfaces
738
 */
739

    
740
static void
741
nl_parse_link(struct nlmsghdr *h, int scan)
742
{
743
  struct ifinfomsg *i;
744
  struct rtattr *a[BIRD_IFLA_MAX];
745
  int new = h->nlmsg_type == RTM_NEWLINK;
746
  struct iface f = {};
747
  struct iface *ifi;
748
  char *name;
749
  u32 mtu;
750
  uint fl;
751

    
752
  if (!(i = nl_checkin(h, sizeof(*i))) || !nl_parse_attrs(IFLA_RTA(i), ifla_attr_want, a, sizeof(a)))
753
    return;
754
  if (!a[IFLA_IFNAME] || (RTA_PAYLOAD(a[IFLA_IFNAME]) < 2) || !a[IFLA_MTU])
755
    {
756
      /*
757
       * IFLA_IFNAME and IFLA_MTU are required, in fact, but there may also come
758
       * a message with IFLA_WIRELESS set, where (e.g.) no IFLA_IFNAME exists.
759
       * We simply ignore all such messages with IFLA_WIRELESS without notice.
760
       */
761

    
762
      if (a[IFLA_WIRELESS])
763
        return;
764

    
765
      log(L_ERR "KIF: Malformed message received");
766
      return;
767
    }
768

    
769
  name = RTA_DATA(a[IFLA_IFNAME]);
770
  mtu = rta_get_u32(a[IFLA_MTU]);
771

    
772
  ifi = if_find_by_index(i->ifi_index);
773
  if (!new)
774
    {
775
      DBG("KIF: IF%d(%s) goes down\n", i->ifi_index, name);
776
      if (!ifi)
777
        return;
778

    
779
      if_delete(ifi);
780
    }
781
  else
782
    {
783
      DBG("KIF: IF%d(%s) goes up (mtu=%d,flg=%x)\n", i->ifi_index, name, mtu, i->ifi_flags);
784
      if (ifi && strncmp(ifi->name, name, sizeof(ifi->name)-1))
785
        if_delete(ifi);
786

    
787
      strncpy(f.name, name, sizeof(f.name)-1);
788
      f.index = i->ifi_index;
789
      f.mtu = mtu;
790

    
791
      fl = i->ifi_flags;
792
      if (fl & IFF_UP)
793
        f.flags |= IF_ADMIN_UP;
794
      if (fl & IFF_LOWER_UP)
795
        f.flags |= IF_LINK_UP;
796
      if (fl & IFF_LOOPBACK)                /* Loopback */
797
        f.flags |= IF_MULTIACCESS | IF_LOOPBACK | IF_IGNORE;
798
      else if (fl & IFF_POINTOPOINT)        /* PtP */
799
        f.flags |= IF_MULTICAST;
800
      else if (fl & IFF_BROADCAST)        /* Broadcast */
801
        f.flags |= IF_MULTIACCESS | IF_BROADCAST | IF_MULTICAST;
802
      else
803
        f.flags |= IF_MULTIACCESS;        /* NBMA */
804

    
805
      if (fl & IFF_MULTICAST)
806
        f.flags |= IF_MULTICAST;
807

    
808
      ifi = if_update(&f);
809

    
810
      if (!scan)
811
        if_end_partial_update(ifi);
812
    }
813
}
814

    
815
static void
816
nl_parse_addr4(struct ifaddrmsg *i, int scan, int new)
817
{
818
  struct rtattr *a[BIRD_IFA_MAX];
819
  struct iface *ifi;
820
  u32 ifa_flags;
821
  int scope;
822

    
823
  if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want4, a, sizeof(a)))
824
    return;
825

    
826
  if (!a[IFA_LOCAL])
827
    {
828
      log(L_ERR "KIF: Malformed message received (missing IFA_LOCAL)");
829
      return;
830
    }
831
  if (!a[IFA_ADDRESS])
832
    {
833
      log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
834
      return;
835
    }
836

    
837
  ifi = if_find_by_index(i->ifa_index);
838
  if (!ifi)
839
    {
840
      log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
841
      return;
842
    }
843

    
844
  if (a[IFA_FLAGS])
845
    ifa_flags = rta_get_u32(a[IFA_FLAGS]);
846
  else
847
    ifa_flags = i->ifa_flags;
848

    
849
  struct ifa ifa;
850
  bzero(&ifa, sizeof(ifa));
851
  ifa.iface = ifi;
852
  if (ifa_flags & IFA_F_SECONDARY)
853
    ifa.flags |= IA_SECONDARY;
854

    
855
  ifa.ip = rta_get_ipa(a[IFA_LOCAL]);
856

    
857
  if (i->ifa_prefixlen > IP4_MAX_PREFIX_LENGTH)
858
    {
859
      log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
860
      new = 0;
861
    }
862
  if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH)
863
    {
864
      ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
865
      net_fill_ip4(&ifa.prefix, rta_get_ip4(a[IFA_ADDRESS]), i->ifa_prefixlen);
866

    
867
      /* It is either a host address or a peer address */
868
      if (ipa_equal(ifa.ip, ifa.brd))
869
        ifa.flags |= IA_HOST;
870
      else
871
        {
872
          ifa.flags |= IA_PEER;
873
          ifa.opposite = ifa.brd;
874
        }
875
    }
876
  else
877
    {
878
      net_fill_ip4(&ifa.prefix, ipa_to_ip4(ifa.ip), i->ifa_prefixlen);
879
      net_normalize(&ifa.prefix);
880

    
881
      if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 1)
882
        ifa.opposite = ipa_opposite_m1(ifa.ip);
883

    
884
      if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 2)
885
        ifa.opposite = ipa_opposite_m2(ifa.ip);
886

    
887
      if ((ifi->flags & IF_BROADCAST) && a[IFA_BROADCAST])
888
        {
889
          ip4_addr xbrd = rta_get_ip4(a[IFA_BROADCAST]);
890
          ip4_addr ybrd = ip4_or(ipa_to_ip4(ifa.ip), ip4_not(ip4_mkmask(i->ifa_prefixlen)));
891

    
892
          if (ip4_equal(xbrd, net4_prefix(&ifa.prefix)) || ip4_equal(xbrd, ybrd))
893
            ifa.brd = ipa_from_ip4(xbrd);
894
          else if (ifi->flags & IF_TMP_DOWN) /* Complain only during the first scan */
895
            {
896
              log(L_ERR "KIF: Invalid broadcast address %I4 for %s", xbrd, ifi->name);
897
              ifa.brd = ipa_from_ip4(ybrd);
898
            }
899
        }
900
    }
901

    
902
  scope = ipa_classify(ifa.ip);
903
  if (scope < 0)
904
    {
905
      log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
906
      return;
907
    }
908
  ifa.scope = scope & IADDR_SCOPE_MASK;
909

    
910
  DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
911
      ifi->index, ifi->name,
912
      new ? "added" : "removed",
913
      ifa.ip, ifa.flags, ifa.prefix, ifa.brd, ifa.opposite);
914

    
915
  if (new)
916
    ifa_update(&ifa);
917
  else
918
    ifa_delete(&ifa);
919

    
920
  if (!scan)
921
    if_end_partial_update(ifi);
922
}
923

    
924
static void
925
nl_parse_addr6(struct ifaddrmsg *i, int scan, int new)
926
{
927
  struct rtattr *a[BIRD_IFA_MAX];
928
  struct iface *ifi;
929
  u32 ifa_flags;
930
  int scope;
931

    
932
  if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want6, a, sizeof(a)))
933
    return;
934

    
935
  if (!a[IFA_ADDRESS])
936
    {
937
      log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
938
      return;
939
    }
940

    
941
  ifi = if_find_by_index(i->ifa_index);
942
  if (!ifi)
943
    {
944
      log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
945
      return;
946
    }
947

    
948
  if (a[IFA_FLAGS])
949
    ifa_flags = rta_get_u32(a[IFA_FLAGS]);
950
  else
951
    ifa_flags = i->ifa_flags;
952

    
953
  struct ifa ifa;
954
  bzero(&ifa, sizeof(ifa));
955
  ifa.iface = ifi;
956
  if (ifa_flags & IFA_F_SECONDARY)
957
    ifa.flags |= IA_SECONDARY;
958

    
959
  /* Ignore tentative addresses silently */
960
  if (ifa_flags & IFA_F_TENTATIVE)
961
    return;
962

    
963
  /* IFA_LOCAL can be unset for IPv6 interfaces */
964
  ifa.ip = rta_get_ipa(a[IFA_LOCAL] ? : a[IFA_ADDRESS]);
965

    
966
  if (i->ifa_prefixlen > IP6_MAX_PREFIX_LENGTH)
967
    {
968
      log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
969
      new = 0;
970
    }
971
  if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH)
972
    {
973
      ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
974
      net_fill_ip6(&ifa.prefix, rta_get_ip6(a[IFA_ADDRESS]), i->ifa_prefixlen);
975

    
976
      /* It is either a host address or a peer address */
977
      if (ipa_equal(ifa.ip, ifa.brd))
978
        ifa.flags |= IA_HOST;
979
      else
980
        {
981
          ifa.flags |= IA_PEER;
982
          ifa.opposite = ifa.brd;
983
        }
984
    }
985
  else
986
    {
987
      net_fill_ip6(&ifa.prefix, ipa_to_ip6(ifa.ip), i->ifa_prefixlen);
988
      net_normalize(&ifa.prefix);
989

    
990
      if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH - 1)
991
        ifa.opposite = ipa_opposite_m1(ifa.ip);
992
    }
993

    
994
  scope = ipa_classify(ifa.ip);
995
  if (scope < 0)
996
    {
997
      log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
998
      return;
999
    }
1000
  ifa.scope = scope & IADDR_SCOPE_MASK;
1001

    
1002
  DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
1003
      ifi->index, ifi->name,
1004
      new ? "added" : "removed",
1005
      ifa.ip, ifa.flags, ifa.prefix, ifa.brd, ifa.opposite);
1006

    
1007
  if (new)
1008
    ifa_update(&ifa);
1009
  else
1010
    ifa_delete(&ifa);
1011

    
1012
  if (!scan)
1013
    if_end_partial_update(ifi);
1014
}
1015

    
1016
/*
 * Process one RTM_NEWADDR / RTM_DELADDR message @h by handing its payload to
 * the parser for its address family. Messages that are too short or of
 * another family are dropped.
 */
static void
nl_parse_addr(struct nlmsghdr *h, int scan)
{
  struct ifaddrmsg *i = nl_checkin(h, sizeof(*i));

  if (!i)
    return;

  int new = (h->nlmsg_type == RTM_NEWADDR);

  if (i->ifa_family == AF_INET)
    nl_parse_addr4(i, scan, new);
  else if (i->ifa_family == AF_INET6)
    nl_parse_addr6(i, scan, new);
}
1035

    
1036
void
1037
kif_do_scan(struct kif_proto *p UNUSED)
1038
{
1039
  struct nlmsghdr *h;
1040

    
1041
  if_start_update();
1042

    
1043
  nl_request_dump(AF_UNSPEC, RTM_GETLINK);
1044
  while (h = nl_get_scan())
1045
    if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK)
1046
      nl_parse_link(h, 1);
1047
    else
1048
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1049

    
1050
  nl_request_dump(AF_INET, RTM_GETADDR);
1051
  while (h = nl_get_scan())
1052
    if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1053
      nl_parse_addr(h, 1);
1054
    else
1055
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1056

    
1057
  nl_request_dump(AF_INET6, RTM_GETADDR);
1058
  while (h = nl_get_scan())
1059
    if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1060
      nl_parse_addr(h, 1);
1061
    else
1062
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1063

    
1064
  if_end_update();
1065
}
1066

    
1067
/*
1068
 *        Routes
1069
 */
1070

    
1071
static inline u32
1072
krt_table_id(struct krt_proto *p)
1073
{
1074
  return KRT_CF->sys.table_id;
1075
}
1076

    
1077
static HASH(struct krt_proto) nl_table_map;
1078

    
1079
#define RTH_KEY(p)                p->af, krt_table_id(p)
1080
#define RTH_NEXT(p)                p->sys.hash_next
1081
#define RTH_EQ(a1,i1,a2,i2)        a1 == a2 && i1 == i2
1082
#define RTH_FN(a,i)                a ^ u32_hash(i)
1083

    
1084
#define RTH_REHASH                rth_rehash
1085
#define RTH_PARAMS                /8, *2, 2, 2, 6, 20
1086

    
1087
HASH_DEFINE_REHASH_FN(RTH, struct krt_proto)
1088

    
1089
int
1090
krt_capable(rte *e)
1091
{
1092
  rta *a = e->attrs;
1093

    
1094
  switch (a->dest)
1095
    {
1096
    case RTD_UNICAST:
1097
      for (struct nexthop *nh = &(a->nh); nh; nh = nh->next)
1098
        if (nh->iface)
1099
          return 1;
1100
      return 0;
1101
    case RTD_BLACKHOLE:
1102
    case RTD_UNREACHABLE:
1103
    case RTD_PROHIBIT:
1104
      break;
1105
    default:
1106
      return 0;
1107
    }
1108
  return 1;
1109
}
1110

    
1111
static inline int
1112
nh_bufsize(struct nexthop *nh)
1113
{
1114
  int rv = 0;
1115
  for (; nh != NULL; nh = nh->next)
1116
    rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr)));
1117
  return rv;
1118
}
1119

    
1120
static int
1121
nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int dest, struct nexthop *nh)
1122
{
1123
  eattr *ea;
1124
  net *net = e->net;
1125
  rta *a = e->attrs;
1126
  int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh));
1127
  u32 priority = 0;
1128

    
1129
  struct {
1130
    struct nlmsghdr h;
1131
    struct rtmsg r;
1132
    char buf[0];
1133
  } *r;
1134

    
1135
  int rsize = sizeof(*r) + bufsize;
1136
  r = alloca(rsize);
1137

    
1138
  DBG("nl_send_route(%N,op=%x)\n", net->n.addr, op);
1139

    
1140
  bzero(&r->h, sizeof(r->h));
1141
  bzero(&r->r, sizeof(r->r));
1142
  r->h.nlmsg_type = op ? RTM_NEWROUTE : RTM_DELROUTE;
1143
  r->h.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
1144
  r->h.nlmsg_flags = op | NLM_F_REQUEST | NLM_F_ACK;
1145

    
1146
  r->r.rtm_family = p->af;
1147
  r->r.rtm_dst_len = net_pxlen(net->n.addr);
1148
  r->r.rtm_protocol = RTPROT_BIRD;
1149
  r->r.rtm_scope = RT_SCOPE_UNIVERSE;
1150
  if (p->af == AF_MPLS)
1151
  {
1152
    u32 label = net_mpls(net->n.addr);
1153
    nl_add_attr_mpls(&r->h, rsize, RTA_DST, 1, &label);
1154
  }
1155
  else
1156
    nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(net->n.addr));
1157

    
1158
  /*
1159
   * Strange behavior for RTM_DELROUTE:
1160
   * 1) rtm_family is ignored in IPv6, works for IPv4
1161
   * 2) not setting RTA_PRIORITY is different from setting default value (on IPv6)
1162
   * 3) not setting RTA_PRIORITY is equivalent to setting 0, which is wildcard
1163
   */
1164

    
1165
  if (krt_table_id(p) < 256)
1166
    r->r.rtm_table = krt_table_id(p);
1167
  else
1168
    nl_add_attr_u32(&r->h, rsize, RTA_TABLE, krt_table_id(p));
1169

    
1170
  if (a->source == RTS_DUMMY)
1171
    priority = e->u.krt.metric;
1172
  else if (KRT_CF->sys.metric)
1173
    priority = KRT_CF->sys.metric;
1174
  else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC)))
1175
    priority = ea->u.data;
1176

    
1177
  if (priority)
1178
    nl_add_attr_u32(&r->h, sizeof(r), RTA_PRIORITY, priority);
1179

    
1180
  /* For route delete, we do not specify remaining route attributes */
1181
  if (op == NL_OP_DELETE)
1182
    goto dest;
1183

    
1184
  /* Default scope is LINK for device routes, UNIVERSE otherwise */
1185
  if (ea = ea_find(eattrs, EA_KRT_SCOPE))
1186
    r->r.rtm_scope = ea->u.data;
1187
  else
1188
    r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1189

    
1190
  if (ea = ea_find(eattrs, EA_KRT_PREFSRC))
1191
    nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data);
1192

    
1193
  if (ea = ea_find(eattrs, EA_KRT_REALM))
1194
    nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data);
1195

    
1196

    
1197
  u32 metrics[KRT_METRICS_MAX];
1198
  metrics[0] = 0;
1199

    
1200
  struct ea_walk_state ews = { .eattrs = eattrs };
1201
  while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX))
1202
  {
1203
    int id = ea->id - EA_KRT_METRICS;
1204
    metrics[0] |= 1 << id;
1205
    metrics[id] = ea->u.data;
1206
  }
1207

    
1208
  if (metrics[0])
1209
    nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX);
1210

    
1211

    
1212
dest:
1213
  /* a->iface != NULL checked in krt_capable() for router and device routes */
1214
  switch (dest)
1215
    {
1216
    case RTD_UNICAST:
1217
      r->r.rtm_type = RTN_UNICAST;
1218
      if (nh->next && !krt_ecmp6(p))
1219
        nl_add_multipath(&r->h, rsize, nh, p->af);
1220
      else
1221
      {
1222
        nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index);
1223
        nl_add_nexthop(&r->h, rsize, nh, p->af);
1224
      }
1225
      break;
1226
    case RTD_BLACKHOLE:
1227
      r->r.rtm_type = RTN_BLACKHOLE;
1228
      break;
1229
    case RTD_UNREACHABLE:
1230
      r->r.rtm_type = RTN_UNREACHABLE;
1231
      break;
1232
    case RTD_PROHIBIT:
1233
      r->r.rtm_type = RTN_PROHIBIT;
1234
      break;
1235
    case RTD_NONE:
1236
      break;
1237
    default:
1238
      bug("krt_capable inconsistent with nl_send_route");
1239
    }
1240

    
1241
  /* Ignore missing for DELETE */
1242
  return nl_exchange(&r->h, (op == NL_OP_DELETE));
1243
}
1244

    
1245
static inline int
1246
nl_add_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs)
1247
{
1248
  rta *a = e->attrs;
1249
  int err = 0;
1250

    
1251
  if (krt_ecmp6(p) && a->nh.next)
1252
  {
1253
    struct nexthop *nh = &(a->nh);
1254

    
1255
    err = nl_send_route(p, e, eattrs, NL_OP_ADD, RTD_UNICAST, nh);
1256
    if (err < 0)
1257
      return err;
1258

    
1259
    for (nh = nh->next; nh; nh = nh->next)
1260
      err += nl_send_route(p, e, eattrs, NL_OP_APPEND, RTD_UNICAST, nh);
1261

    
1262
    return err;
1263
  }
1264

    
1265
  return nl_send_route(p, e, eattrs, NL_OP_ADD, a->dest, &(a->nh));
1266
}
1267

    
1268
static inline int
1269
nl_delete_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs)
1270
{
1271
  int err = 0;
1272

    
1273
  /* For IPv6, we just repeatedly request DELETE until we get error */
1274
  do
1275
    err = nl_send_route(p, e, eattrs, NL_OP_DELETE, RTD_NONE, NULL);
1276
  while (krt_ecmp6(p) && !err);
1277

    
1278
  return err;
1279
}
1280

    
1281
void
1282
krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list *eattrs)
1283
{
1284
  int err = 0;
1285

    
1286
  /*
1287
   * We could use NL_OP_REPLACE, but route replace on Linux has some problems:
1288
   *
1289
   * 1) Does not check for matching rtm_protocol
1290
   * 2) Has broken semantics for IPv6 ECMP
1291
   * 3) Crashes some kernel version when used for IPv6 ECMP
1292
   *
1293
   * So we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the old
1294
   * route value, so we do not try to optimize IPv6 ECMP reconfigurations.
1295
   */
1296

    
1297
  if (old)
1298
    nl_delete_rte(p, old, eattrs);
1299

    
1300
  if (new)
1301
    err = nl_add_rte(p, new, eattrs);
1302

    
1303
  if (err < 0)
1304
    n->n.flags |= KRF_SYNC_ERROR;
1305
  else
1306
    n->n.flags &= ~KRF_SYNC_ERROR;
1307
}
1308

    
1309

    
1310
static inline struct nexthop *
1311
nl_alloc_nexthop(struct nl_parse_state *s, ip_addr gw, struct iface *iface, byte weight)
1312
{
1313
  struct nexthop *nh = lp_alloc(s->pool, sizeof(struct nexthop));
1314

    
1315
  nh->gw = gw;
1316
  nh->iface = iface;
1317
  nh->next = NULL;
1318
  nh->weight = weight;
1319

    
1320
  return nh;
1321
}
1322

    
1323
static int
1324
nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type)
1325
{
1326
  /* Route merging must be active */
1327
  if (!s->merge)
1328
    return 0;
1329

    
1330
  /* Saved and new route must have same network, proto/table, and priority */
1331
  if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority))
1332
    return 0;
1333

    
1334
  /* Both must be regular unicast routes */
1335
  if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST))
1336
    return 0;
1337

    
1338
  return 1;
1339
}
1340

    
1341
static void
1342
nl_announce_route(struct nl_parse_state *s)
1343
{
1344
  rte *e = rte_get_temp(s->attrs);
1345
  e->net = s->net;
1346
  e->u.krt.src = s->krt_src;
1347
  e->u.krt.proto = s->krt_proto;
1348
  e->u.krt.seen = 0;
1349
  e->u.krt.best = 0;
1350
  e->u.krt.metric = s->krt_metric;
1351

    
1352
  if (s->scan)
1353
    krt_got_route(s->proto, e);
1354
  else
1355
    krt_got_route_async(s->proto, e, s->new);
1356

    
1357
  s->net = NULL;
1358
  s->attrs = NULL;
1359
  s->proto = NULL;
1360
  lp_flush(s->pool);
1361
}
1362

    
1363
static inline void
1364
nl_parse_begin(struct nl_parse_state *s, int scan, int merge)
1365
{
1366
  memset(s, 0, sizeof (struct nl_parse_state));
1367
  s->pool = nl_linpool;
1368
  s->scan = scan;
1369
  s->merge = merge;
1370
}
1371

    
1372
static inline void
1373
nl_parse_end(struct nl_parse_state *s)
1374
{
1375
  if (s->net)
1376
    nl_announce_route(s);
1377
}
1378

    
1379

    
1380
#define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0)
1381

    
1382
static void
1383
nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
1384
{
1385
  struct krt_proto *p;
1386
  struct rtmsg *i;
1387
  struct rtattr *a[BIRD_RTA_MAX];
1388
  int new = h->nlmsg_type == RTM_NEWROUTE;
1389

    
1390
  net_addr dst;
1391
  u32 oif = ~0;
1392
  u32 table_id;
1393
  u32 priority = 0;
1394
  u32 def_scope = RT_SCOPE_UNIVERSE;
1395
  int src;
1396

    
1397
  if (!(i = nl_checkin(h, sizeof(*i))))
1398
    return;
1399

    
1400
  switch (i->rtm_family)
1401
    {
1402
    case AF_INET:
1403
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want4, a, sizeof(a)))
1404
        return;
1405

    
1406
      if (a[RTA_DST])
1407
        net_fill_ip4(&dst, rta_get_ip4(a[RTA_DST]), i->rtm_dst_len);
1408
      else
1409
        net_fill_ip4(&dst, IP4_NONE, 0);
1410
      break;
1411

    
1412
    case AF_INET6:
1413
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want6, a, sizeof(a)))
1414
        return;
1415

    
1416
      if (a[RTA_DST])
1417
        net_fill_ip6(&dst, rta_get_ip6(a[RTA_DST]), i->rtm_dst_len);
1418
      else
1419
        net_fill_ip6(&dst, IP6_NONE, 0);
1420
      break;
1421

    
1422
    case AF_MPLS:
1423
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want_mpls, a, sizeof(a)))
1424
        return;
1425

    
1426
      if (a[RTA_DST])
1427
        if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) == 1)
1428
          net_fill_mpls(&dst, rta_mpls_stack[0]);
1429
        else
1430
          log(L_WARN "KRT: Got multi-label MPLS RTA_DST");
1431
      else
1432
        return; /* No support for MPLS routes without RTA_DST */
1433
      break;
1434

    
1435
    default:
1436
      return;
1437
    }
1438

    
1439
  if (a[RTA_OIF])
1440
    oif = rta_get_u32(a[RTA_OIF]);
1441

    
1442
  if (a[RTA_TABLE])
1443
    table_id = rta_get_u32(a[RTA_TABLE]);
1444
  else
1445
    table_id = i->rtm_table;
1446

    
1447
  /* Do we know this table? */
1448
  p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id);
1449
  if (!p)
1450
    SKIP("unknown table %d\n", table);
1451

    
1452
  if (a[RTA_IIF])
1453
    SKIP("IIF set\n");
1454

    
1455
  if (i->rtm_tos != 0)                        /* We don't support TOS */
1456
    SKIP("TOS %02x\n", i->rtm_tos);
1457

    
1458
  if (s->scan && !new)
1459
    SKIP("RTM_DELROUTE in scan\n");
1460

    
1461
  if (a[RTA_PRIORITY])
1462
    priority = rta_get_u32(a[RTA_PRIORITY]);
1463

    
1464
  int c = net_classify(&dst);
1465
  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
1466
    SKIP("strange class/scope\n");
1467

    
1468
  switch (i->rtm_protocol)
1469
    {
1470
    case RTPROT_UNSPEC:
1471
      SKIP("proto unspec\n");
1472

    
1473
    case RTPROT_REDIRECT:
1474
      src = KRT_SRC_REDIRECT;
1475
      break;
1476

    
1477
    case RTPROT_KERNEL:
1478
      src = KRT_SRC_KERNEL;
1479
      return;
1480

    
1481
    case RTPROT_BIRD:
1482
      if (!s->scan)
1483
        SKIP("echo\n");
1484
      src = KRT_SRC_BIRD;
1485
      break;
1486

    
1487
    case RTPROT_BOOT:
1488
    default:
1489
      src = KRT_SRC_ALIEN;
1490
    }
1491

    
1492
  net *net = net_get(p->p.main_channel->table, &dst);
1493

    
1494
  if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type))
1495
    nl_announce_route(s);
1496

    
1497
  rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
1498
  ra->src = p->p.main_source;
1499
  ra->source = RTS_INHERIT;
1500
  ra->scope = SCOPE_UNIVERSE;
1501

    
1502
  switch (i->rtm_type)
1503
    {
1504
    case RTN_UNICAST:
1505

    
1506
      if (a[RTA_MULTIPATH] && (i->rtm_family == AF_INET))
1507
        {
1508
          struct nexthop *nh = nl_parse_multipath(p, a[RTA_MULTIPATH]);
1509
          if (!nh)
1510
            {
1511
              log(L_ERR "KRT: Received strange multipath route %N", net->n.addr);
1512
              return;
1513
            }
1514

    
1515
          nexthop_link(ra, nh);
1516
          break;
1517
        }
1518

    
1519
      ra->nh.iface = if_find_by_index(oif);
1520
      if (!ra->nh.iface)
1521
        {
1522
          log(L_ERR "KRT: Received route %N with unknown ifindex %u", net->n.addr, oif);
1523
          return;
1524
        }
1525

    
1526
      if ((i->rtm_family != AF_MPLS) && a[RTA_GATEWAY] || (i->rtm_family == AF_MPLS) && a[RTA_VIA])
1527
        {
1528
          if (i->rtm_family == AF_MPLS)
1529
            ra->nh.gw = rta_get_via(a[RTA_VIA]);
1530
          else
1531
            ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]);
1532

    
1533
          /* Silently skip strange 6to4 routes */
1534
          const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96);
1535
          if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit))
1536
            return;
1537

    
1538
          neighbor *nbr;
1539
          nbr = neigh_find2(&p->p, &(ra->nh.gw), ra->nh.iface,
1540
                            (i->rtm_flags & RTNH_F_ONLINK) ? NEF_ONLINK : 0);
1541
          if (!nbr || (nbr->scope == SCOPE_HOST))
1542
            {
1543
              log(L_ERR "KRT: Received route %N with strange next-hop %I", net->n.addr,
1544
                  ra->nh.gw);
1545
              return;
1546
            }
1547
        }
1548

    
1549
      break;
1550
    case RTN_BLACKHOLE:
1551
      ra->dest = RTD_BLACKHOLE;
1552
      break;
1553
    case RTN_UNREACHABLE:
1554
      ra->dest = RTD_UNREACHABLE;
1555
      break;
1556
    case RTN_PROHIBIT:
1557
      ra->dest = RTD_PROHIBIT;
1558
      break;
1559
    /* FIXME: What about RTN_THROW? */
1560
    default:
1561
      SKIP("type %d\n", i->rtm_type);
1562
      return;
1563
    }
1564

    
1565
  int labels = 0;
1566
  if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next)
1567
    labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label);
1568

    
1569
  if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next)
1570
    {
1571
      switch (rta_get_u16(a[RTA_ENCAP_TYPE]))
1572
        {
1573
          case LWTUNNEL_ENCAP_MPLS:
1574
            {
1575
              struct rtattr *enca[BIRD_RTA_MAX];
1576
              nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
1577
              nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
1578
              labels = rta_get_mpls(enca[RTA_DST], ra->nh.label);
1579
              break;
1580
            }
1581
          default:
1582
            SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE]));
1583
            break;
1584
        }
1585
    }
1586

    
1587
  if (labels < 0)
1588
  {
1589
    log(L_WARN "KRT: Too long MPLS stack received, ignoring.");
1590
    ra->nh.labels = 0;
1591
  }
1592
  else
1593
    ra->nh.labels = labels;
1594

    
1595
  rte *e = rte_get_temp(ra);
1596
  e->net = net;
1597
  e->u.krt.src = src;
1598
  e->u.krt.proto = i->rtm_protocol;
1599
  e->u.krt.seen = 0;
1600
  e->u.krt.best = 0;
1601
  e->u.krt.metric = 0;
1602

    
1603
  if (i->rtm_scope != def_scope)
1604
    {
1605
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1606
      ea->next = ra->eattrs;
1607
      ra->eattrs = ea;
1608
      ea->flags = EALF_SORTED;
1609
      ea->count = 1;
1610
      ea->attrs[0].id = EA_KRT_SCOPE;
1611
      ea->attrs[0].flags = 0;
1612
      ea->attrs[0].type = EAF_TYPE_INT;
1613
      ea->attrs[0].u.data = i->rtm_scope;
1614
    }
1615

    
1616
  if (a[RTA_PRIORITY])
1617
    e->u.krt.metric = rta_get_u32(a[RTA_PRIORITY]);
1618

    
1619
  if (a[RTA_PREFSRC])
1620
    {
1621
      ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
1622

    
1623
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1624
      ea->next = ra->eattrs;
1625
      ra->eattrs = ea;
1626
      ea->flags = EALF_SORTED;
1627
      ea->count = 1;
1628
      ea->attrs[0].id = EA_KRT_PREFSRC;
1629
      ea->attrs[0].flags = 0;
1630
      ea->attrs[0].type = EAF_TYPE_IP_ADDRESS;
1631
      ea->attrs[0].u.ptr = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps));
1632
      ea->attrs[0].u.ptr->length = sizeof(ps);
1633
      memcpy(ea->attrs[0].u.ptr->data, &ps, sizeof(ps));
1634
    }
1635

    
1636
  if (a[RTA_FLOW])
1637
    {
1638
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1639
      ea->next = ra->eattrs;
1640
      ra->eattrs = ea;
1641
      ea->flags = EALF_SORTED;
1642
      ea->count = 1;
1643
      ea->attrs[0].id = EA_KRT_REALM;
1644
      ea->attrs[0].flags = 0;
1645
      ea->attrs[0].type = EAF_TYPE_INT;
1646
      ea->attrs[0].u.data = rta_get_u32(a[RTA_FLOW]);
1647
    }
1648

    
1649
  if (a[RTA_METRICS])
1650
    {
1651
      u32 metrics[KRT_METRICS_MAX];
1652
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
1653
      int t, n = 0;
1654

    
1655
      if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0)
1656
        {
1657
          log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net->n.addr);
1658
          return;
1659
        }
1660

    
1661
      for (t = 1; t < KRT_METRICS_MAX; t++)
1662
        if (metrics[0] & (1 << t))
1663
          {
1664
            ea->attrs[n].id = EA_CODE(EAP_KRT, KRT_METRICS_OFFSET + t);
1665
            ea->attrs[n].flags = 0;
1666
            ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */
1667
            ea->attrs[n].u.data = metrics[t];
1668
            n++;
1669
          }
1670

    
1671
      if (n > 0)
1672
        {
1673
          ea->next = ra->eattrs;
1674
          ea->flags = EALF_SORTED;
1675
          ea->count = n;
1676
          ra->eattrs = ea;
1677
        }
1678
    }
1679

    
1680
  /*
1681
   * Ideally, now we would send the received route to the rest of kernel code.
1682
   * But IPv6 ECMP routes are sent as a sequence of routes, so we postpone it
1683
   * and merge next hops until the end of the sequence.
1684
   */
1685

    
1686
  if (!s->net)
1687
  {
1688
    /* Store the new route */
1689
    s->net = net;
1690
    s->attrs = ra;
1691
    s->proto = p;
1692
    s->new = new;
1693
    s->krt_src = src;
1694
    s->krt_type = i->rtm_type;
1695
    s->krt_proto = i->rtm_protocol;
1696
    s->krt_metric = priority;
1697
  }
1698
  else
1699
  {
1700
    /* Merge next hops with the stored route */
1701
    rta *a = s->attrs;
1702

    
1703
    nexthop_insert(&a->nh, &ra->nh);
1704
  }
1705
}
1706

    
1707
void
1708
krt_do_scan(struct krt_proto *p UNUSED)        /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
1709
{
1710
  struct nlmsghdr *h;
1711
  struct nl_parse_state s;
1712

    
1713
  nl_parse_begin(&s, 1, 0);
1714
  nl_request_dump(AF_INET, RTM_GETROUTE);
1715
  while (h = nl_get_scan())
1716
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1717
      nl_parse_route(&s, h);
1718
    else
1719
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1720
  nl_parse_end(&s);
1721

    
1722
  nl_parse_begin(&s, 1, 1);
1723
  nl_request_dump(AF_INET6, RTM_GETROUTE);
1724
  while (h = nl_get_scan())
1725
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1726
      nl_parse_route(&s, h);
1727
    else
1728
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1729
  nl_parse_end(&s);
1730

    
1731
  nl_parse_begin(&s, 1, 1);
1732
  nl_request_dump(AF_MPLS, RTM_GETROUTE);
1733
  while (h = nl_get_scan())
1734
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1735
      nl_parse_route(&s, h);
1736
    else
1737
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1738
  nl_parse_end(&s);
1739
}
1740

    
1741
/*
1742
 *        Asynchronous Netlink interface
1743
 */
1744

    
1745
static sock *nl_async_sk;                /* BIRD socket for asynchronous notifications */
1746
static byte *nl_async_rx_buffer;        /* Receive buffer */
1747

    
1748
static void
1749
nl_async_msg(struct nlmsghdr *h)
1750
{
1751
  struct nl_parse_state s;
1752

    
1753
  switch (h->nlmsg_type)
1754
    {
1755
    case RTM_NEWROUTE:
1756
    case RTM_DELROUTE:
1757
      DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
1758
      nl_parse_begin(&s, 0, 0);
1759
      nl_parse_route(&s, h);
1760
      nl_parse_end(&s);
1761
      break;
1762
    case RTM_NEWLINK:
1763
    case RTM_DELLINK:
1764
      DBG("KRT: Received async link notification (%d)\n", h->nlmsg_type);
1765
      if (kif_proto)
1766
        nl_parse_link(h, 0);
1767
      break;
1768
    case RTM_NEWADDR:
1769
    case RTM_DELADDR:
1770
      DBG("KRT: Received async address notification (%d)\n", h->nlmsg_type);
1771
      if (kif_proto)
1772
        nl_parse_addr(h, 0);
1773
      break;
1774
    default:
1775
      DBG("KRT: Received unknown async notification (%d)\n", h->nlmsg_type);
1776
    }
1777
}
1778

    
1779
static int
1780
nl_async_hook(sock *sk, uint size UNUSED)
1781
{
1782
  struct iovec iov = { nl_async_rx_buffer, NL_RX_SIZE };
1783
  struct sockaddr_nl sa;
1784
  struct msghdr m = {
1785
    .msg_name = &sa,
1786
    .msg_namelen = sizeof(sa),
1787
    .msg_iov = &iov,
1788
    .msg_iovlen = 1,
1789
  };
1790
  struct nlmsghdr *h;
1791
  int x;
1792
  uint len;
1793

    
1794
  x = recvmsg(sk->fd, &m, 0);
1795
  if (x < 0)
1796
    {
1797
      if (errno == ENOBUFS)
1798
        {
1799
          /*
1800
           *  Netlink reports some packets have been thrown away.
1801
           *  One day we might react to it by asking for route table
1802
           *  scan in near future.
1803
           */
1804
          return 1;        /* More data are likely to be ready */
1805
        }
1806
      else if (errno != EWOULDBLOCK)
1807
        log(L_ERR "Netlink recvmsg: %m");
1808
      return 0;
1809
    }
1810
  if (sa.nl_pid)                /* It isn't from the kernel */
1811
    {
1812
      DBG("Non-kernel packet\n");
1813
      return 1;
1814
    }
1815
  h = (void *) nl_async_rx_buffer;
1816
  len = x;
1817
  if (m.msg_flags & MSG_TRUNC)
1818
    {
1819
      log(L_WARN "Netlink got truncated asynchronous message");
1820
      return 1;
1821
    }
1822
  while (NLMSG_OK(h, len))
1823
    {
1824
      nl_async_msg(h);
1825
      h = NLMSG_NEXT(h, len);
1826
    }
1827
  if (len)
1828
    log(L_WARN "nl_async_hook: Found packet remnant of size %d", len);
1829
  return 1;
1830
}
1831

    
1832
static void
1833
nl_async_err_hook(sock *sk, int e UNUSED)
1834
{
1835
  nl_async_hook(sk, 0);
1836
}
1837

    
1838
static void
1839
nl_open_async(void)
1840
{
1841
  sock *sk;
1842
  struct sockaddr_nl sa;
1843
  int fd;
1844

    
1845
  if (nl_async_sk)
1846
    return;
1847

    
1848
  DBG("KRT: Opening async netlink socket\n");
1849

    
1850
  fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
1851
  if (fd < 0)
1852
    {
1853
      log(L_ERR "Unable to open asynchronous rtnetlink socket: %m");
1854
      return;
1855
    }
1856

    
1857
  bzero(&sa, sizeof(sa));
1858
  sa.nl_family = AF_NETLINK;
1859
  sa.nl_groups = RTMGRP_LINK |
1860
    RTMGRP_IPV4_IFADDR | RTMGRP_IPV4_ROUTE |
1861
    RTMGRP_IPV6_IFADDR | RTMGRP_IPV6_ROUTE;
1862

    
1863
  if (bind(fd, (struct sockaddr *) &sa, sizeof(sa)) < 0)
1864
    {
1865
      log(L_ERR "Unable to bind asynchronous rtnetlink socket: %m");
1866
      close(fd);
1867
      return;
1868
    }
1869

    
1870
  nl_async_rx_buffer = xmalloc(NL_RX_SIZE);
1871

    
1872
  sk = nl_async_sk = sk_new(krt_pool);
1873
  sk->type = SK_MAGIC;
1874
  sk->rx_hook = nl_async_hook;
1875
  sk->err_hook = nl_async_err_hook;
1876
  sk->fd = fd;
1877
  if (sk_open(sk) < 0)
1878
    bug("Netlink: sk_open failed");
1879
}
1880

    
1881

    
1882
/*
1883
 *        Interface to the UNIX krt module
1884
 */
1885

    
1886
void
1887
krt_sys_io_init(void)
1888
{
1889
  nl_linpool = lp_new(krt_pool, 4080);
1890
  HASH_INIT(nl_table_map, krt_pool, 6);
1891
}
1892

    
1893
int
1894
krt_sys_start(struct krt_proto *p)
1895
{
1896
  struct krt_proto *old = HASH_FIND(nl_table_map, RTH, p->af, krt_table_id(p));
1897

    
1898
  if (old)
1899
    {
1900
      log(L_ERR "%s: Kernel table %u already registered by %s",
1901
          p->p.name, krt_table_id(p), old->p.name);
1902
      return 0;
1903
    }
1904

    
1905
  HASH_INSERT2(nl_table_map, RTH, krt_pool, p);
1906

    
1907
  nl_open();
1908
  nl_open_async();
1909

    
1910
  return 1;
1911
}
1912

    
1913
void
1914
krt_sys_shutdown(struct krt_proto *p)
1915
{
1916
  HASH_REMOVE2(nl_table_map, RTH, krt_pool, p);
1917
}
1918

    
1919
int
1920
krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o)
1921
{
1922
  return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric);
1923
}
1924

    
1925
void
1926
krt_sys_init_config(struct krt_config *cf)
1927
{
1928
  cf->sys.table_id = RT_TABLE_MAIN;
1929
  cf->sys.metric = 0;
1930
}
1931

    
1932
void
1933
krt_sys_copy_config(struct krt_config *d, struct krt_config *s)
1934
{
1935
  d->sys.table_id = s->sys.table_id;
1936
  d->sys.metric = s->sys.metric;
1937
}
1938

    
1939
/*
 * Display names of the kernel route metrics, indexed by metric id.
 * Index 0 has no name. The table is used by krt_sys_get_attr().
 */
static const char *krt_metrics_names[KRT_METRICS_MAX] = {
  [0]  = NULL,
  [1]  = "lock",
  [2]  = "mtu",
  [3]  = "window",
  [4]  = "rtt",
  [5]  = "rttvar",
  [6]  = "sstresh",
  [7]  = "cwnd",
  [8]  = "advmss",
  [9]  = "reordering",
  [10] = "hoplimit",
  [11] = "initcwnd",
  [12] = "features",
  [13] = "rto_min",
  [14] = "initrwnd",
  [15] = "quickack",
};
1943

    
1944
/*
 * Display names of the bits of the "features" metric, indexed by bit number.
 * Bits 1 and 2 have no name.
 */
static const char *krt_features_names[KRT_FEATURES_MAX] = {
  [0] = "ecn",
  [1] = NULL,
  [2] = NULL,
  [3] = "allfrag",
};
1947

    
1948
int
1949
krt_sys_get_attr(eattr *a, byte *buf, int buflen UNUSED)
1950
{
1951
  switch (a->id)
1952
  {
1953
  case EA_KRT_PREFSRC:
1954
    bsprintf(buf, "prefsrc");
1955
    return GA_NAME;
1956

    
1957
  case EA_KRT_REALM:
1958
    bsprintf(buf, "realm");
1959
    return GA_NAME;
1960

    
1961
  case EA_KRT_SCOPE:
1962
    bsprintf(buf, "scope");
1963
    return GA_NAME;
1964

    
1965
  case EA_KRT_LOCK:
1966
    buf += bsprintf(buf, "lock:");
1967
    ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
1968
    return GA_FULL;
1969

    
1970
  case EA_KRT_FEATURES:
1971
    buf += bsprintf(buf, "features:");
1972
    ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX);
1973
    return GA_FULL;
1974

    
1975
  default:;
1976
    int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET;
1977
    if (id > 0 && id < KRT_METRICS_MAX)
1978
    {
1979
      bsprintf(buf, "%s", krt_metrics_names[id]);
1980
      return GA_NAME;
1981
    }
1982

    
1983
    return GA_UNKNOWN;
1984
  }
1985
}
1986

    
1987

    
1988

    
1989
void
1990
kif_sys_start(struct kif_proto *p UNUSED)
1991
{
1992
  nl_open();
1993
  nl_open_async();
1994
}
1995

    
1996
void
1997
kif_sys_shutdown(struct kif_proto *p UNUSED)
1998
{
1999
}