Statistics
| Branch: | Revision:

iof-bird-daemon / sysdep / linux / netlink.c @ 62e64905

History | View | Annotate | Download (47.1 KB)

1
/*
2
 *        BIRD -- Linux Netlink Interface
3
 *
4
 *        (c) 1999--2000 Martin Mares <mj@ucw.cz>
5
 *
6
 *        Can be freely distributed and used under the terms of the GNU GPL.
7
 */
8

    
9
#include <alloca.h>
10
#include <stdio.h>
11
#include <unistd.h>
12
#include <fcntl.h>
13
#include <sys/socket.h>
14
#include <sys/uio.h>
15
#include <errno.h>
16

    
17
#undef LOCAL_DEBUG
18

    
19
#include "nest/bird.h"
20
#include "nest/route.h"
21
#include "nest/protocol.h"
22
#include "nest/iface.h"
23
#include "lib/alloca.h"
24
#include "sysdep/unix/timer.h"
25
#include "sysdep/unix/unix.h"
26
#include "sysdep/unix/krt.h"
27
#include "lib/socket.h"
28
#include "lib/string.h"
29
#include "lib/hash.h"
30
#include "conf/conf.h"
31

    
32
#include <asm/types.h>
33
#include <linux/if.h>
34
#include <linux/lwtunnel.h>
35
#include <linux/netlink.h>
36
#include <linux/rtnetlink.h>
37

    
38

    
39
#ifndef MSG_TRUNC                        /* Hack: Several versions of glibc miss this one :( */
40
#define MSG_TRUNC 0x20
41
#endif
42

    
43
#ifndef IFA_FLAGS
44
#define IFA_FLAGS 8
45
#endif
46

    
47
#ifndef IFF_LOWER_UP
48
#define IFF_LOWER_UP 0x10000
49
#endif
50

    
51
#ifndef RTA_TABLE
52
#define RTA_TABLE  15
53
#endif
54

    
55
#ifndef RTA_VIA
56
#define RTA_VIA         18
57
#endif
58

    
59
#ifndef RTA_NEWDST
60
#define RTA_NEWDST  19
61
#endif
62

    
63
#ifndef RTA_ENCAP_TYPE
64
#define RTA_ENCAP_TYPE        21
65
#endif
66

    
67
#ifndef RTA_ENCAP
68
#define RTA_ENCAP  22
69
#endif
70

    
71
#define krt_ecmp6(p) ((p)->af == AF_INET6)
72

    
73
/*
74
 * Structure nl_parse_state keeps state of received route processing. Ideally,
75
 * we could just independently parse received Netlink messages and immediately
76
 * propagate received routes to the rest of BIRD, but Linux kernel represents
77
 * and announces IPv6 ECMP routes not as one route with multiple next hops (like
78
 * RTA_MULTIPATH in IPv4 ECMP), but as a set of routes with the same prefix.
79
 *
80
 * Therefore, BIRD keeps currently processed route in nl_parse_state structure
81
 * and postpones its propagation until we expect it to be final; i.e., when
82
 * non-matching route is received or when the scan ends. When another matching
83
 * route is received, it is merged with the already processed route to form an
84
 * ECMP route. Note that merging is done only for IPv6 (merge == 1), but the
85
 * postponing is done in both cases (for simplicity). All IPv4 routes are just
86
 * considered non-matching.
87
 *
88
 * This is ignored for asynchronous notifications (every notification is handled
89
 * as a separate route). It is not an issue for our routes, as we ignore such
90
 * notifications anyways. But importing alien IPv6 ECMP routes does not work
91
 * properly.
92
 */
93

    
94
struct nl_parse_state
95
{
96
  struct linpool *pool;
97
  int scan;
98
  int merge;
99

    
100
  net *net;
101
  rta *attrs;
102
  struct krt_proto *proto;
103
  s8 new;
104
  s8 krt_src;
105
  u8 krt_type;
106
  u8 krt_proto;
107
  u32 krt_metric;
108
};
109

    
110
/*
111
 *        Synchronous Netlink interface
112
 */
113

    
114
struct nl_sock
115
{
116
  int fd;
117
  u32 seq;
118
  byte *rx_buffer;                        /* Receive buffer */
119
  struct nlmsghdr *last_hdr;                /* Recently received packet */
120
  uint last_size;
121
};
122

    
123
#define NL_RX_SIZE 8192
124

    
125
#define NL_OP_DELETE        0
126
#define NL_OP_ADD        (NLM_F_CREATE|NLM_F_EXCL)
127
#define NL_OP_REPLACE        (NLM_F_CREATE|NLM_F_REPLACE)
128
#define NL_OP_APPEND        (NLM_F_CREATE|NLM_F_APPEND)
129

    
130
static linpool *nl_linpool;
131

    
132
static struct nl_sock nl_scan = {.fd = -1};        /* Netlink socket for synchronous scan */
133
static struct nl_sock nl_req  = {.fd = -1};        /* Netlink socket for requests */
134

    
135
static void
136
nl_open_sock(struct nl_sock *nl)
137
{
138
  if (nl->fd < 0)
139
    {
140
      nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
141
      if (nl->fd < 0)
142
        die("Unable to open rtnetlink socket: %m");
143
      nl->seq = now;
144
      nl->rx_buffer = xmalloc(NL_RX_SIZE);
145
      nl->last_hdr = NULL;
146
      nl->last_size = 0;
147
    }
148
}
149

    
150
static void
151
nl_open(void)
152
{
153
  nl_open_sock(&nl_scan);
154
  nl_open_sock(&nl_req);
155
}
156

    
157
static void
158
nl_send(struct nl_sock *nl, struct nlmsghdr *nh)
159
{
160
  struct sockaddr_nl sa;
161

    
162
  memset(&sa, 0, sizeof(sa));
163
  sa.nl_family = AF_NETLINK;
164
  nh->nlmsg_pid = 0;
165
  nh->nlmsg_seq = ++(nl->seq);
166
  if (sendto(nl->fd, nh, nh->nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa)) < 0)
167
    die("rtnetlink sendto: %m");
168
  nl->last_hdr = NULL;
169
}
170

    
171
static void
172
nl_request_dump(int af, int cmd)
173
{
174
  struct {
175
    struct nlmsghdr nh;
176
    struct rtgenmsg g;
177
  } req = {
178
    .nh.nlmsg_type = cmd,
179
    .nh.nlmsg_len = sizeof(req),
180
    .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
181
    .g.rtgen_family = af
182
  };
183
  nl_send(&nl_scan, &req.nh);
184
}
185

    
186
static struct nlmsghdr *
187
nl_get_reply(struct nl_sock *nl)
188
{
189
  for(;;)
190
    {
191
      if (!nl->last_hdr)
192
        {
193
          struct iovec iov = { nl->rx_buffer, NL_RX_SIZE };
194
          struct sockaddr_nl sa;
195
          struct msghdr m = {
196
            .msg_name = &sa,
197
            .msg_namelen = sizeof(sa),
198
            .msg_iov = &iov,
199
            .msg_iovlen = 1,
200
          };
201
          int x = recvmsg(nl->fd, &m, 0);
202
          if (x < 0)
203
            die("nl_get_reply: %m");
204
          if (sa.nl_pid)                /* It isn't from the kernel */
205
            {
206
              DBG("Non-kernel packet\n");
207
              continue;
208
            }
209
          nl->last_size = x;
210
          nl->last_hdr = (void *) nl->rx_buffer;
211
          if (m.msg_flags & MSG_TRUNC)
212
            bug("nl_get_reply: got truncated reply which should be impossible");
213
        }
214
      if (NLMSG_OK(nl->last_hdr, nl->last_size))
215
        {
216
          struct nlmsghdr *h = nl->last_hdr;
217
          nl->last_hdr = NLMSG_NEXT(h, nl->last_size);
218
          if (h->nlmsg_seq != nl->seq)
219
            {
220
              log(L_WARN "nl_get_reply: Ignoring out of sequence netlink packet (%x != %x)",
221
                  h->nlmsg_seq, nl->seq);
222
              continue;
223
            }
224
          return h;
225
        }
226
      if (nl->last_size)
227
        log(L_WARN "nl_get_reply: Found packet remnant of size %d", nl->last_size);
228
      nl->last_hdr = NULL;
229
    }
230
}
231

    
232
static struct tbf rl_netlink_err = TBF_DEFAULT_LOG_LIMITS;
233

    
234
static int
235
nl_error(struct nlmsghdr *h, int ignore_esrch)
236
{
237
  struct nlmsgerr *e;
238
  int ec;
239

    
240
  if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr)))
241
    {
242
      log(L_WARN "Netlink: Truncated error message received");
243
      return ENOBUFS;
244
    }
245
  e = (struct nlmsgerr *) NLMSG_DATA(h);
246
  ec = -e->error;
247
  if (ec && !(ignore_esrch && (ec == ESRCH)))
248
    log_rl(&rl_netlink_err, L_WARN "Netlink: %s", strerror(ec));
249
  return ec;
250
}
251

    
252
static struct nlmsghdr *
253
nl_get_scan(void)
254
{
255
  struct nlmsghdr *h = nl_get_reply(&nl_scan);
256

    
257
  if (h->nlmsg_type == NLMSG_DONE)
258
    return NULL;
259
  if (h->nlmsg_type == NLMSG_ERROR)
260
    {
261
      nl_error(h, 0);
262
      return NULL;
263
    }
264
  return h;
265
}
266

    
267
static int
268
nl_exchange(struct nlmsghdr *pkt, int ignore_esrch)
269
{
270
  struct nlmsghdr *h;
271

    
272
  nl_send(&nl_req, pkt);
273
  for(;;)
274
    {
275
      h = nl_get_reply(&nl_req);
276
      if (h->nlmsg_type == NLMSG_ERROR)
277
        break;
278
      log(L_WARN "nl_exchange: Unexpected reply received");
279
    }
280
  return nl_error(h, ignore_esrch) ? -1 : 0;
281
}
282

    
283
/*
284
 *        Netlink attributes
285
 */
286

    
287
static int nl_attr_len;
288

    
289
static void *
290
nl_checkin(struct nlmsghdr *h, int lsize)
291
{
292
  nl_attr_len = h->nlmsg_len - NLMSG_LENGTH(lsize);
293
  if (nl_attr_len < 0)
294
    {
295
      log(L_ERR "nl_checkin: underrun by %d bytes", -nl_attr_len);
296
      return NULL;
297
    }
298
  return NLMSG_DATA(h);
299
}
300

    
301
struct nl_want_attrs {
302
  u8 defined:1;
303
  u8 checksize:1;
304
  u8 size;
305
};
306

    
307

    
308
#define BIRD_IFLA_MAX (IFLA_WIRELESS+1)
309

    
310
static struct nl_want_attrs ifla_attr_want[BIRD_IFLA_MAX] = {
311
  [IFLA_IFNAME]          = { 1, 0, 0 },
312
  [IFLA_MTU]          = { 1, 1, sizeof(u32) },
313
  [IFLA_WIRELESS] = { 1, 0, 0 },
314
};
315

    
316

    
317
#define BIRD_IFA_MAX  (IFA_FLAGS+1)
318

    
319
static struct nl_want_attrs ifa_attr_want4[BIRD_IFA_MAX] = {
320
  [IFA_ADDRESS]          = { 1, 1, sizeof(ip4_addr) },
321
  [IFA_LOCAL]          = { 1, 1, sizeof(ip4_addr) },
322
  [IFA_BROADCAST] = { 1, 1, sizeof(ip4_addr) },
323
  [IFA_FLAGS]     = { 1, 1, sizeof(u32) },
324
};
325

    
326
static struct nl_want_attrs ifa_attr_want6[BIRD_IFA_MAX] = {
327
  [IFA_ADDRESS]          = { 1, 1, sizeof(ip6_addr) },
328
  [IFA_LOCAL]          = { 1, 1, sizeof(ip6_addr) },
329
  [IFA_FLAGS]          = { 1, 1, sizeof(u32) },
330
};
331

    
332

    
333
#define BIRD_RTA_MAX  (RTA_ENCAP+1)
334

    
335
static struct nl_want_attrs nexthop_attr_want4[BIRD_RTA_MAX] = {
336
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip4_addr) },
337
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
338
  [RTA_ENCAP]          = { 1, 0, 0 },
339
};
340

    
341
/* Attributes accepted inside an MPLS RTA_ENCAP nest */
static struct nl_want_attrs encap_mpls_want[BIRD_RTA_MAX] = {
  [RTA_DST] = { .defined = 1 },
};
344

    
345
static struct nl_want_attrs rtm_attr_want4[BIRD_RTA_MAX] = {
346
  [RTA_DST]          = { 1, 1, sizeof(ip4_addr) },
347
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
348
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip4_addr) },
349
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
350
  [RTA_PREFSRC]          = { 1, 1, sizeof(ip4_addr) },
351
  [RTA_METRICS]          = { 1, 0, 0 },
352
  [RTA_MULTIPATH] = { 1, 0, 0 },
353
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
354
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
355
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
356
  [RTA_ENCAP]          = { 1, 0, 0 },
357
};
358

    
359
static struct nl_want_attrs rtm_attr_want6[BIRD_RTA_MAX] = {
360
  [RTA_DST]          = { 1, 1, sizeof(ip6_addr) },
361
  [RTA_IIF]          = { 1, 1, sizeof(u32) },
362
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
363
  [RTA_GATEWAY]          = { 1, 1, sizeof(ip6_addr) },
364
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
365
  [RTA_PREFSRC]          = { 1, 1, sizeof(ip6_addr) },
366
  [RTA_METRICS]          = { 1, 0, 0 },
367
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
368
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
369
  [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
370
  [RTA_ENCAP]          = { 1, 0, 0 },
371
};
372

    
373
static struct nl_want_attrs rtm_attr_want_mpls[BIRD_RTA_MAX] = {
374
  [RTA_DST]          = { 1, 1, sizeof(u32) },
375
  [RTA_IIF]          = { 1, 1, sizeof(u32) },
376
  [RTA_OIF]          = { 1, 1, sizeof(u32) },
377
  [RTA_PRIORITY]  = { 1, 1, sizeof(u32) },
378
  [RTA_METRICS]          = { 1, 0, 0 },
379
  [RTA_FLOW]          = { 1, 1, sizeof(u32) },
380
  [RTA_TABLE]          = { 1, 1, sizeof(u32) },
381
  [RTA_VIA]          = { 1, 0, 0 },
382
  [RTA_NEWDST]          = { 1, 0, 0 },
383
};
384

    
385

    
386
static int
387
nl_parse_attrs(struct rtattr *a, struct nl_want_attrs *want, struct rtattr **k, int ksize)
388
{
389
  int max = ksize / sizeof(struct rtattr *);
390
  bzero(k, ksize);
391

    
392
  for ( ; RTA_OK(a, nl_attr_len); a = RTA_NEXT(a, nl_attr_len))
393
    {
394
      if ((a->rta_type >= max) || !want[a->rta_type].defined)
395
        continue;
396

    
397
      if (want[a->rta_type].checksize && (RTA_PAYLOAD(a) != want[a->rta_type].size))
398
        {
399
          log(L_ERR "nl_parse_attrs: Malformed attribute received");
400
          return 0;
401
        }
402

    
403
      k[a->rta_type] = a;
404
    }
405

    
406
  if (nl_attr_len)
407
    {
408
      log(L_ERR "nl_parse_attrs: remnant of size %d", nl_attr_len);
409
      return 0;
410
    }
411

    
412
  return 1;
413
}
414

    
415
static inline u16 rta_get_u16(struct rtattr *a)
416
{ return *(u16 *) RTA_DATA(a); }
417

    
418
static inline u32 rta_get_u32(struct rtattr *a)
419
{ return *(u32 *) RTA_DATA(a); }
420

    
421
static inline ip4_addr rta_get_ip4(struct rtattr *a)
422
{ return ip4_ntoh(*(ip4_addr *) RTA_DATA(a)); }
423

    
424
static inline ip6_addr rta_get_ip6(struct rtattr *a)
425
{ return ip6_ntoh(*(ip6_addr *) RTA_DATA(a)); }
426

    
427
static inline ip_addr rta_get_ipa(struct rtattr *a)
428
{
429
  if (RTA_PAYLOAD(a) == sizeof(ip4_addr))
430
    return ipa_from_ip4(rta_get_ip4(a));
431
  else
432
    return ipa_from_ip6(rta_get_ip6(a));
433
}
434

    
435
static inline ip_addr rta_get_via(struct rtattr *a)
436
{
437
  struct rtvia *v = RTA_DATA(a);
438
  switch(v->rtvia_family) {
439
    case AF_INET:  return ipa_from_ip4(ip4_ntoh(*(ip4_addr *) v->rtvia_addr));
440
    case AF_INET6: return ipa_from_ip6(ip6_ntoh(*(ip6_addr *) v->rtvia_addr));
441
  }
442
  return IPA_NONE;
443
}
444

    
445
static u32 rta_mpls_stack[MPLS_MAX_LABEL_STACK];

/*
 * Decode the MPLS label stack carried by attribute @a into @stack using
 * mpls_get() and return its result. A payload length that is not a multiple
 * of four is reported and rounded down.
 */
static inline int rta_get_mpls(struct rtattr *a, u32 *stack)
{
  int bytes = RTA_PAYLOAD(a);

  if (bytes % 4)
    log(L_WARN "KRT: Strange length of received MPLS stack: %u", bytes);

  return mpls_get(RTA_DATA(a), bytes & ~0x3, stack);
}
453

    
454
struct rtattr *
455
nl_add_attr(struct nlmsghdr *h, uint bufsize, uint code, const void *data, uint dlen)
456
{
457
  uint pos = NLMSG_ALIGN(h->nlmsg_len);
458
  uint len = RTA_LENGTH(dlen);
459

    
460
  if (pos + len > bufsize)
461
    bug("nl_add_attr: packet buffer overflow");
462

    
463
  struct rtattr *a = (struct rtattr *)((char *)h + pos);
464
  a->rta_type = code;
465
  a->rta_len = len;
466
  h->nlmsg_len = pos + len;
467

    
468
  if (dlen > 0)
469
    memcpy(RTA_DATA(a), data, dlen);
470

    
471
  return a;
472
}
473

    
474
static inline struct rtattr *
475
nl_open_attr(struct nlmsghdr *h, uint bufsize, uint code)
476
{
477
  return nl_add_attr(h, bufsize, code, NULL, 0);
478
}
479

    
480
static inline void
481
nl_close_attr(struct nlmsghdr *h, struct rtattr *a)
482
{
483
  a->rta_len = (void *)h + NLMSG_ALIGN(h->nlmsg_len) - (void *)a;
484
}
485

    
486
static inline void
487
nl_add_attr_u16(struct nlmsghdr *h, uint bufsize, int code, u16 data)
488
{
489
  nl_add_attr(h, bufsize, code, &data, 2);
490
}
491

    
492
static inline void
493
nl_add_attr_u32(struct nlmsghdr *h, uint bufsize, int code, u32 data)
494
{
495
  nl_add_attr(h, bufsize, code, &data, 4);
496
}
497

    
498
static inline void
499
nl_add_attr_ip4(struct nlmsghdr *h, uint bufsize, int code, ip4_addr ip4)
500
{
501
  ip4 = ip4_hton(ip4);
502
  nl_add_attr(h, bufsize, code, &ip4, sizeof(ip4));
503
}
504

    
505
static inline void
506
nl_add_attr_ip6(struct nlmsghdr *h, uint bufsize, int code, ip6_addr ip6)
507
{
508
  ip6 = ip6_hton(ip6);
509
  nl_add_attr(h, bufsize, code, &ip6, sizeof(ip6));
510
}
511

    
512
static inline void
513
nl_add_attr_ipa(struct nlmsghdr *h, uint bufsize, int code, ip_addr ipa)
514
{
515
  if (ipa_is_ip4(ipa))
516
    nl_add_attr_ip4(h, bufsize, code, ipa_to_ip4(ipa));
517
  else
518
    nl_add_attr_ip6(h, bufsize, code, ipa_to_ip6(ipa));
519
}
520

    
521
static inline void
522
nl_add_attr_mpls(struct nlmsghdr *h, uint bufsize, int code, int len, u32 *stack)
523
{
524
  char buf[len*4];
525
  mpls_put(buf, len, stack);
526
  nl_add_attr(h, bufsize, code, buf, len*4);
527
}
528

    
529
static inline void
530
nl_add_attr_mpls_encap(struct nlmsghdr *h, uint bufsize, int len, u32 *stack)
531
{
532
  nl_add_attr_u16(h, bufsize, RTA_ENCAP_TYPE, LWTUNNEL_ENCAP_MPLS);
533

    
534
  struct rtattr *nest = nl_open_attr(h, bufsize, RTA_ENCAP);
535
  nl_add_attr_mpls(h, bufsize, RTA_DST, len, stack);
536
  nl_close_attr(h, nest);
537
}
538

    
539
static inline void
540
nl_add_attr_via(struct nlmsghdr *h, uint bufsize, ip_addr ipa)
541
{
542
  struct rtattr *nest = nl_open_attr(h, bufsize, RTA_VIA);
543
  struct rtvia *via = RTA_DATA(nest);
544

    
545
  h->nlmsg_len += sizeof(*via);
546

    
547
  if (ipa_is_ip4(ipa))
548
  {
549
    via->rtvia_family = AF_INET;
550
    put_ip4(via->rtvia_addr, ipa_to_ip4(ipa));
551
    h->nlmsg_len += sizeof(ip4_addr);
552
  }
553
  else
554
  {
555
    via->rtvia_family = AF_INET6;
556
    put_ip6(via->rtvia_addr, ipa_to_ip6(ipa));
557
    h->nlmsg_len += sizeof(ip6_addr);
558
  }
559

    
560
  nl_close_attr(h, nest);
561
}
562

    
563
static inline struct rtnexthop *
564
nl_open_nexthop(struct nlmsghdr *h, uint bufsize)
565
{
566
  uint pos = NLMSG_ALIGN(h->nlmsg_len);
567
  uint len = RTNH_LENGTH(0);
568

    
569
  if (pos + len > bufsize)
570
    bug("nl_open_nexthop: packet buffer overflow");
571

    
572
  h->nlmsg_len = pos + len;
573

    
574
  return (void *)h + pos;
575
}
576

    
577
static inline void
578
nl_close_nexthop(struct nlmsghdr *h, struct rtnexthop *nh)
579
{
580
  nh->rtnh_len = (void *)h + NLMSG_ALIGN(h->nlmsg_len) - (void *)nh;
581
}
582

    
583
static inline void
584
nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af)
585
{
586
  if (nh->labels > 0)
587
    if (af == AF_MPLS)
588
      nl_add_attr_mpls(h, bufsize, RTA_NEWDST, nh->labels, nh->label);
589
    else
590
      nl_add_attr_mpls_encap(h, bufsize, nh->labels, nh->label);
591

    
592
  if (ipa_nonzero(nh->gw))
593
    if (af == AF_MPLS)
594
      nl_add_attr_via(h, bufsize, nh->gw);
595
    else
596
      nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw);
597
}
598

    
599
static void
600
nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af)
601
{
602
  struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH);
603

    
604
  for (; nh; nh = nh->next)
605
  {
606
    struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize);
607

    
608
    rtnh->rtnh_flags = 0;
609
    rtnh->rtnh_hops = nh->weight;
610
    rtnh->rtnh_ifindex = nh->iface->index;
611

    
612
    nl_add_nexthop(h, bufsize, nh, af);
613

    
614
    nl_close_nexthop(h, rtnh);
615
  }
616

    
617
  nl_close_attr(h, a);
618
}
619

    
620
static struct nexthop *
621
nl_parse_multipath(struct krt_proto *p, struct rtattr *ra)
622
{
623
  /* Temporary buffer for multicast nexthops */
624
  static struct nexthop *nh_buffer;
625
  static int nh_buf_size;        /* in number of structures */
626
  static int nh_buf_used;
627

    
628
  struct rtattr *a[BIRD_RTA_MAX];
629
  struct rtnexthop *nh = RTA_DATA(ra);
630
  struct nexthop *rv, *first, **last;
631
  unsigned len = RTA_PAYLOAD(ra);
632

    
633
  first = NULL;
634
  last = &first;
635
  nh_buf_used = 0;
636

    
637
  while (len)
638
    {
639
      /* Use RTNH_OK(nh,len) ?? */
640
      if ((len < sizeof(*nh)) || (len < nh->rtnh_len))
641
        return NULL;
642

    
643
      if (nh_buf_used == nh_buf_size)
644
      {
645
        nh_buf_size = nh_buf_size ? (nh_buf_size * 2) : 4;
646
        nh_buffer = xrealloc(nh_buffer, nh_buf_size * NEXTHOP_MAX_SIZE);
647
      }
648
      *last = rv = nh_buffer + nh_buf_used++;
649
      rv->next = NULL;
650
      last = &(rv->next);
651

    
652
      rv->weight = nh->rtnh_hops;
653
      rv->iface = if_find_by_index(nh->rtnh_ifindex);
654
      if (!rv->iface)
655
        return NULL;
656

    
657
      /* Nonexistent RTNH_PAYLOAD ?? */
658
      nl_attr_len = nh->rtnh_len - RTNH_LENGTH(0);
659
      nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want4, a, sizeof(a));
660
      if (a[RTA_GATEWAY])
661
        {
662
          rv->gw = rta_get_ipa(a[RTA_GATEWAY]);
663

    
664
          neighbor *nbr;
665
          nbr = neigh_find2(&p->p, &rv->gw, rv->iface,
666
                            (nh->rtnh_flags & RTNH_F_ONLINK) ? NEF_ONLINK : 0);
667
          if (!nbr || (nbr->scope == SCOPE_HOST))
668
            return NULL;
669
        }
670
      else
671
        rv->gw = IPA_NONE;
672

    
673
      if (a[RTA_ENCAP_TYPE])
674
        {
675
          if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS) {
676
            log(L_WARN "KRT: Unknown encapsulation method %d in multipath", rta_get_u16(a[RTA_ENCAP_TYPE]));
677
            return NULL;
678
          }
679

    
680
          struct rtattr *enca[BIRD_RTA_MAX];
681
          nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
682
          nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
683
          rv->labels = rta_get_mpls(enca[RTA_DST], rv->label);
684
          break;
685
        }
686

    
687

    
688
      len -= NLMSG_ALIGN(nh->rtnh_len);
689
      nh = RTNH_NEXT(nh);
690
    }
691

    
692
  return first;
693
}
694

    
695
static void
696
nl_add_metrics(struct nlmsghdr *h, uint bufsize, u32 *metrics, int max)
697
{
698
  struct rtattr *a = nl_open_attr(h, bufsize, RTA_METRICS);
699
  int t;
700

    
701
  for (t = 1; t < max; t++)
702
    if (metrics[0] & (1 << t))
703
      nl_add_attr_u32(h, bufsize, t, metrics[t]);
704

    
705
  nl_close_attr(h, a);
706
}
707

    
708
static int
709
nl_parse_metrics(struct rtattr *hdr, u32 *metrics, int max)
710
{
711
  struct rtattr *a = RTA_DATA(hdr);
712
  int len = RTA_PAYLOAD(hdr);
713

    
714
  metrics[0] = 0;
715
  for (; RTA_OK(a, len); a = RTA_NEXT(a, len))
716
  {
717
    if (a->rta_type == RTA_UNSPEC)
718
      continue;
719

    
720
    if (a->rta_type >= max)
721
      continue;
722

    
723
    if (RTA_PAYLOAD(a) != 4)
724
      return -1;
725

    
726
    metrics[0] |= 1 << a->rta_type;
727
    metrics[a->rta_type] = rta_get_u32(a);
728
  }
729

    
730
  if (len > 0)
731
    return -1;
732

    
733
  return 0;
734
}
735

    
736

    
737
/*
738
 *        Scanning of interfaces
739
 */
740

    
741
static void
742
nl_parse_link(struct nlmsghdr *h, int scan)
743
{
744
  struct ifinfomsg *i;
745
  struct rtattr *a[BIRD_IFLA_MAX];
746
  int new = h->nlmsg_type == RTM_NEWLINK;
747
  struct iface f = {};
748
  struct iface *ifi;
749
  char *name;
750
  u32 mtu;
751
  uint fl;
752

    
753
  if (!(i = nl_checkin(h, sizeof(*i))) || !nl_parse_attrs(IFLA_RTA(i), ifla_attr_want, a, sizeof(a)))
754
    return;
755
  if (!a[IFLA_IFNAME] || (RTA_PAYLOAD(a[IFLA_IFNAME]) < 2) || !a[IFLA_MTU])
756
    {
757
      /*
758
       * IFLA_IFNAME and IFLA_MTU are required, in fact, but there may also come
759
       * a message with IFLA_WIRELESS set, where (e.g.) no IFLA_IFNAME exists.
760
       * We simply ignore all such messages with IFLA_WIRELESS without notice.
761
       */
762

    
763
      if (a[IFLA_WIRELESS])
764
        return;
765

    
766
      log(L_ERR "KIF: Malformed message received");
767
      return;
768
    }
769

    
770
  name = RTA_DATA(a[IFLA_IFNAME]);
771
  mtu = rta_get_u32(a[IFLA_MTU]);
772

    
773
  ifi = if_find_by_index(i->ifi_index);
774
  if (!new)
775
    {
776
      DBG("KIF: IF%d(%s) goes down\n", i->ifi_index, name);
777
      if (!ifi)
778
        return;
779

    
780
      if_delete(ifi);
781
    }
782
  else
783
    {
784
      DBG("KIF: IF%d(%s) goes up (mtu=%d,flg=%x)\n", i->ifi_index, name, mtu, i->ifi_flags);
785
      if (ifi && strncmp(ifi->name, name, sizeof(ifi->name)-1))
786
        if_delete(ifi);
787

    
788
      strncpy(f.name, name, sizeof(f.name)-1);
789
      f.index = i->ifi_index;
790
      f.mtu = mtu;
791

    
792
      fl = i->ifi_flags;
793
      if (fl & IFF_UP)
794
        f.flags |= IF_ADMIN_UP;
795
      if (fl & IFF_LOWER_UP)
796
        f.flags |= IF_LINK_UP;
797
      if (fl & IFF_LOOPBACK)                /* Loopback */
798
        f.flags |= IF_MULTIACCESS | IF_LOOPBACK | IF_IGNORE;
799
      else if (fl & IFF_POINTOPOINT)        /* PtP */
800
        f.flags |= IF_MULTICAST;
801
      else if (fl & IFF_BROADCAST)        /* Broadcast */
802
        f.flags |= IF_MULTIACCESS | IF_BROADCAST | IF_MULTICAST;
803
      else
804
        f.flags |= IF_MULTIACCESS;        /* NBMA */
805

    
806
      if (fl & IFF_MULTICAST)
807
        f.flags |= IF_MULTICAST;
808

    
809
      ifi = if_update(&f);
810

    
811
      if (!scan)
812
        if_end_partial_update(ifi);
813
    }
814
}
815

    
816
static void
817
nl_parse_addr4(struct ifaddrmsg *i, int scan, int new)
818
{
819
  struct rtattr *a[BIRD_IFA_MAX];
820
  struct iface *ifi;
821
  u32 ifa_flags;
822
  int scope;
823

    
824
  if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want4, a, sizeof(a)))
825
    return;
826

    
827
  if (!a[IFA_LOCAL])
828
    {
829
      log(L_ERR "KIF: Malformed message received (missing IFA_LOCAL)");
830
      return;
831
    }
832
  if (!a[IFA_ADDRESS])
833
    {
834
      log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
835
      return;
836
    }
837

    
838
  ifi = if_find_by_index(i->ifa_index);
839
  if (!ifi)
840
    {
841
      log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
842
      return;
843
    }
844

    
845
  if (a[IFA_FLAGS])
846
    ifa_flags = rta_get_u32(a[IFA_FLAGS]);
847
  else
848
    ifa_flags = i->ifa_flags;
849

    
850
  struct ifa ifa;
851
  bzero(&ifa, sizeof(ifa));
852
  ifa.iface = ifi;
853
  if (ifa_flags & IFA_F_SECONDARY)
854
    ifa.flags |= IA_SECONDARY;
855

    
856
  ifa.ip = rta_get_ipa(a[IFA_LOCAL]);
857

    
858
  if (i->ifa_prefixlen > IP4_MAX_PREFIX_LENGTH)
859
    {
860
      log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
861
      new = 0;
862
    }
863
  if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH)
864
    {
865
      ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
866
      net_fill_ip4(&ifa.prefix, rta_get_ip4(a[IFA_ADDRESS]), i->ifa_prefixlen);
867

    
868
      /* It is either a host address or a peer address */
869
      if (ipa_equal(ifa.ip, ifa.brd))
870
        ifa.flags |= IA_HOST;
871
      else
872
        {
873
          ifa.flags |= IA_PEER;
874
          ifa.opposite = ifa.brd;
875
        }
876
    }
877
  else
878
    {
879
      net_fill_ip4(&ifa.prefix, ipa_to_ip4(ifa.ip), i->ifa_prefixlen);
880
      net_normalize(&ifa.prefix);
881

    
882
      if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 1)
883
        ifa.opposite = ipa_opposite_m1(ifa.ip);
884

    
885
      if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 2)
886
        ifa.opposite = ipa_opposite_m2(ifa.ip);
887

    
888
      if ((ifi->flags & IF_BROADCAST) && a[IFA_BROADCAST])
889
        {
890
          ip4_addr xbrd = rta_get_ip4(a[IFA_BROADCAST]);
891
          ip4_addr ybrd = ip4_or(ipa_to_ip4(ifa.ip), ip4_not(ip4_mkmask(i->ifa_prefixlen)));
892

    
893
          if (ip4_equal(xbrd, net4_prefix(&ifa.prefix)) || ip4_equal(xbrd, ybrd))
894
            ifa.brd = ipa_from_ip4(xbrd);
895
          else if (ifi->flags & IF_TMP_DOWN) /* Complain only during the first scan */
896
            {
897
              log(L_ERR "KIF: Invalid broadcast address %I4 for %s", xbrd, ifi->name);
898
              ifa.brd = ipa_from_ip4(ybrd);
899
            }
900
        }
901
    }
902

    
903
  scope = ipa_classify(ifa.ip);
904
  if (scope < 0)
905
    {
906
      log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
907
      return;
908
    }
909
  ifa.scope = scope & IADDR_SCOPE_MASK;
910

    
911
  DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
912
      ifi->index, ifi->name,
913
      new ? "added" : "removed",
914
      ifa.ip, ifa.flags, ifa.prefix, ifa.brd, ifa.opposite);
915

    
916
  if (new)
917
    ifa_update(&ifa);
918
  else
919
    ifa_delete(&ifa);
920

    
921
  if (!scan)
922
    if_end_partial_update(ifi);
923
}
924

    
925
static void
926
nl_parse_addr6(struct ifaddrmsg *i, int scan, int new)
927
{
928
  struct rtattr *a[BIRD_IFA_MAX];
929
  struct iface *ifi;
930
  u32 ifa_flags;
931
  int scope;
932

    
933
  if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want6, a, sizeof(a)))
934
    return;
935

    
936
  if (!a[IFA_ADDRESS])
937
    {
938
      log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
939
      return;
940
    }
941

    
942
  ifi = if_find_by_index(i->ifa_index);
943
  if (!ifi)
944
    {
945
      log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
946
      return;
947
    }
948

    
949
  if (a[IFA_FLAGS])
950
    ifa_flags = rta_get_u32(a[IFA_FLAGS]);
951
  else
952
    ifa_flags = i->ifa_flags;
953

    
954
  struct ifa ifa;
955
  bzero(&ifa, sizeof(ifa));
956
  ifa.iface = ifi;
957
  if (ifa_flags & IFA_F_SECONDARY)
958
    ifa.flags |= IA_SECONDARY;
959

    
960
  /* Ignore tentative addresses silently */
961
  if (ifa_flags & IFA_F_TENTATIVE)
962
    return;
963

    
964
  /* IFA_LOCAL can be unset for IPv6 interfaces */
965
  ifa.ip = rta_get_ipa(a[IFA_LOCAL] ? : a[IFA_ADDRESS]);
966

    
967
  if (i->ifa_prefixlen > IP6_MAX_PREFIX_LENGTH)
968
    {
969
      log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
970
      new = 0;
971
    }
972
  if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH)
973
    {
974
      ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
975
      net_fill_ip6(&ifa.prefix, rta_get_ip6(a[IFA_ADDRESS]), i->ifa_prefixlen);
976

    
977
      /* It is either a host address or a peer address */
978
      if (ipa_equal(ifa.ip, ifa.brd))
979
        ifa.flags |= IA_HOST;
980
      else
981
        {
982
          ifa.flags |= IA_PEER;
983
          ifa.opposite = ifa.brd;
984
        }
985
    }
986
  else
987
    {
988
      net_fill_ip6(&ifa.prefix, ipa_to_ip6(ifa.ip), i->ifa_prefixlen);
989
      net_normalize(&ifa.prefix);
990

    
991
      if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH - 1)
992
        ifa.opposite = ipa_opposite_m1(ifa.ip);
993
    }
994

    
995
  scope = ipa_classify(ifa.ip);
996
  if (scope < 0)
997
    {
998
      log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
999
      return;
1000
    }
1001
  ifa.scope = scope & IADDR_SCOPE_MASK;
1002

    
1003
  DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
1004
      ifi->index, ifi->name,
1005
      new ? "added" : "removed",
1006
      ifa.ip, ifa.flags, ifa.prefix, ifa.brd, ifa.opposite);
1007

    
1008
  if (new)
1009
    ifa_update(&ifa);
1010
  else
1011
    ifa_delete(&ifa);
1012

    
1013
  if (!scan)
1014
    if_end_partial_update(ifi);
1015
}
1016

    
1017
static void
1018
nl_parse_addr(struct nlmsghdr *h, int scan)
1019
{
1020
  struct ifaddrmsg *i;
1021

    
1022
  if (!(i = nl_checkin(h, sizeof(*i))))
1023
    return;
1024

    
1025
  int new = (h->nlmsg_type == RTM_NEWADDR);
1026

    
1027
  switch (i->ifa_family)
1028
    {
1029
      case AF_INET:
1030
        return nl_parse_addr4(i, scan, new);
1031

    
1032
      case AF_INET6:
1033
        return nl_parse_addr6(i, scan, new);
1034
    }
1035
}
1036

    
1037
void
1038
kif_do_scan(struct kif_proto *p UNUSED)
1039
{
1040
  struct nlmsghdr *h;
1041

    
1042
  if_start_update();
1043

    
1044
  nl_request_dump(AF_UNSPEC, RTM_GETLINK);
1045
  while (h = nl_get_scan())
1046
    if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK)
1047
      nl_parse_link(h, 1);
1048
    else
1049
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1050

    
1051
  nl_request_dump(AF_INET, RTM_GETADDR);
1052
  while (h = nl_get_scan())
1053
    if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1054
      nl_parse_addr(h, 1);
1055
    else
1056
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1057

    
1058
  nl_request_dump(AF_INET6, RTM_GETADDR);
1059
  while (h = nl_get_scan())
1060
    if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1061
      nl_parse_addr(h, 1);
1062
    else
1063
      log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1064

    
1065
  if_end_update();
1066
}
1067

    
1068
/*
1069
 *        Routes
1070
 */
1071

    
1072
static inline u32
1073
krt_table_id(struct krt_proto *p)
1074
{
1075
  return KRT_CF->sys.table_id;
1076
}
1077

    
1078
static HASH(struct krt_proto) nl_table_map;
1079

    
1080
#define RTH_KEY(p)                p->af, krt_table_id(p)
1081
#define RTH_NEXT(p)                p->sys.hash_next
1082
#define RTH_EQ(a1,i1,a2,i2)        a1 == a2 && i1 == i2
1083
#define RTH_FN(a,i)                a ^ u32_hash(i)
1084

    
1085
#define RTH_REHASH                rth_rehash
1086
#define RTH_PARAMS                /8, *2, 2, 2, 6, 20
1087

    
1088
HASH_DEFINE_REHASH_FN(RTH, struct krt_proto)
1089

    
1090
int
1091
krt_capable(rte *e)
1092
{
1093
  rta *a = e->attrs;
1094

    
1095
  switch (a->dest)
1096
  {
1097
    case RTD_UNICAST:
1098
    case RTD_BLACKHOLE:
1099
    case RTD_UNREACHABLE:
1100
    case RTD_PROHIBIT:
1101
      return 1;
1102

    
1103
    default:
1104
      return 0;
1105
  }
1106
}
1107

    
1108
static inline int
1109
nh_bufsize(struct nexthop *nh)
1110
{
1111
  int rv = 0;
1112
  for (; nh != NULL; nh = nh->next)
1113
    rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr)));
1114
  return rv;
1115
}
1116

    
1117
static int
1118
nl_send_route(struct krt_proto *p, rte *e, struct ea_list *eattrs, int op, int dest, struct nexthop *nh)
1119
{
1120
  eattr *ea;
1121
  net *net = e->net;
1122
  rta *a = e->attrs;
1123
  int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh));
1124
  u32 priority = 0;
1125

    
1126
  struct {
1127
    struct nlmsghdr h;
1128
    struct rtmsg r;
1129
    char buf[0];
1130
  } *r;
1131

    
1132
  int rsize = sizeof(*r) + bufsize;
1133
  r = alloca(rsize);
1134

    
1135
  DBG("nl_send_route(%N,op=%x)\n", net->n.addr, op);
1136

    
1137
  bzero(&r->h, sizeof(r->h));
1138
  bzero(&r->r, sizeof(r->r));
1139
  r->h.nlmsg_type = op ? RTM_NEWROUTE : RTM_DELROUTE;
1140
  r->h.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
1141
  r->h.nlmsg_flags = op | NLM_F_REQUEST | NLM_F_ACK;
1142

    
1143
  r->r.rtm_family = p->af;
1144
  r->r.rtm_dst_len = net_pxlen(net->n.addr);
1145
  r->r.rtm_protocol = RTPROT_BIRD;
1146
  r->r.rtm_scope = RT_SCOPE_UNIVERSE;
1147
  if (p->af == AF_MPLS)
1148
  {
1149
    u32 label = net_mpls(net->n.addr);
1150
    nl_add_attr_mpls(&r->h, rsize, RTA_DST, 1, &label);
1151
  }
1152
  else
1153
    nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(net->n.addr));
1154

    
1155
  /*
1156
   * Strange behavior for RTM_DELROUTE:
1157
   * 1) rtm_family is ignored in IPv6, works for IPv4
1158
   * 2) not setting RTA_PRIORITY is different from setting default value (on IPv6)
1159
   * 3) not setting RTA_PRIORITY is equivalent to setting 0, which is wildcard
1160
   */
1161

    
1162
  if (krt_table_id(p) < 256)
1163
    r->r.rtm_table = krt_table_id(p);
1164
  else
1165
    nl_add_attr_u32(&r->h, rsize, RTA_TABLE, krt_table_id(p));
1166

    
1167
  if (a->source == RTS_DUMMY)
1168
    priority = e->u.krt.metric;
1169
  else if (KRT_CF->sys.metric)
1170
    priority = KRT_CF->sys.metric;
1171
  else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC)))
1172
    priority = ea->u.data;
1173

    
1174
  if (priority)
1175
    nl_add_attr_u32(&r->h, sizeof(r), RTA_PRIORITY, priority);
1176

    
1177
  /* For route delete, we do not specify remaining route attributes */
1178
  if (op == NL_OP_DELETE)
1179
    goto dest;
1180

    
1181
  /* Default scope is LINK for device routes, UNIVERSE otherwise */
1182
  if (ea = ea_find(eattrs, EA_KRT_SCOPE))
1183
    r->r.rtm_scope = ea->u.data;
1184
  else
1185
    r->r.rtm_scope = (dest == RTD_UNICAST && ipa_zero(nh->gw)) ? RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
1186

    
1187
  if (ea = ea_find(eattrs, EA_KRT_PREFSRC))
1188
    nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data);
1189

    
1190
  if (ea = ea_find(eattrs, EA_KRT_REALM))
1191
    nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data);
1192

    
1193

    
1194
  u32 metrics[KRT_METRICS_MAX];
1195
  metrics[0] = 0;
1196

    
1197
  struct ea_walk_state ews = { .eattrs = eattrs };
1198
  while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX))
1199
  {
1200
    int id = ea->id - EA_KRT_METRICS;
1201
    metrics[0] |= 1 << id;
1202
    metrics[id] = ea->u.data;
1203
  }
1204

    
1205
  if (metrics[0])
1206
    nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX);
1207

    
1208

    
1209
dest:
1210
  switch (dest)
1211
    {
1212
    case RTD_UNICAST:
1213
      r->r.rtm_type = RTN_UNICAST;
1214
      if (nh->next && !krt_ecmp6(p))
1215
        nl_add_multipath(&r->h, rsize, nh, p->af);
1216
      else
1217
      {
1218
        nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index);
1219
        nl_add_nexthop(&r->h, rsize, nh, p->af);
1220
      }
1221
      break;
1222
    case RTD_BLACKHOLE:
1223
      r->r.rtm_type = RTN_BLACKHOLE;
1224
      break;
1225
    case RTD_UNREACHABLE:
1226
      r->r.rtm_type = RTN_UNREACHABLE;
1227
      break;
1228
    case RTD_PROHIBIT:
1229
      r->r.rtm_type = RTN_PROHIBIT;
1230
      break;
1231
    case RTD_NONE:
1232
      break;
1233
    default:
1234
      bug("krt_capable inconsistent with nl_send_route");
1235
    }
1236

    
1237
  /* Ignore missing for DELETE */
1238
  return nl_exchange(&r->h, (op == NL_OP_DELETE));
1239
}
1240

    
1241
static inline int
1242
nl_add_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs)
1243
{
1244
  rta *a = e->attrs;
1245
  int err = 0;
1246

    
1247
  if (krt_ecmp6(p) && a->nh.next)
1248
  {
1249
    struct nexthop *nh = &(a->nh);
1250

    
1251
    err = nl_send_route(p, e, eattrs, NL_OP_ADD, RTD_UNICAST, nh);
1252
    if (err < 0)
1253
      return err;
1254

    
1255
    for (nh = nh->next; nh; nh = nh->next)
1256
      err += nl_send_route(p, e, eattrs, NL_OP_APPEND, RTD_UNICAST, nh);
1257

    
1258
    return err;
1259
  }
1260

    
1261
  return nl_send_route(p, e, eattrs, NL_OP_ADD, a->dest, &(a->nh));
1262
}
1263

    
1264
static inline int
1265
nl_delete_rte(struct krt_proto *p, rte *e, struct ea_list *eattrs)
1266
{
1267
  int err = 0;
1268

    
1269
  /* For IPv6, we just repeatedly request DELETE until we get error */
1270
  do
1271
    err = nl_send_route(p, e, eattrs, NL_OP_DELETE, RTD_NONE, NULL);
1272
  while (krt_ecmp6(p) && !err);
1273

    
1274
  return err;
1275
}
1276

    
1277
void
1278
krt_replace_rte(struct krt_proto *p, net *n, rte *new, rte *old, struct ea_list *eattrs)
1279
{
1280
  int err = 0;
1281

    
1282
  /*
1283
   * We could use NL_OP_REPLACE, but route replace on Linux has some problems:
1284
   *
1285
   * 1) Does not check for matching rtm_protocol
1286
   * 2) Has broken semantics for IPv6 ECMP
1287
   * 3) Crashes some kernel version when used for IPv6 ECMP
1288
   *
1289
   * So we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the old
1290
   * route value, so we do not try to optimize IPv6 ECMP reconfigurations.
1291
   */
1292

    
1293
  if (old)
1294
    nl_delete_rte(p, old, eattrs);
1295

    
1296
  if (new)
1297
    err = nl_add_rte(p, new, eattrs);
1298

    
1299
  if (err < 0)
1300
    n->n.flags |= KRF_SYNC_ERROR;
1301
  else
1302
    n->n.flags &= ~KRF_SYNC_ERROR;
1303
}
1304

    
1305

    
1306
static inline struct nexthop *
1307
nl_alloc_nexthop(struct nl_parse_state *s, ip_addr gw, struct iface *iface, byte weight)
1308
{
1309
  struct nexthop *nh = lp_alloc(s->pool, sizeof(struct nexthop));
1310

    
1311
  nh->gw = gw;
1312
  nh->iface = iface;
1313
  nh->next = NULL;
1314
  nh->weight = weight;
1315

    
1316
  return nh;
1317
}
1318

    
1319
static int
1320
nl_mergable_route(struct nl_parse_state *s, net *net, struct krt_proto *p, uint priority, uint krt_type)
1321
{
1322
  /* Route merging must be active */
1323
  if (!s->merge)
1324
    return 0;
1325

    
1326
  /* Saved and new route must have same network, proto/table, and priority */
1327
  if ((s->net != net) || (s->proto != p) || (s->krt_metric != priority))
1328
    return 0;
1329

    
1330
  /* Both must be regular unicast routes */
1331
  if ((s->krt_type != RTN_UNICAST) || (krt_type != RTN_UNICAST))
1332
    return 0;
1333

    
1334
  return 1;
1335
}
1336

    
1337
static void
1338
nl_announce_route(struct nl_parse_state *s)
1339
{
1340
  rte *e = rte_get_temp(s->attrs);
1341
  e->net = s->net;
1342
  e->u.krt.src = s->krt_src;
1343
  e->u.krt.proto = s->krt_proto;
1344
  e->u.krt.seen = 0;
1345
  e->u.krt.best = 0;
1346
  e->u.krt.metric = s->krt_metric;
1347

    
1348
  if (s->scan)
1349
    krt_got_route(s->proto, e);
1350
  else
1351
    krt_got_route_async(s->proto, e, s->new);
1352

    
1353
  s->net = NULL;
1354
  s->attrs = NULL;
1355
  s->proto = NULL;
1356
  lp_flush(s->pool);
1357
}
1358

    
1359
static inline void
1360
nl_parse_begin(struct nl_parse_state *s, int scan, int merge)
1361
{
1362
  memset(s, 0, sizeof (struct nl_parse_state));
1363
  s->pool = nl_linpool;
1364
  s->scan = scan;
1365
  s->merge = merge;
1366
}
1367

    
1368
static inline void
1369
nl_parse_end(struct nl_parse_state *s)
1370
{
1371
  if (s->net)
1372
    nl_announce_route(s);
1373
}
1374

    
1375

    
1376
/* Emit a debug message about an ignored route and return from the caller */
#define SKIP(ARG...) \
  do { \
    DBG("KRT: Ignoring route - " ARG); \
    return; \
  } while(0)
1377

    
1378
static void
1379
nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
1380
{
1381
  struct krt_proto *p;
1382
  struct rtmsg *i;
1383
  struct rtattr *a[BIRD_RTA_MAX];
1384
  int new = h->nlmsg_type == RTM_NEWROUTE;
1385

    
1386
  net_addr dst;
1387
  u32 oif = ~0;
1388
  u32 table_id;
1389
  u32 priority = 0;
1390
  u32 def_scope = RT_SCOPE_UNIVERSE;
1391
  int src;
1392

    
1393
  if (!(i = nl_checkin(h, sizeof(*i))))
1394
    return;
1395

    
1396
  switch (i->rtm_family)
1397
    {
1398
    case AF_INET:
1399
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want4, a, sizeof(a)))
1400
        return;
1401

    
1402
      if (a[RTA_DST])
1403
        net_fill_ip4(&dst, rta_get_ip4(a[RTA_DST]), i->rtm_dst_len);
1404
      else
1405
        net_fill_ip4(&dst, IP4_NONE, 0);
1406
      break;
1407

    
1408
    case AF_INET6:
1409
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want6, a, sizeof(a)))
1410
        return;
1411

    
1412
      if (a[RTA_DST])
1413
        net_fill_ip6(&dst, rta_get_ip6(a[RTA_DST]), i->rtm_dst_len);
1414
      else
1415
        net_fill_ip6(&dst, IP6_NONE, 0);
1416
      break;
1417

    
1418
    case AF_MPLS:
1419
      if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want_mpls, a, sizeof(a)))
1420
        return;
1421

    
1422
      if (a[RTA_DST])
1423
        if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) == 1)
1424
          net_fill_mpls(&dst, rta_mpls_stack[0]);
1425
        else
1426
          log(L_WARN "KRT: Got multi-label MPLS RTA_DST");
1427
      else
1428
        return; /* No support for MPLS routes without RTA_DST */
1429
      break;
1430

    
1431
    default:
1432
      return;
1433
    }
1434

    
1435
  if (a[RTA_OIF])
1436
    oif = rta_get_u32(a[RTA_OIF]);
1437

    
1438
  if (a[RTA_TABLE])
1439
    table_id = rta_get_u32(a[RTA_TABLE]);
1440
  else
1441
    table_id = i->rtm_table;
1442

    
1443
  /* Do we know this table? */
1444
  p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id);
1445
  if (!p)
1446
    SKIP("unknown table %d\n", table);
1447

    
1448
  if (a[RTA_IIF])
1449
    SKIP("IIF set\n");
1450

    
1451
  if (i->rtm_tos != 0)                        /* We don't support TOS */
1452
    SKIP("TOS %02x\n", i->rtm_tos);
1453

    
1454
  if (s->scan && !new)
1455
    SKIP("RTM_DELROUTE in scan\n");
1456

    
1457
  if (a[RTA_PRIORITY])
1458
    priority = rta_get_u32(a[RTA_PRIORITY]);
1459

    
1460
  int c = net_classify(&dst);
1461
  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
1462
    SKIP("strange class/scope\n");
1463

    
1464
  switch (i->rtm_protocol)
1465
    {
1466
    case RTPROT_UNSPEC:
1467
      SKIP("proto unspec\n");
1468

    
1469
    case RTPROT_REDIRECT:
1470
      src = KRT_SRC_REDIRECT;
1471
      break;
1472

    
1473
    case RTPROT_KERNEL:
1474
      src = KRT_SRC_KERNEL;
1475
      return;
1476

    
1477
    case RTPROT_BIRD:
1478
      if (!s->scan)
1479
        SKIP("echo\n");
1480
      src = KRT_SRC_BIRD;
1481
      break;
1482

    
1483
    case RTPROT_BOOT:
1484
    default:
1485
      src = KRT_SRC_ALIEN;
1486
    }
1487

    
1488
  net *net = net_get(p->p.main_channel->table, &dst);
1489

    
1490
  if (s->net && !nl_mergable_route(s, net, p, priority, i->rtm_type))
1491
    nl_announce_route(s);
1492

    
1493
  rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
1494
  ra->src = p->p.main_source;
1495
  ra->source = RTS_INHERIT;
1496
  ra->scope = SCOPE_UNIVERSE;
1497

    
1498
  switch (i->rtm_type)
1499
    {
1500
    case RTN_UNICAST:
1501
      ra->dest = RTD_UNICAST;
1502

    
1503
      if (a[RTA_MULTIPATH] && (i->rtm_family == AF_INET))
1504
        {
1505
          struct nexthop *nh = nl_parse_multipath(p, a[RTA_MULTIPATH]);
1506
          if (!nh)
1507
            {
1508
              log(L_ERR "KRT: Received strange multipath route %N", net->n.addr);
1509
              return;
1510
            }
1511

    
1512
          ra->nh = *nh;
1513
          break;
1514
        }
1515

    
1516
      ra->nh.iface = if_find_by_index(oif);
1517
      if (!ra->nh.iface)
1518
        {
1519
          log(L_ERR "KRT: Received route %N with unknown ifindex %u", net->n.addr, oif);
1520
          return;
1521
        }
1522

    
1523
      if ((i->rtm_family != AF_MPLS) && a[RTA_GATEWAY] || (i->rtm_family == AF_MPLS) && a[RTA_VIA])
1524
        {
1525
          if (i->rtm_family == AF_MPLS)
1526
            ra->nh.gw = rta_get_via(a[RTA_VIA]);
1527
          else
1528
            ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]);
1529

    
1530
          /* Silently skip strange 6to4 routes */
1531
          const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96);
1532
          if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit))
1533
            return;
1534

    
1535
          neighbor *nbr;
1536
          nbr = neigh_find2(&p->p, &(ra->nh.gw), ra->nh.iface,
1537
                            (i->rtm_flags & RTNH_F_ONLINK) ? NEF_ONLINK : 0);
1538
          if (!nbr || (nbr->scope == SCOPE_HOST))
1539
            {
1540
              log(L_ERR "KRT: Received route %N with strange next-hop %I", net->n.addr,
1541
                  ra->nh.gw);
1542
              return;
1543
            }
1544
        }
1545

    
1546
      break;
1547
    case RTN_BLACKHOLE:
1548
      ra->dest = RTD_BLACKHOLE;
1549
      break;
1550
    case RTN_UNREACHABLE:
1551
      ra->dest = RTD_UNREACHABLE;
1552
      break;
1553
    case RTN_PROHIBIT:
1554
      ra->dest = RTD_PROHIBIT;
1555
      break;
1556
    /* FIXME: What about RTN_THROW? */
1557
    default:
1558
      SKIP("type %d\n", i->rtm_type);
1559
      return;
1560
    }
1561

    
1562
  int labels = 0;
1563
  if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next)
1564
    labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label);
1565

    
1566
  if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next)
1567
    {
1568
      switch (rta_get_u16(a[RTA_ENCAP_TYPE]))
1569
        {
1570
          case LWTUNNEL_ENCAP_MPLS:
1571
            {
1572
              struct rtattr *enca[BIRD_RTA_MAX];
1573
              nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
1574
              nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
1575
              labels = rta_get_mpls(enca[RTA_DST], ra->nh.label);
1576
              break;
1577
            }
1578
          default:
1579
            SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE]));
1580
            break;
1581
        }
1582
    }
1583

    
1584
  if (labels < 0)
1585
  {
1586
    log(L_WARN "KRT: Too long MPLS stack received, ignoring.");
1587
    ra->nh.labels = 0;
1588
  }
1589
  else
1590
    ra->nh.labels = labels;
1591

    
1592
  rte *e = rte_get_temp(ra);
1593
  e->net = net;
1594
  e->u.krt.src = src;
1595
  e->u.krt.proto = i->rtm_protocol;
1596
  e->u.krt.seen = 0;
1597
  e->u.krt.best = 0;
1598
  e->u.krt.metric = 0;
1599

    
1600
  if (i->rtm_scope != def_scope)
1601
    {
1602
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1603
      ea->next = ra->eattrs;
1604
      ra->eattrs = ea;
1605
      ea->flags = EALF_SORTED;
1606
      ea->count = 1;
1607
      ea->attrs[0].id = EA_KRT_SCOPE;
1608
      ea->attrs[0].flags = 0;
1609
      ea->attrs[0].type = EAF_TYPE_INT;
1610
      ea->attrs[0].u.data = i->rtm_scope;
1611
    }
1612

    
1613
  if (a[RTA_PRIORITY])
1614
    e->u.krt.metric = rta_get_u32(a[RTA_PRIORITY]);
1615

    
1616
  if (a[RTA_PREFSRC])
1617
    {
1618
      ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
1619

    
1620
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1621
      ea->next = ra->eattrs;
1622
      ra->eattrs = ea;
1623
      ea->flags = EALF_SORTED;
1624
      ea->count = 1;
1625
      ea->attrs[0].id = EA_KRT_PREFSRC;
1626
      ea->attrs[0].flags = 0;
1627
      ea->attrs[0].type = EAF_TYPE_IP_ADDRESS;
1628
      ea->attrs[0].u.ptr = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps));
1629
      ea->attrs[0].u.ptr->length = sizeof(ps);
1630
      memcpy(ea->attrs[0].u.ptr->data, &ps, sizeof(ps));
1631
    }
1632

    
1633
  if (a[RTA_FLOW])
1634
    {
1635
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1636
      ea->next = ra->eattrs;
1637
      ra->eattrs = ea;
1638
      ea->flags = EALF_SORTED;
1639
      ea->count = 1;
1640
      ea->attrs[0].id = EA_KRT_REALM;
1641
      ea->attrs[0].flags = 0;
1642
      ea->attrs[0].type = EAF_TYPE_INT;
1643
      ea->attrs[0].u.data = rta_get_u32(a[RTA_FLOW]);
1644
    }
1645

    
1646
  if (a[RTA_METRICS])
1647
    {
1648
      u32 metrics[KRT_METRICS_MAX];
1649
      ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
1650
      int t, n = 0;
1651

    
1652
      if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0)
1653
        {
1654
          log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net->n.addr);
1655
          return;
1656
        }
1657

    
1658
      for (t = 1; t < KRT_METRICS_MAX; t++)
1659
        if (metrics[0] & (1 << t))
1660
          {
1661
            ea->attrs[n].id = EA_CODE(EAP_KRT, KRT_METRICS_OFFSET + t);
1662
            ea->attrs[n].flags = 0;
1663
            ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */
1664
            ea->attrs[n].u.data = metrics[t];
1665
            n++;
1666
          }
1667

    
1668
      if (n > 0)
1669
        {
1670
          ea->next = ra->eattrs;
1671
          ea->flags = EALF_SORTED;
1672
          ea->count = n;
1673
          ra->eattrs = ea;
1674
        }
1675
    }
1676

    
1677
  /*
1678
   * Ideally, now we would send the received route to the rest of kernel code.
1679
   * But IPv6 ECMP routes are sent as a sequence of routes, so we postpone it
1680
   * and merge next hops until the end of the sequence.
1681
   */
1682

    
1683
  if (!s->net)
1684
  {
1685
    /* Store the new route */
1686
    s->net = net;
1687
    s->attrs = ra;
1688
    s->proto = p;
1689
    s->new = new;
1690
    s->krt_src = src;
1691
    s->krt_type = i->rtm_type;
1692
    s->krt_proto = i->rtm_protocol;
1693
    s->krt_metric = priority;
1694
  }
1695
  else
1696
  {
1697
    /* Merge next hops with the stored route */
1698
    rta *oa = s->attrs;
1699

    
1700
    struct nexthop *nhs = &oa->nh;
1701
    nexthop_insert(&nhs, &ra->nh);
1702

    
1703
    /* Perhaps new nexthop is inserted at the first position */
1704
    if (nhs == &ra->nh)
1705
    {
1706
      /* Swap rtas */
1707
      s->attrs = ra;
1708

    
1709
      /* Keep old eattrs */
1710
      ra->eattrs = oa->eattrs;
1711
    }
1712
  }
1713
}
1714

    
1715
void
1716
krt_do_scan(struct krt_proto *p UNUSED)        /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
1717
{
1718
  struct nlmsghdr *h;
1719
  struct nl_parse_state s;
1720

    
1721
  nl_parse_begin(&s, 1, 0);
1722
  nl_request_dump(AF_INET, RTM_GETROUTE);
1723
  while (h = nl_get_scan())
1724
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1725
      nl_parse_route(&s, h);
1726
    else
1727
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1728
  nl_parse_end(&s);
1729

    
1730
  nl_parse_begin(&s, 1, 1);
1731
  nl_request_dump(AF_INET6, RTM_GETROUTE);
1732
  while (h = nl_get_scan())
1733
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1734
      nl_parse_route(&s, h);
1735
    else
1736
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1737
  nl_parse_end(&s);
1738

    
1739
  nl_parse_begin(&s, 1, 1);
1740
  nl_request_dump(AF_MPLS, RTM_GETROUTE);
1741
  while (h = nl_get_scan())
1742
    if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1743
      nl_parse_route(&s, h);
1744
    else
1745
      log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1746
  nl_parse_end(&s);
1747
}
1748

    
1749
/*
1750
 *        Asynchronous Netlink interface
1751
 */
1752

    
1753
static sock *nl_async_sk;                /* BIRD socket for asynchronous notifications */
1754
static byte *nl_async_rx_buffer;        /* Receive buffer */
1755

    
1756
static void
1757
nl_async_msg(struct nlmsghdr *h)
1758
{
1759
  struct nl_parse_state s;
1760

    
1761
  switch (h->nlmsg_type)
1762
    {
1763
    case RTM_NEWROUTE:
1764
    case RTM_DELROUTE:
1765
      DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
1766
      nl_parse_begin(&s, 0, 0);
1767
      nl_parse_route(&s, h);
1768
      nl_parse_end(&s);
1769
      break;
1770
    case RTM_NEWLINK:
1771
    case RTM_DELLINK:
1772
      DBG("KRT: Received async link notification (%d)\n", h->nlmsg_type);
1773
      if (kif_proto)
1774
        nl_parse_link(h, 0);
1775
      break;
1776
    case RTM_NEWADDR:
1777
    case RTM_DELADDR:
1778
      DBG("KRT: Received async address notification (%d)\n", h->nlmsg_type);
1779
      if (kif_proto)
1780
        nl_parse_addr(h, 0);
1781
      break;
1782
    default:
1783
      DBG("KRT: Received unknown async notification (%d)\n", h->nlmsg_type);
1784
    }
1785
}
1786

    
1787
static int
1788
nl_async_hook(sock *sk, uint size UNUSED)
1789
{
1790
  struct iovec iov = { nl_async_rx_buffer, NL_RX_SIZE };
1791
  struct sockaddr_nl sa;
1792
  struct msghdr m = {
1793
    .msg_name = &sa,
1794
    .msg_namelen = sizeof(sa),
1795
    .msg_iov = &iov,
1796
    .msg_iovlen = 1,
1797
  };
1798
  struct nlmsghdr *h;
1799
  int x;
1800
  uint len;
1801

    
1802
  x = recvmsg(sk->fd, &m, 0);
1803
  if (x < 0)
1804
    {
1805
      if (errno == ENOBUFS)
1806
        {
1807
          /*
1808
           *  Netlink reports some packets have been thrown away.
1809
           *  One day we might react to it by asking for route table
1810
           *  scan in near future.
1811
           */
1812
          return 1;        /* More data are likely to be ready */
1813
        }
1814
      else if (errno != EWOULDBLOCK)
1815
        log(L_ERR "Netlink recvmsg: %m");
1816
      return 0;
1817
    }
1818
  if (sa.nl_pid)                /* It isn't from the kernel */
1819
    {
1820
      DBG("Non-kernel packet\n");
1821
      return 1;
1822
    }
1823
  h = (void *) nl_async_rx_buffer;
1824
  len = x;
1825
  if (m.msg_flags & MSG_TRUNC)
1826
    {
1827
      log(L_WARN "Netlink got truncated asynchronous message");
1828
      return 1;
1829
    }
1830
  while (NLMSG_OK(h, len))
1831
    {
1832
      nl_async_msg(h);
1833
      h = NLMSG_NEXT(h, len);
1834
    }
1835
  if (len)
1836
    log(L_WARN "nl_async_hook: Found packet remnant of size %d", len);
1837
  return 1;
1838
}
1839

    
1840
static void
1841
nl_async_err_hook(sock *sk, int e UNUSED)
1842
{
1843
  nl_async_hook(sk, 0);
1844
}
1845

    
1846
static void
1847
nl_open_async(void)
1848
{
1849
  sock *sk;
1850
  struct sockaddr_nl sa;
1851
  int fd;
1852

    
1853
  if (nl_async_sk)
1854
    return;
1855

    
1856
  DBG("KRT: Opening async netlink socket\n");
1857

    
1858
  fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
1859
  if (fd < 0)
1860
    {
1861
      log(L_ERR "Unable to open asynchronous rtnetlink socket: %m");
1862
      return;
1863
    }
1864

    
1865
  bzero(&sa, sizeof(sa));
1866
  sa.nl_family = AF_NETLINK;
1867
  sa.nl_groups = RTMGRP_LINK |
1868
    RTMGRP_IPV4_IFADDR | RTMGRP_IPV4_ROUTE |
1869
    RTMGRP_IPV6_IFADDR | RTMGRP_IPV6_ROUTE;
1870

    
1871
  if (bind(fd, (struct sockaddr *) &sa, sizeof(sa)) < 0)
1872
    {
1873
      log(L_ERR "Unable to bind asynchronous rtnetlink socket: %m");
1874
      close(fd);
1875
      return;
1876
    }
1877

    
1878
  nl_async_rx_buffer = xmalloc(NL_RX_SIZE);
1879

    
1880
  sk = nl_async_sk = sk_new(krt_pool);
1881
  sk->type = SK_MAGIC;
1882
  sk->rx_hook = nl_async_hook;
1883
  sk->err_hook = nl_async_err_hook;
1884
  sk->fd = fd;
1885
  if (sk_open(sk) < 0)
1886
    bug("Netlink: sk_open failed");
1887
}
1888

    
1889

    
1890
/*
1891
 *        Interface to the UNIX krt module
1892
 */
1893

    
1894
void
1895
krt_sys_io_init(void)
1896
{
1897
  nl_linpool = lp_new(krt_pool, 4080);
1898
  HASH_INIT(nl_table_map, krt_pool, 6);
1899
}
1900

    
1901
int
1902
krt_sys_start(struct krt_proto *p)
1903
{
1904
  struct krt_proto *old = HASH_FIND(nl_table_map, RTH, p->af, krt_table_id(p));
1905

    
1906
  if (old)
1907
    {
1908
      log(L_ERR "%s: Kernel table %u already registered by %s",
1909
          p->p.name, krt_table_id(p), old->p.name);
1910
      return 0;
1911
    }
1912

    
1913
  HASH_INSERT2(nl_table_map, RTH, krt_pool, p);
1914

    
1915
  nl_open();
1916
  nl_open_async();
1917

    
1918
  return 1;
1919
}
1920

    
1921
void
1922
krt_sys_shutdown(struct krt_proto *p)
1923
{
1924
  HASH_REMOVE2(nl_table_map, RTH, krt_pool, p);
1925
}
1926

    
1927
int
1928
krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o)
1929
{
1930
  return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric);
1931
}
1932

    
1933
void
1934
krt_sys_init_config(struct krt_config *cf)
1935
{
1936
  cf->sys.table_id = RT_TABLE_MAIN;
1937
  cf->sys.metric = 0;
1938
}
1939

    
1940
void
1941
krt_sys_copy_config(struct krt_config *d, struct krt_config *s)
1942
{
1943
  d->sys.table_id = s->sys.table_id;
1944
  d->sys.metric = s->sys.metric;
1945
}
1946

    
1947
/* Display names of the kernel route metrics, indexed by metric number; slot 0 has no name */
static const char *krt_metrics_names[KRT_METRICS_MAX] = {
  [1]  = "lock",
  [2]  = "mtu",
  [3]  = "window",
  [4]  = "rtt",
  [5]  = "rttvar",
  [6]  = "sstresh",
  [7]  = "cwnd",
  [8]  = "advmss",
  [9]  = "reordering",
  [10] = "hoplimit",
  [11] = "initcwnd",
  [12] = "features",
  [13] = "rto_min",
  [14] = "initrwnd",
  [15] = "quickack",
};
1951

    
1952
/* Display names of the bits of the "features" metric; unnamed bits stay NULL */
static const char *krt_features_names[KRT_FEATURES_MAX] = {
  [0] = "ecn",
  [3] = "allfrag",
};
1955

    
1956
int
1957
krt_sys_get_attr(eattr *a, byte *buf, int buflen UNUSED)
1958
{
1959
  switch (a->id)
1960
  {
1961
  case EA_KRT_PREFSRC:
1962
    bsprintf(buf, "prefsrc");
1963
    return GA_NAME;
1964

    
1965
  case EA_KRT_REALM:
1966
    bsprintf(buf, "realm");
1967
    return GA_NAME;
1968

    
1969
  case EA_KRT_SCOPE:
1970
    bsprintf(buf, "scope");
1971
    return GA_NAME;
1972

    
1973
  case EA_KRT_LOCK:
1974
    buf += bsprintf(buf, "lock:");
1975
    ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
1976
    return GA_FULL;
1977

    
1978
  case EA_KRT_FEATURES:
1979
    buf += bsprintf(buf, "features:");
1980
    ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX);
1981
    return GA_FULL;
1982

    
1983
  default:;
1984
    int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET;
1985
    if (id > 0 && id < KRT_METRICS_MAX)
1986
    {
1987
      bsprintf(buf, "%s", krt_metrics_names[id]);
1988
      return GA_NAME;
1989
    }
1990

    
1991
    return GA_UNKNOWN;
1992
  }
1993
}
1994

    
1995

    
1996

    
1997
void
1998
kif_sys_start(struct kif_proto *p UNUSED)
1999
{
2000
  nl_open();
2001
  nl_open_async();
2002
}
2003

    
2004
void
2005
kif_sys_shutdown(struct kif_proto *p UNUSED)
2006
{
2007
}