Statistics
| Branch: | Revision:

iof-bird-daemon / sysdep / unix / io.c @ 48e5f32d

History | View | Annotate | Download (37.7 KB)

1
/*
2
 *        BIRD Internet Routing Daemon -- Unix I/O
3
 *
4
 *        (c) 1998--2004 Martin Mares <mj@ucw.cz>
5
 *      (c) 2004       Ondrej Filip <feela@network.cz>
6
 *
7
 *        Can be freely distributed and used under the terms of the GNU GPL.
8
 */
9

    
10
/* Unfortunately, some glibc versions hide parts of RFC 3542 API
11
   if _GNU_SOURCE is not defined. */
12
#define _GNU_SOURCE 1
13

    
14
#include <stdio.h>
15
#include <stdlib.h>
16
#include <time.h>
17
#include <sys/time.h>
18
#include <sys/types.h>
19
#include <sys/socket.h>
20
#include <sys/uio.h>
21
#include <sys/un.h>
22
#include <unistd.h>
23
#include <fcntl.h>
24
#include <errno.h>
25
#include <netinet/in.h>
26
#include <netinet/tcp.h>
27
#include <netinet/udp.h>
28
#include <netinet/icmp6.h>
29

    
30
#include "nest/bird.h"
31
#include "lib/lists.h"
32
#include "lib/resource.h"
33
#include "lib/timer.h"
34
#include "lib/socket.h"
35
#include "lib/event.h"
36
#include "lib/string.h"
37
#include "nest/iface.h"
38

    
39
#include "lib/unix.h"
40
#include "lib/sysio.h"
41

    
42
/* Maximum number of calls of tx handler for one socket in one
43
 * select iteration. Should be small enough to not monopolize CPU by
44
 * one protocol instance.
45
 */
46
#define MAX_STEPS 4
47

    
48
/* Maximum number of calls of rx handler for all sockets in one select
49
   iteration. RX callbacks are often much more costly so we limit
50
   this to get small latencies */
51
#define MAX_RX_STEPS 4
52

    
53
/*
54
 *        Tracked Files
55
 */
56

    
57
struct rfile {
58
  resource r;
59
  FILE *f;
60
};
61

    
62
static void
63
rf_free(resource *r)
64
{
65
  struct rfile *a = (struct rfile *) r;
66

    
67
  fclose(a->f);
68
}
69

    
70
static void
71
rf_dump(resource *r)
72
{
73
  struct rfile *a = (struct rfile *) r;
74

    
75
  debug("(FILE *%p)\n", a->f);
76
}
77

    
78
static struct resclass rf_class = {
79
  "FILE",
80
  sizeof(struct rfile),
81
  rf_free,
82
  rf_dump,
83
  NULL,
84
  NULL
85
};
86

    
87
void *
88
tracked_fopen(pool *p, char *name, char *mode)
89
{
90
  FILE *f = fopen(name, mode);
91

    
92
  if (f)
93
    {
94
      struct rfile *r = ralloc(p, &rf_class);
95
      r->f = f;
96
    }
97
  return f;
98
}
99

    
100
/**
101
 * DOC: Timers
102
 *
103
 * Timers are resources which represent a wish of a module to call
104
 * a function at the specified time. The platform dependent code
105
 * doesn't guarantee exact timing, only that a timer function
106
 * won't be called before the requested time.
107
 *
108
 * In BIRD, time is represented by values of the &bird_clock_t type
109
 * which are integral numbers interpreted as a relative number of seconds since
110
 * some fixed time point in past. The current time can be read
111
 * from variable @now with reasonable accuracy and is monotonic. There is also
112
 * a current 'absolute' time in variable @now_real reported by OS.
113
 *
114
 * Each timer is described by a &timer structure containing a pointer
115
 * to the handler function (@hook), data private to this function (@data),
116
 * time the function should be called at (@expires, 0 for inactive timers),
117
 * for the other fields see |timer.h|.
118
 */
119

    
120
#define NEAR_TIMER_LIMIT 4
121

    
122
static list near_timers, far_timers;
123
static bird_clock_t first_far_timer = TIME_INFINITY;
124

    
125
/* now must be different from 0, because 0 is a special value in timer->expires */
126
bird_clock_t now = 1, now_real, boot_time;
127

    
128
static void
129
update_times_plain(void)
130
{
131
  bird_clock_t new_time = time(NULL);
132
  int delta = new_time - now_real;
133

    
134
  if ((delta >= 0) && (delta < 60))
135
    now += delta;
136
  else if (now_real != 0)
137
   log(L_WARN "Time jump, delta %d s", delta);
138

    
139
  now_real = new_time;
140
}
141

    
142
static void
143
update_times_gettime(void)
144
{
145
  struct timespec ts;
146
  int rv;
147

    
148
  rv = clock_gettime(CLOCK_MONOTONIC, &ts);
149
  if (rv != 0)
150
    die("clock_gettime: %m");
151

    
152
  if (ts.tv_sec != now) {
153
    if (ts.tv_sec < now)
154
      log(L_ERR "Monotonic timer is broken");
155

    
156
    now = ts.tv_sec;
157
    now_real = time(NULL);
158
  }
159
}
160

    
161
/* Nonzero when CLOCK_MONOTONIC was found usable by init_times() */
static int clock_monotonic_available;

/* Refresh @now and @now_real using the clock source selected at startup. */
static inline void
update_times(void)
{
  if (!clock_monotonic_available)
    update_times_plain();
  else
    update_times_gettime();
}
171

    
172
static inline void
173
init_times(void)
174
{
175
 struct timespec ts;
176
 clock_monotonic_available = (clock_gettime(CLOCK_MONOTONIC, &ts) == 0);
177
 if (!clock_monotonic_available)
178
   log(L_WARN "Monotonic timer is missing");
179
}
180

    
181

    
182
static void
183
tm_free(resource *r)
184
{
185
  timer *t = (timer *) r;
186

    
187
  tm_stop(t);
188
}
189

    
190
static void
191
tm_dump(resource *r)
192
{
193
  timer *t = (timer *) r;
194

    
195
  debug("(code %p, data %p, ", t->hook, t->data);
196
  if (t->randomize)
197
    debug("rand %d, ", t->randomize);
198
  if (t->recurrent)
199
    debug("recur %d, ", t->recurrent);
200
  if (t->expires)
201
    debug("expires in %d sec)\n", t->expires - now);
202
  else
203
    debug("inactive)\n");
204
}
205

    
206
static struct resclass tm_class = {
207
  "Timer",
208
  sizeof(timer),
209
  tm_free,
210
  tm_dump,
211
  NULL,
212
  NULL
213
};
214

    
215
/**
216
 * tm_new - create a timer
217
 * @p: pool
218
 *
219
 * This function creates a new timer resource and returns
220
 * a pointer to it. To use the timer, you need to fill in
221
 * the structure fields and call tm_start() to start timing.
222
 */
223
timer *
224
tm_new(pool *p)
225
{
226
  timer *t = ralloc(p, &tm_class);
227
  return t;
228
}
229

    
230
static inline void
231
tm_insert_near(timer *t)
232
{
233
  node *n = HEAD(near_timers);
234

    
235
  while (n->next && (SKIP_BACK(timer, n, n)->expires < t->expires))
236
    n = n->next;
237
  insert_node(&t->n, n->prev);
238
}
239

    
240
/**
241
 * tm_start - start a timer
242
 * @t: timer
243
 * @after: number of seconds the timer should be run after
244
 *
245
 * This function schedules the hook function of the timer to
246
 * be called after @after seconds. If the timer has been already
247
 * started, its @expires time is replaced by the new value.
248
 *
249
 * If you have set the @randomize field of @t, the timeout
250
 * will be increased by a random number of seconds chosen
251
 * uniformly from range 0 .. @randomize.
252
 *
253
 * You can call tm_start() from the handler function of the timer
254
 * to request another run of the timer. Also, you can set the @recurrent
255
 * field to have the timer re-added automatically with the same timeout.
256
 */
257
void
258
tm_start(timer *t, unsigned after)
259
{
260
  bird_clock_t when;
261

    
262
  if (t->randomize)
263
    after += random() % (t->randomize + 1);
264
  when = now + after;
265
  if (t->expires == when)
266
    return;
267
  if (t->expires)
268
    rem_node(&t->n);
269
  t->expires = when;
270
  if (after <= NEAR_TIMER_LIMIT)
271
    tm_insert_near(t);
272
  else
273
    {
274
      if (!first_far_timer || first_far_timer > when)
275
        first_far_timer = when;
276
      add_tail(&far_timers, &t->n);
277
    }
278
}
279

    
280
/**
281
 * tm_stop - stop a timer
282
 * @t: timer
283
 *
284
 * This function stops a timer. If the timer is already stopped,
285
 * nothing happens.
286
 */
287
void
288
tm_stop(timer *t)
289
{
290
  if (t->expires)
291
    {
292
      rem_node(&t->n);
293
      t->expires = 0;
294
    }
295
}
296

    
297
static void
298
tm_dump_them(char *name, list *l)
299
{
300
  node *n;
301
  timer *t;
302

    
303
  debug("%s timers:\n", name);
304
  WALK_LIST(n, *l)
305
    {
306
      t = SKIP_BACK(timer, n, n);
307
      debug("%p ", t);
308
      tm_dump(&t->r);
309
    }
310
  debug("\n");
311
}
312

    
313
void
314
tm_dump_all(void)
315
{
316
  tm_dump_them("Near", &near_timers);
317
  tm_dump_them("Far", &far_timers);
318
}
319

    
320
static inline time_t
321
tm_first_shot(void)
322
{
323
  time_t x = first_far_timer;
324

    
325
  if (!EMPTY_LIST(near_timers))
326
    {
327
      timer *t = SKIP_BACK(timer, n, HEAD(near_timers));
328
      if (t->expires < x)
329
        x = t->expires;
330
    }
331
  return x;
332
}
333

    
334
static void
335
tm_shot(void)
336
{
337
  timer *t;
338
  node *n, *m;
339

    
340
  if (first_far_timer <= now)
341
    {
342
      bird_clock_t limit = now + NEAR_TIMER_LIMIT;
343
      first_far_timer = TIME_INFINITY;
344
      n = HEAD(far_timers);
345
      while (m = n->next)
346
        {
347
          t = SKIP_BACK(timer, n, n);
348
          if (t->expires <= limit)
349
            {
350
              rem_node(n);
351
              tm_insert_near(t);
352
            }
353
          else if (t->expires < first_far_timer)
354
            first_far_timer = t->expires;
355
          n = m;
356
        }
357
    }
358
  while ((n = HEAD(near_timers)) -> next)
359
    {
360
      int delay;
361
      t = SKIP_BACK(timer, n, n);
362
      if (t->expires > now)
363
        break;
364
      rem_node(n);
365
      delay = t->expires - now;
366
      t->expires = 0;
367
      if (t->recurrent)
368
        {
369
          int i = t->recurrent - delay;
370
          if (i < 0)
371
            i = 0;
372
          tm_start(t, i);
373
        }
374
      t->hook(t);
375
    }
376
}
377

    
378
/**
379
 * tm_parse_datetime - parse a date and time
380
 * @x: datetime string
381
 *
382
 * tm_parse_datetime() takes a textual representation of
383
 * a date and time (dd-mm-yyyy hh:mm:ss)
384
 * and converts it to the corresponding value of type &bird_clock_t.
385
 */
386
bird_clock_t
387
tm_parse_datetime(char *x)
388
{
389
  struct tm tm;
390
  int n;
391
  time_t t;
392

    
393
  if (sscanf(x, "%d-%d-%d %d:%d:%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &tm.tm_hour, &tm.tm_min, &tm.tm_sec, &n) != 6 || x[n])
394
    return tm_parse_date(x);
395
  tm.tm_mon--;
396
  tm.tm_year -= 1900;
397
  t = mktime(&tm);
398
  if (t == (time_t) -1)
399
    return 0;
400
  return t;
401
}
402
/**
403
 * tm_parse_date - parse a date
404
 * @x: date string
405
 *
406
 * tm_parse_date() takes a textual representation of a date (dd-mm-yyyy)
407
 * and converts it to the corresponding value of type &bird_clock_t.
408
 */
409
bird_clock_t
410
tm_parse_date(char *x)
411
{
412
  struct tm tm;
413
  int n;
414
  time_t t;
415

    
416
  if (sscanf(x, "%d-%d-%d%n", &tm.tm_mday, &tm.tm_mon, &tm.tm_year, &n) != 3 || x[n])
417
    return 0;
418
  tm.tm_mon--;
419
  tm.tm_year -= 1900;
420
  tm.tm_hour = tm.tm_min = tm.tm_sec = 0;
421
  t = mktime(&tm);
422
  if (t == (time_t) -1)
423
    return 0;
424
  return t;
425
}
426

    
427
static void
428
tm_format_reltime(char *x, struct tm *tm, bird_clock_t delta)
429
{
430
  static char *month_names[12] = { "Jan", "Feb", "Mar", "Apr", "May", "Jun",
431
                                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" };
432

    
433
  if (delta < 20*3600)
434
    bsprintf(x, "%02d:%02d", tm->tm_hour, tm->tm_min);
435
  else if (delta < 360*86400)
436
    bsprintf(x, "%s%02d", month_names[tm->tm_mon], tm->tm_mday);
437
  else
438
    bsprintf(x, "%d", tm->tm_year+1900);
439
}
440

    
441
#include "conf/conf.h"
442

    
443
/**
444
 * tm_format_datetime - convert date and time to textual representation
445
 * @x: destination buffer of size %TM_DATETIME_BUFFER_SIZE
446
 * @t: time
447
 *
448
 * This function formats the given relative time value @t to a textual
449
 * date/time representation (dd-mm-yyyy hh:mm:ss) in real time.
450
 */
451
void
452
tm_format_datetime(char *x, struct timeformat *fmt_spec, bird_clock_t t)
453
{
454
  const char *fmt_used;
455
  struct tm *tm;
456
  bird_clock_t delta = now - t;
457
  t = now_real - delta;
458
  tm = localtime(&t);
459

    
460
  if (fmt_spec->fmt1 == NULL)
461
    return tm_format_reltime(x, tm, delta);
462

    
463
  if ((fmt_spec->limit == 0) || (delta < fmt_spec->limit))
464
    fmt_used = fmt_spec->fmt1;
465
  else
466
    fmt_used = fmt_spec->fmt2;
467

    
468
  int rv = strftime(x, TM_DATETIME_BUFFER_SIZE, fmt_used, tm);
469
  if (((rv == 0) && fmt_used[0]) || (rv == TM_DATETIME_BUFFER_SIZE))
470
    strcpy(x, "<too-long>");
471
}
472

    
473
/**
474
 * DOC: Sockets
475
 *
476
 * Socket resources represent network connections. Their data structure (&socket)
477
 * contains a lot of fields defining the exact type of the socket, the local and
478
 * remote addresses and ports, pointers to socket buffers and finally pointers to
479
 * hook functions to be called when new data have arrived to the receive buffer
480
 * (@rx_hook), when the contents of the transmit buffer have been transmitted
481
 * (@tx_hook) and when an error or connection close occurs (@err_hook).
482
 *
483
 * Freeing of sockets from inside socket hooks is perfectly safe.
484
 */
485

    
486
/* Fall back to the IPPROTO_* values where the SOL_* socket levels are not defined */
#if !defined(SOL_IP)
#define SOL_IP IPPROTO_IP
#endif

#if !defined(SOL_IPV6)
#define SOL_IPV6 IPPROTO_IPV6
#endif

#if !defined(SOL_ICMPV6)
#define SOL_ICMPV6 IPPROTO_ICMPV6
#endif
497

    
498

    
499
static list sock_list;			/* Sockets linked in by sk_insert() */
/* sk_free() advances these two to the next socket when the one they point to is freed */
static struct birdsock *current_sock;
static struct birdsock *stored_sock;
static int sock_recalc_fdsets_p;	/* Set to 1 whenever sock_list changes */
503

    
504
/* Return the socket following @s in its list, or NULL if @s is the last one. */
static inline sock *
sk_next(sock *s)
{
  node *succ = s->n.next;

  /* A node without a successor is the list's tail sentinel, not a socket */
  return succ->next ? SKIP_BACK(sock, n, succ) : NULL;
}
512

    
513
static void
514
sk_alloc_bufs(sock *s)
515
{
516
  if (!s->rbuf && s->rbsize)
517
    s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
518
  s->rpos = s->rbuf;
519
  if (!s->tbuf && s->tbsize)
520
    s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
521
  s->tpos = s->ttx = s->tbuf;
522
}
523

    
524
static void
525
sk_free_bufs(sock *s)
526
{
527
  if (s->rbuf_alloc)
528
    {
529
      xfree(s->rbuf_alloc);
530
      s->rbuf = s->rbuf_alloc = NULL;
531
    }
532
  if (s->tbuf_alloc)
533
    {
534
      xfree(s->tbuf_alloc);
535
      s->tbuf = s->tbuf_alloc = NULL;
536
    }
537
}
538

    
539
static void
540
sk_free(resource *r)
541
{
542
  sock *s = (sock *) r;
543

    
544
  sk_free_bufs(s);
545
  if (s->fd >= 0)
546
    {
547
      close(s->fd);
548

    
549
      /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
550
      if (s->flags & SKF_THREAD)
551
        return;
552

    
553
      if (s == current_sock)
554
        current_sock = sk_next(s);
555
      if (s == stored_sock)
556
        stored_sock = sk_next(s);
557
      rem_node(&s->n);
558
      sock_recalc_fdsets_p = 1;
559
    }
560
}
561

    
562
void
563
sk_set_rbsize(sock *s, uint val)
564
{
565
  ASSERT(s->rbuf_alloc == s->rbuf);
566

    
567
  if (s->rbsize == val)
568
    return;
569

    
570
  s->rbsize = val;
571
  xfree(s->rbuf_alloc);
572
  s->rbuf_alloc = xmalloc(val);
573
  s->rpos = s->rbuf = s->rbuf_alloc;
574
}
575

    
576
void
577
sk_set_tbsize(sock *s, uint val)
578
{
579
  ASSERT(s->tbuf_alloc == s->tbuf);
580

    
581
  if (s->tbsize == val)
582
    return;
583

    
584
  byte *old_tbuf = s->tbuf;
585

    
586
  s->tbsize = val;
587
  s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
588
  s->tpos = s->tbuf + (s->tpos - old_tbuf);
589
  s->ttx  = s->tbuf + (s->ttx  - old_tbuf);
590
}
591

    
592
void
593
sk_set_tbuf(sock *s, void *tbuf)
594
{
595
  s->tbuf = tbuf ?: s->tbuf_alloc;
596
  s->ttx = s->tpos = s->tbuf;
597
}
598

    
599
void
600
sk_reallocate(sock *s)
601
{
602
  sk_free_bufs(s);
603
  sk_alloc_bufs(s);
604
}
605

    
606
static void
607
sk_dump(resource *r)
608
{
609
  sock *s = (sock *) r;
610
  static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", "UDP/MC", "IP", "IP/MC", "MAGIC", "UNIX<", "UNIX", "DEL!" };
611

    
612
  debug("(%s, ud=%p, sa=%08x, sp=%d, da=%08x, dp=%d, tos=%d, ttl=%d, if=%s)\n",
613
        sk_type_names[s->type],
614
        s->data,
615
        s->saddr,
616
        s->sport,
617
        s->daddr,
618
        s->dport,
619
        s->tos,
620
        s->ttl,
621
        s->iface ? s->iface->name : "none");
622
}
623

    
624
static struct resclass sk_class = {
625
  "Socket",
626
  sizeof(sock),
627
  sk_free,
628
  sk_dump,
629
  NULL,
630
  NULL
631
};
632

    
633
/**
634
 * sk_new - create a socket
635
 * @p: pool
636
 *
637
 * This function creates a new socket resource. If you want to use it,
638
 * you need to fill in all the required fields of the structure and
639
 * call sk_open() to do the actual opening of the socket.
640
 *
641
 * The real function name is sock_new(), sk_new() is a macro wrapper
642
 * to avoid collision with OpenSSL.
643
 */
644
sock *
645
sock_new(pool *p)
646
{
647
  sock *s = ralloc(p, &sk_class);
648
  s->pool = p;
649
  // s->saddr = s->daddr = IPA_NONE;
650
  s->tos = s->priority = s->ttl = -1;
651
  s->fd = -1;
652
  return s;
653
}
654

    
655
static void
656
sk_insert(sock *s)
657
{
658
  add_tail(&sock_list, &s->n);
659
  sock_recalc_fdsets_p = 1;
660
}
661

    
662
#ifdef IPV6
663

    
664
void
665
fill_in_sockaddr(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, unsigned port)
666
{
667
  memset(sa, 0, sizeof (struct sockaddr_in6));
668
  sa->sin6_family = AF_INET6;
669
  sa->sin6_port = htons(port);
670
  sa->sin6_flowinfo = 0;
671
#ifdef HAVE_SIN_LEN
672
  sa->sin6_len = sizeof(struct sockaddr_in6);
673
#endif
674
  set_inaddr(&sa->sin6_addr, a);
675

    
676
  if (ifa && ipa_has_link_scope(a))
677
    sa->sin6_scope_id = ifa->index;
678
}
679

    
680
void
681
get_sockaddr(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, unsigned *port, int check)
682
{
683
  if (check && sa->sin6_family != AF_INET6)
684
    bug("get_sockaddr called for wrong address family (%d)", sa->sin6_family);
685
  if (port)
686
    *port = ntohs(sa->sin6_port);
687
  memcpy(a, &sa->sin6_addr, sizeof(*a));
688
  ipa_ntoh(*a);
689

    
690
  if (ifa && ipa_has_link_scope(*a))
691
    *ifa = if_find_by_index(sa->sin6_scope_id);
692
}
693

    
694
#else
695

    
696
void
697
fill_in_sockaddr(struct sockaddr_in *sa, ip_addr a, struct iface *ifa, unsigned port)
698
{
699
  memset (sa, 0, sizeof (struct sockaddr_in));
700
  sa->sin_family = AF_INET;
701
  sa->sin_port = htons(port);
702
#ifdef HAVE_SIN_LEN
703
  sa->sin_len = sizeof(struct sockaddr_in);
704
#endif
705
  set_inaddr(&sa->sin_addr, a);
706
}
707

    
708
void
709
get_sockaddr(struct sockaddr_in *sa, ip_addr *a, struct iface **ifa, unsigned *port, int check)
710
{
711
  if (check && sa->sin_family != AF_INET)
712
    bug("get_sockaddr called for wrong address family (%d)", sa->sin_family);
713
  if (port)
714
    *port = ntohs(sa->sin_port);
715
  memcpy(a, &sa->sin_addr.s_addr, sizeof(*a));
716
  ipa_ntoh(*a);
717
}
718

    
719
#endif
720

    
721

    
722
#ifdef IPV6
723

    
724
/* PKTINFO handling is also standardized in IPv6 */
/* Receive side: room for a packet-info message and an int-sized (hop limit) message */
#define CMSG_RX_SPACE (CMSG_SPACE(sizeof(struct in6_pktinfo)) + CMSG_SPACE(sizeof(int)))
/* Transmit side: room for a single packet-info message */
#define CMSG_TX_SPACE CMSG_SPACE(sizeof(struct in6_pktinfo))

/*
 * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
 * type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
 * don't have IPV6_RECVPKTINFO we suppose the OS implements the older
 * RFC and we use IPV6_PKTINFO.
 */
#if !defined(IPV6_RECVPKTINFO)
#define IPV6_RECVPKTINFO IPV6_PKTINFO
#endif

/*
 * Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
 */
#if !defined(IPV6_RECVHOPLIMIT)
#define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
#endif
743

    
744
static char *
745
sysio_register_cmsgs(sock *s)
746
{
747
  int ok = 1;
748

    
749
  if ((s->flags & SKF_LADDR_RX) &&
750
      (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &ok, sizeof(ok)) < 0))
751
    return "IPV6_RECVPKTINFO";
752

    
753
  if ((s->flags & SKF_TTL_RX) &&
754
      (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &ok, sizeof(ok)) < 0))
755
    return "IPV6_RECVHOPLIMIT";
756

    
757
  return NULL;
758
}
759

    
760
static void
761
sysio_process_rx_cmsgs(sock *s, struct msghdr *msg)
762
{
763
  struct cmsghdr *cm;
764
  struct in6_pktinfo *pi = NULL;
765
  int *hlim = NULL;
766

    
767
  for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
768
  {
769
    if (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_PKTINFO)
770
      pi = (struct in6_pktinfo *) CMSG_DATA(cm);
771

    
772
    if (cm->cmsg_level == SOL_IPV6 && cm->cmsg_type == IPV6_HOPLIMIT)
773
      hlim = (int *) CMSG_DATA(cm);
774
  }
775

    
776
  if (s->flags & SKF_LADDR_RX)
777
  {
778
    if (pi)
779
    {
780
      get_inaddr(&s->laddr, &pi->ipi6_addr);
781
      s->lifindex = pi->ipi6_ifindex;
782
    }
783
    else
784
    {
785
      s->laddr = IPA_NONE;
786
      s->lifindex = 0;
787
    }
788
  }
789

    
790
  if (s->flags & SKF_TTL_RX)
791
    s->ttl = hlim ? *hlim : -1;
792

    
793
  return;
794
}
795

    
796
static void
797
sysio_prepare_tx_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
798
{
799
  struct cmsghdr *cm;
800
  struct in6_pktinfo *pi;
801

    
802
  msg->msg_control = cbuf;
803
  msg->msg_controllen = cbuflen;
804

    
805
  cm = CMSG_FIRSTHDR(msg);
806
  cm->cmsg_level = SOL_IPV6;
807
  cm->cmsg_type = IPV6_PKTINFO;
808
  cm->cmsg_len = CMSG_LEN(sizeof(*pi));
809

    
810
  pi = (struct in6_pktinfo *) CMSG_DATA(cm);
811
  pi->ipi6_ifindex = s->iface ? s->iface->index : 0;
812
  set_inaddr(&pi->ipi6_addr, s->saddr);
813

    
814
  msg->msg_controllen = cm->cmsg_len;
815
}
816

    
817
#endif
818

    
819
static char *
820
sk_set_ttl_int(sock *s)
821
{
822
#ifdef IPV6
823
  if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &s->ttl, sizeof(s->ttl)) < 0)
824
    return "IPV6_UNICAST_HOPS";
825
#else
826
  if (setsockopt(s->fd, SOL_IP, IP_TTL, &s->ttl, sizeof(s->ttl)) < 0)
827
    return "IP_TTL";
828
#endif
829
  return NULL;
830
}
831

    
832
/* Record the name of the failed operation in the local variable err and jump to the local label bad */
#define ERR(x) do { err = x; goto bad; } while(0)
/* Log a non-fatal sk_setup() failure together with the errno description */
#define WARN(x) log(L_WARN "sk_setup: %s: %m", x)
834

    
835
static char *
836
sk_setup(sock *s)
837
{
838
  int one = 1;
839
  int fd = s->fd;
840
  char *err = NULL;
841

    
842
  if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
843
    ERR("fcntl(O_NONBLOCK)");
844
  if (s->type == SK_UNIX)
845
    return NULL;
846

    
847
  if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
848
    s->flags |= SKF_PKTINFO;
849

    
850
#ifdef CONFIG_USE_HDRINCL
851
  if ((s->type == SK_IP) && (s->flags & SKF_PKTINFO))
852
  {
853
    s->flags &= ~SKF_PKTINFO;
854
    s->flags |= SKF_HDRINCL;
855
    if (setsockopt(fd, SOL_IP, IP_HDRINCL, &one, sizeof(one)) < 0)
856
      ERR("IP_HDRINCL");
857
  }
858
#endif
859

    
860
  if (s->iface)
861
  {
862
#ifdef SO_BINDTODEVICE
863
    struct ifreq ifr;
864
    strcpy(ifr.ifr_name, s->iface->name);
865
    if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
866
      ERR("SO_BINDTODEVICE");
867
#endif
868

    
869
#ifdef CONFIG_UNIX_DONTROUTE
870
    if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &one, sizeof(one)) < 0)
871
      ERR("SO_DONTROUTE");
872
#endif
873
  }
874

    
875
  if ((s->ttl >= 0) && (err = sk_set_ttl_int(s)))
876
    goto bad;
877

    
878
  if (err = sysio_register_cmsgs(s))
879
    goto bad;
880

    
881

    
882
#ifdef IPV6
883
  if ((s->tos >= 0) && setsockopt(fd, SOL_IPV6, IPV6_TCLASS, &s->tos, sizeof(s->tos)) < 0)
884
    WARN("IPV6_TCLASS");
885
#else
886
  if ((s->tos >= 0) && setsockopt(fd, SOL_IP, IP_TOS, &s->tos, sizeof(s->tos)) < 0)
887
    WARN("IP_TOS");
888
#endif
889

    
890
  if (s->priority >= 0)
891
    sk_set_priority(s, s->priority);
892

    
893
#ifdef IPV6
894
  if ((s->flags & SKF_V6ONLY) && setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &one, sizeof(one)) < 0)
895
    WARN("IPV6_V6ONLY");
896
#endif
897

    
898
bad:
899
  return err;
900
}
901

    
902
/**
903
 * sk_set_ttl - set transmit TTL for given socket.
904
 * @s: socket
905
 * @ttl: TTL value
906
 *
907
 * Set TTL for already opened connections when TTL was not set before.
908
 * Useful for accepted connections when different ones should have 
909
 * different TTL.
910
 *
911
 * Result: 0 for success, -1 for an error.
912
 */
913

    
914
int
915
sk_set_ttl(sock *s, int ttl)
916
{
917
  char *err;
918

    
919
  s->ttl = ttl;
920
  if (err = sk_set_ttl_int(s))
921
    log(L_ERR "sk_set_ttl: %s: %m", err);
922

    
923
  return (err ? -1 : 0);
924
}
925

    
926
/**
927
 * sk_set_min_ttl - set minimal accepted TTL for given socket.
928
 * @s: socket
929
 * @ttl: TTL value
930
 *
931
 * Can be used in TTL security implementation
932
 *
933
 * Result: 0 for success, -1 for an error.
934
 */
935

    
936
int
937
sk_set_min_ttl(sock *s, int ttl)
938
{
939
  int err;
940
#ifdef IPV6
941
  err = sk_set_min_ttl6(s, ttl);
942
#else
943
  err = sk_set_min_ttl4(s, ttl);
944
#endif
945

    
946
  return err;
947
}
948

    
949
/**
950
 * sk_set_md5_auth - add / remove MD5 security association for given socket.
951
 * @s: socket
952
 * @a: IP address of the other side
953
 * @ifa: Interface for link-local IP address
954
 * @passwd: password used for MD5 authentication
955
 *
956
 * In TCP MD5 handling code in kernel, there is a set of pairs
957
 * (address, password) used to choose password according to
958
 * address of the other side. This function is useful for
959
 * listening socket, for active sockets it is enough to set
960
 * s->password field.
961
 *
962
 * When called with passwd != NULL, the new pair is added,
963
 * When called with passwd == NULL, the existing pair is removed.
964
 *
965
 * Result: 0 for success, -1 for an error.
966
 */
967

    
968
int
969
sk_set_md5_auth(sock *s, ip_addr a, struct iface *ifa, char *passwd)
970
{
971
  sockaddr sa;
972
  fill_in_sockaddr(&sa, a, ifa, 0);
973
  return sk_set_md5_auth_int(s, &sa, passwd);
974
}
975

    
976
int
977
sk_set_broadcast(sock *s, int enable)
978
{
979
  if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &enable, sizeof(enable)) < 0)
980
    {
981
      log(L_ERR "sk_set_broadcast: SO_BROADCAST: %m");
982
      return -1;
983
    }
984

    
985
  return 0;
986
}
987

    
988

    
989
#ifdef IPV6
990

    
991
int
992
sk_set_ipv6_checksum(sock *s, int offset)
993
{
994
  if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
995
    {
996
      log(L_ERR "sk_set_ipv6_checksum: IPV6_CHECKSUM: %m");
997
      return -1;
998
    }
999

    
1000
  return 0;
1001
}
1002

    
1003
int
1004
sk_set_icmp_filter(sock *s, int p1, int p2)
1005
{
1006
  /* a bit of lame interface, but it is here only for Radv */
1007
  struct icmp6_filter f;
1008

    
1009
  ICMP6_FILTER_SETBLOCKALL(&f);
1010
  ICMP6_FILTER_SETPASS(p1, &f);
1011
  ICMP6_FILTER_SETPASS(p2, &f);
1012

    
1013
  if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
1014
    {
1015
      log(L_ERR "sk_setup_icmp_filter: ICMP6_FILTER: %m");
1016
      return -1;
1017
    }
1018

    
1019
  return 0;
1020
}
1021

    
1022
int
1023
sk_setup_multicast(sock *s)
1024
{
1025
  char *err;
1026
  int zero = 0;
1027
  int index;
1028

    
1029
  ASSERT(s->iface);
1030

    
1031
  index = s->iface->index;
1032
  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &s->ttl, sizeof(s->ttl)) < 0)
1033
    ERR("IPV6_MULTICAST_HOPS");
1034
  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &zero, sizeof(zero)) < 0)
1035
    ERR("IPV6_MULTICAST_LOOP");
1036
  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
1037
    ERR("IPV6_MULTICAST_IF");
1038

    
1039
  return 0;
1040

    
1041
bad:
1042
  log(L_ERR "sk_setup_multicast: %s: %m", err);
1043
  return -1;
1044
}
1045

    
1046
/* With CONFIG_IPV6_GLIBC_20 the interface member of struct ipv6_mreq is named ipv6mr_ifindex */
#if defined(CONFIG_IPV6_GLIBC_20)
#define ipv6mr_interface ipv6mr_ifindex
#endif
1049

    
1050
int
1051
sk_join_group(sock *s, ip_addr maddr)
1052
{
1053
  struct ipv6_mreq mreq;
1054

    
1055
  set_inaddr(&mreq.ipv6mr_multiaddr, maddr);
1056
  mreq.ipv6mr_interface = s->iface->index;
1057

    
1058
  if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mreq, sizeof(mreq)) < 0)
1059
    {
1060
      log(L_ERR "sk_join_group: IPV6_JOIN_GROUP: %m");
1061
      return -1;
1062
    }
1063

    
1064
  return 0;
1065
}
1066

    
1067
int
1068
sk_leave_group(sock *s, ip_addr maddr)
1069
{
1070
  struct ipv6_mreq mreq;
1071
        
1072
  set_inaddr(&mreq.ipv6mr_multiaddr, maddr);
1073
  mreq.ipv6mr_interface = s->iface->index;
1074

    
1075
  if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mreq, sizeof(mreq)) < 0)
1076
    {
1077
      log(L_ERR "sk_leave_group: IPV6_LEAVE_GROUP: %m");
1078
      return -1;
1079
    }
1080

    
1081
  return 0;
1082
}
1083

    
1084
#else /* IPV4 */
1085

    
1086
int
1087
sk_setup_multicast(sock *s)
1088
{
1089
  char *err;
1090

    
1091
  ASSERT(s->iface);
1092

    
1093
  if (err = sysio_setup_multicast(s))
1094
    {
1095
      log(L_ERR "sk_setup_multicast: %s: %m", err);
1096
      return -1;
1097
    }
1098

    
1099
  return 0;
1100
}
1101

    
1102
int
1103
sk_join_group(sock *s, ip_addr maddr)
1104
{
1105
 char *err;
1106

    
1107
 if (err = sysio_join_group(s, maddr))
1108
    {
1109
      log(L_ERR "sk_join_group: %s: %m", err);
1110
      return -1;
1111
    }
1112

    
1113
  return 0;
1114
}
1115

    
1116
int
1117
sk_leave_group(sock *s, ip_addr maddr)
1118
{
1119
 char *err;
1120

    
1121
 if (err = sysio_leave_group(s, maddr))
1122
    {
1123
      log(L_ERR "sk_leave_group: %s: %m", err);
1124
      return -1;
1125
    }
1126

    
1127
  return 0;
1128
}
1129

    
1130
#endif 
1131

    
1132

    
1133
static void
1134
sk_tcp_connected(sock *s)
1135
{
1136
  sockaddr lsa;
1137
  int lsa_len = sizeof(lsa);
1138
  if (getsockname(s->fd, (struct sockaddr *) &lsa, &lsa_len) == 0)
1139
    get_sockaddr(&lsa, &s->saddr, &s->iface, &s->sport, 1);
1140

    
1141
  s->type = SK_TCP;
1142
  sk_alloc_bufs(s);
1143
  s->tx_hook(s);
1144
}
1145

    
1146
static int
1147
sk_passive_connected(sock *s, struct sockaddr *sa, int al, int type)
1148
{
1149
  int fd = accept(s->fd, sa, &al);
1150
  if (fd >= 0)
1151
    {
1152
      sock *t = sk_new(s->pool);
1153
      char *err;
1154
      t->type = type;
1155
      t->fd = fd;
1156
      t->ttl = s->ttl;
1157
      t->tos = s->tos;
1158
      t->rbsize = s->rbsize;
1159
      t->tbsize = s->tbsize;
1160
      if (type == SK_TCP)
1161
        {
1162
          sockaddr lsa;
1163
          int lsa_len = sizeof(lsa);
1164
          if (getsockname(fd, (struct sockaddr *) &lsa, &lsa_len) == 0)
1165
            get_sockaddr(&lsa, &t->saddr, &t->iface, &t->sport, 1);
1166

    
1167
          get_sockaddr((sockaddr *) sa, &t->daddr, &t->iface, &t->dport, 1);
1168
        }
1169
      sk_insert(t);
1170
      if (err = sk_setup(t))
1171
        {
1172
          log(L_ERR "Incoming connection: %s: %m", err);
1173
          rfree(t);
1174
          return 1;
1175
        }
1176
      sk_alloc_bufs(t);
1177
      s->rx_hook(t, 0);
1178
      return 1;
1179
    }
1180
  else if (errno != EINTR && errno != EAGAIN)
1181
    {
1182
      s->err_hook(s, errno);
1183
    }
1184
  return 0;
1185
}
1186

    
1187
/**
1188
 * sk_open - open a socket
1189
 * @s: socket
1190
 *
1191
 * This function takes a socket resource created by sk_new() and
1192
 * initialized by the user and binds a corresponding network connection
1193
 * to it.
1194
 *
1195
 * Result: 0 for success, -1 for an error.
1196
 */
1197
int
1198
sk_open(sock *s)
1199
{
1200
  int fd;
1201
  int one = 1;
1202
  int do_bind = 0;
1203
  int bind_port = 0;
1204
  ip_addr bind_addr = IPA_NONE;
1205
  sockaddr sa;
1206
  char *err;
1207

    
1208
  switch (s->type)
1209
    {
1210
    case SK_TCP_ACTIVE:
1211
      s->ttx = "";                        /* Force s->ttx != s->tpos */
1212
      /* Fall thru */
1213
    case SK_TCP_PASSIVE:
1214
      fd = socket(BIRD_PF, SOCK_STREAM, IPPROTO_TCP);
1215
      bind_port = s->sport;
1216
      bind_addr = s->saddr;
1217
      do_bind = bind_port || ipa_nonzero(bind_addr);
1218
      break;
1219

    
1220
    case SK_UDP:
1221
      fd = socket(BIRD_PF, SOCK_DGRAM, IPPROTO_UDP);
1222
      bind_port = s->sport;
1223
      bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1224
      do_bind = 1;
1225
      break;
1226

    
1227
    case SK_IP:
1228
      fd = socket(BIRD_PF, SOCK_RAW, s->dport);
1229
      bind_port = 0;
1230
      bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1231
      do_bind = ipa_nonzero(bind_addr);
1232
      break;
1233

    
1234
    case SK_MAGIC:
1235
      fd = s->fd;
1236
      break;
1237

    
1238
    default:
1239
      bug("sk_open() called for invalid sock type %d", s->type);
1240
    }
1241
  if (fd < 0)
1242
    die("sk_open: socket: %m");
1243
  s->fd = fd;
1244

    
1245
  if (err = sk_setup(s))
1246
    goto bad;
1247

    
1248
  if (do_bind)
1249
    {
1250
      if (bind_port)
1251
        {
1252
          if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0)
1253
            ERR("SO_REUSEADDR");
1254

    
1255
#ifdef CONFIG_NO_IFACE_BIND
1256
          /* Workaround missing ability to bind to an iface */
1257
          if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
1258
          {
1259
            if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one)) < 0)
1260
              ERR("SO_REUSEPORT");
1261
          }
1262
#endif
1263
        }
1264

    
1265
      fill_in_sockaddr(&sa, bind_addr, s->iface, bind_port);
1266
      if (bind(fd, (struct sockaddr *) &sa, sizeof(sa)) < 0)
1267
        ERR("bind");
1268
    }
1269

    
1270
  fill_in_sockaddr(&sa, s->daddr, s->iface, s->dport);
1271

    
1272
  if (s->password)
1273
    {
1274
      int rv = sk_set_md5_auth_int(s, &sa, s->password);
1275
      if (rv < 0)
1276
        goto bad_no_log;
1277
    }
1278

    
1279
  switch (s->type)
1280
    {
1281
    case SK_TCP_ACTIVE:
1282
      if (connect(fd, (struct sockaddr *) &sa, sizeof(sa)) >= 0)
1283
        sk_tcp_connected(s);
1284
      else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
1285
               errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
1286
        ERR("connect");
1287
      break;
1288
    case SK_TCP_PASSIVE:
1289
      if (listen(fd, 8))
1290
        ERR("listen");
1291
      break;
1292
    case SK_MAGIC:
1293
      break;
1294
    default:
1295
      sk_alloc_bufs(s);
1296
#ifdef IPV6
1297
#ifdef IPV6_MTU_DISCOVER
1298
      {
1299
        int dont = IPV6_PMTUDISC_DONT;
1300
        if (setsockopt(fd, SOL_IPV6, IPV6_MTU_DISCOVER, &dont, sizeof(dont)) < 0)
1301
          ERR("IPV6_MTU_DISCOVER");
1302
      }
1303
#endif
1304
#else
1305
#ifdef IP_PMTUDISC
1306
      {
1307
        int dont = IP_PMTUDISC_DONT;
1308
        if (setsockopt(fd, SOL_IP, IP_PMTUDISC, &dont, sizeof(dont)) < 0)
1309
          ERR("IP_PMTUDISC");
1310
      }
1311
#endif
1312
#endif
1313
    }
1314

    
1315
  if (!(s->flags & SKF_THREAD))
1316
    sk_insert(s);
1317
  return 0;
1318

    
1319
bad:
1320
  log(L_ERR "sk_open: %s: %m", err);
1321
bad_no_log:
1322
  close(fd);
1323
  s->fd = -1;
1324
  return -1;
1325
}
1326

    
1327
void
1328
sk_open_unix(sock *s, char *name)
1329
{
1330
  int fd;
1331
  struct sockaddr_un sa;
1332
  char *err;
1333

    
1334
  fd = socket(AF_UNIX, SOCK_STREAM, 0);
1335
  if (fd < 0)
1336
    ERR("socket");
1337
  s->fd = fd;
1338
  if (err = sk_setup(s))
1339
    goto bad;
1340
  unlink(name);
1341

    
1342
  /* Path length checked in test_old_bird() */
1343
  sa.sun_family = AF_UNIX;
1344
  strcpy(sa.sun_path, name);
1345
  if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
1346
    ERR("bind");
1347
  if (listen(fd, 8))
1348
    ERR("listen");
1349
  sk_insert(s);
1350
  return;
1351

    
1352
 bad:
1353
  log(L_ERR "sk_open_unix: %s: %m", err);
1354
  die("Unable to create control socket %s", name);
1355
}
1356

    
1357

    
1358
static inline int
1359
sk_sendmsg(sock *s)
1360
{
1361
  struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
1362
  byte cmsg_buf[CMSG_TX_SPACE];
1363
  sockaddr dst;
1364

    
1365
  fill_in_sockaddr(&dst, s->daddr, s->iface, s->dport);
1366

    
1367
  struct msghdr msg = {
1368
    .msg_name = &dst,
1369
    .msg_namelen = sizeof(dst),
1370
    .msg_iov = &iov,
1371
    .msg_iovlen = 1
1372
  };
1373

    
1374
#ifdef CONFIG_USE_HDRINCL
1375
  byte hdr[20];
1376
  struct iovec iov2[2] = { {hdr, 20}, iov };
1377

    
1378
  if (s->flags & SKF_HDRINCL)
1379
  {
1380
    fill_ip_header(s, hdr, iov.iov_len);
1381
    msg.msg_iov = iov2;
1382
    msg.msg_iovlen = 2;
1383
  }
1384
#endif
1385

    
1386
  if (s->flags & SKF_PKTINFO)
1387
    sysio_prepare_tx_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
1388

    
1389
  return sendmsg(s->fd, &msg, 0);
1390
}
1391

    
1392
static inline int
1393
sk_recvmsg(sock *s)
1394
{
1395
  struct iovec iov = {s->rbuf, s->rbsize};
1396
  byte cmsg_buf[CMSG_RX_SPACE];
1397
  sockaddr src;
1398

    
1399
  struct msghdr msg = {
1400
    .msg_name = &src,
1401
    .msg_namelen = sizeof(src),
1402
    .msg_iov = &iov,
1403
    .msg_iovlen = 1,
1404
    .msg_control = cmsg_buf,
1405
    .msg_controllen = sizeof(cmsg_buf),
1406
    .msg_flags = 0
1407
  };
1408

    
1409
  int rv = recvmsg(s->fd, &msg, 0);
1410
  if (rv < 0)
1411
    return rv;
1412

    
1413
  //ifdef IPV4
1414
  //  if (cf_type == SK_IP)
1415
  //    rv = ipv4_skip_header(pbuf, rv);
1416
  //endif
1417

    
1418
  get_sockaddr(&src, &s->faddr, NULL, &s->fport, 1);
1419
  sysio_process_rx_cmsgs(s, &msg);
1420

    
1421
  if (msg.msg_flags & MSG_TRUNC)
1422
    s->flags |= SKF_TRUNCATED;
1423
  else
1424
    s->flags &= ~SKF_TRUNCATED;
1425

    
1426
  return rv;
1427
}
1428

    
1429

    
1430
/* Mark the TX buffer of @s as empty: both ttx and tpos point back to its start */
static inline void
reset_tx_buffer(sock *s)
{
  s->tpos = s->tbuf;
  s->ttx = s->tbuf;
}
1431

    
1432
static int
1433
sk_maybe_write(sock *s)
1434
{
1435
  int e;
1436

    
1437
  switch (s->type)
1438
    {
1439
    case SK_TCP:
1440
    case SK_MAGIC:
1441
    case SK_UNIX:
1442
      while (s->ttx != s->tpos)
1443
        {
1444
          e = write(s->fd, s->ttx, s->tpos - s->ttx);
1445
          if (e < 0)
1446
            {
1447
              if (errno != EINTR && errno != EAGAIN)
1448
                {
1449
                  reset_tx_buffer(s);
1450
                  /* EPIPE is just a connection close notification during TX */
1451
                  s->err_hook(s, (errno != EPIPE) ? errno : 0);
1452
                  return -1;
1453
                }
1454
              return 0;
1455
            }
1456
          s->ttx += e;
1457
        }
1458
      reset_tx_buffer(s);
1459
      return 1;
1460
    case SK_UDP:
1461
    case SK_IP:
1462
      {
1463
        if (s->tbuf == s->tpos)
1464
          return 1;
1465

    
1466
        e = sk_sendmsg(s);
1467

    
1468
        if (e < 0)
1469
          {
1470
            if (errno != EINTR && errno != EAGAIN)
1471
              {
1472
                reset_tx_buffer(s);
1473
                s->err_hook(s, errno);
1474
                return -1;
1475
              }
1476

    
1477
            if (!s->tx_hook)
1478
              reset_tx_buffer(s);
1479
            return 0;
1480
          }
1481
        reset_tx_buffer(s);
1482
        return 1;
1483
      }
1484
    default:
1485
      bug("sk_maybe_write: unknown socket type %d", s->type);
1486
    }
1487
}
1488

    
1489
int
1490
sk_rx_ready(sock *s)
1491
{
1492
  fd_set rd, wr;
1493
  struct timeval timo;
1494
  int rv;
1495

    
1496
  FD_ZERO(&rd);
1497
  FD_ZERO(&wr);
1498
  FD_SET(s->fd, &rd);
1499

    
1500
  timo.tv_sec = 0;
1501
  timo.tv_usec = 0;
1502

    
1503
 redo:
1504
  rv = select(s->fd+1, &rd, &wr, NULL, &timo);
1505
  
1506
  if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
1507
    goto redo;
1508

    
1509
  return rv;
1510
}
1511

    
1512
/**
1513
 * sk_send - send data to a socket
1514
 * @s: socket
1515
 * @len: number of bytes to send
1516
 *
1517
 * This function sends @len bytes of data prepared in the
1518
 * transmit buffer of the socket @s to the network connection.
1519
 * If the packet can be sent immediately, it does so and returns
1520
 * 1, else it queues the packet for later processing, returns 0
1521
 * and calls the @tx_hook of the socket when the tranmission
1522
 * takes place.
1523
 */
1524
int
1525
sk_send(sock *s, unsigned len)
1526
{
1527
  s->ttx = s->tbuf;
1528
  s->tpos = s->tbuf + len;
1529
  return sk_maybe_write(s);
1530
}
1531

    
1532
/**
1533
 * sk_send_to - send data to a specific destination
1534
 * @s: socket
1535
 * @len: number of bytes to send
1536
 * @addr: IP address to send the packet to
1537
 * @port: port to send the packet to
1538
 *
1539
 * This is a sk_send() replacement for connection-less packet sockets
1540
 * which allows destination of the packet to be chosen dynamically.
1541
 * Raw IP sockets should use 0 for @port.
1542
 */
1543
int
1544
sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
1545
{
1546
  s->daddr = addr;
1547
  if (port)
1548
    s->dport = port;
1549

    
1550
  s->ttx = s->tbuf;
1551
  s->tpos = s->tbuf + len;
1552
  return sk_maybe_write(s);
1553
}
1554

    
1555
/*
1556
int
1557
sk_send_full(sock *s, unsigned len, struct iface *ifa,
1558
             ip_addr saddr, ip_addr daddr, unsigned dport)
1559
{
1560
  s->iface = ifa;
1561
  s->saddr = saddr;
1562
  s->daddr = daddr;
1563
  s->dport = dport;
1564
  s->ttx = s->tbuf;
1565
  s->tpos = s->tbuf + len;
1566
  return sk_maybe_write(s);
1567
}
1568
*/
1569

    
1570
 /* sk_read() and sk_write() are called from BFD's event loop */
1571

    
1572
int
1573
sk_read(sock *s)
1574
{
1575
  switch (s->type)
1576
    {
1577
    case SK_TCP_PASSIVE:
1578
      {
1579
        sockaddr sa;
1580
        return sk_passive_connected(s, (struct sockaddr *) &sa, sizeof(sa), SK_TCP);
1581
      }
1582
    case SK_UNIX_PASSIVE:
1583
      {
1584
        struct sockaddr_un sa;
1585
        return sk_passive_connected(s, (struct sockaddr *) &sa, sizeof(sa), SK_UNIX);
1586
      }
1587
    case SK_TCP:
1588
    case SK_UNIX:
1589
      {
1590
        int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);
1591

    
1592
        if (c < 0)
1593
          {
1594
            if (errno != EINTR && errno != EAGAIN)
1595
              s->err_hook(s, errno);
1596
          }
1597
        else if (!c)
1598
          s->err_hook(s, 0);
1599
        else
1600
          {
1601
            s->rpos += c;
1602
            if (s->rx_hook(s, s->rpos - s->rbuf))
1603
              {
1604
                /* We need to be careful since the socket could have been deleted by the hook */
1605
                if (current_sock == s)
1606
                  s->rpos = s->rbuf;
1607
              }
1608
            return 1;
1609
          }
1610
        return 0;
1611
      }
1612
    case SK_MAGIC:
1613
      return s->rx_hook(s, 0);
1614
    default:
1615
      {
1616
        int e;
1617

    
1618
        e = sk_recvmsg(s);
1619

    
1620
        if (e < 0)
1621
          {
1622
            if (errno != EINTR && errno != EAGAIN)
1623
              s->err_hook(s, errno);
1624
            return 0;
1625
          }
1626

    
1627
        s->rpos = s->rbuf + e;
1628
        s->rx_hook(s, e);
1629
        return 1;
1630
      }
1631
    }
1632
}
1633

    
1634
int
1635
sk_write(sock *s)
1636
{
1637
  switch (s->type)
1638
    {
1639
    case SK_TCP_ACTIVE:
1640
      {
1641
        sockaddr sa;
1642
        fill_in_sockaddr(&sa, s->daddr, s->iface, s->dport);
1643
        if (connect(s->fd, (struct sockaddr *) &sa, sizeof(sa)) >= 0 || errno == EISCONN)
1644
          sk_tcp_connected(s);
1645
        else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
1646
          s->err_hook(s, errno);
1647
        return 0;
1648
      }
1649
    default:
1650
      if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
1651
        {
1652
          if (s->tx_hook)
1653
            s->tx_hook(s);
1654
          return 1;
1655
        }
1656
      return 0;
1657
    }
1658
}
1659

    
1660
void
1661
sk_dump_all(void)
1662
{
1663
  node *n;
1664
  sock *s;
1665

    
1666
  debug("Open sockets:\n");
1667
  WALK_LIST(n, sock_list)
1668
    {
1669
      s = SKIP_BACK(sock, n, n);
1670
      debug("%p ", s);
1671
      sk_dump(&s->r);
1672
    }
1673
  debug("\n");
1674
}
1675

    
1676
#undef ERR
1677
#undef WARN
1678

    
1679
/*
1680
 *        Main I/O Loop
1681
 */
1682

    
1683
/* Asynchronous reconfiguration/dump scheduled; both are polled by io_loop() */
volatile int async_config_flag;		/* When set, io_loop() calls async_config() */
volatile int async_dump_flag;		/* When set, io_loop() calls async_dump() */
1685

    
1686
void
1687
io_init(void)
1688
{
1689
  init_list(&near_timers);
1690
  init_list(&far_timers);
1691
  init_list(&sock_list);
1692
  init_list(&global_event_list);
1693
  krt_io_init();
1694
  init_times();
1695
  update_times();
1696
  boot_time = now;
1697
  srandom((int) now_real);
1698
}
1699

    
1700
static int short_loops = 0;
1701
#define SHORT_LOOP_MAX 10
1702

    
1703
void
1704
io_loop(void)
1705
{
1706
  fd_set rd, wr;
1707
  struct timeval timo;
1708
  time_t tout;
1709
  int hi, events;
1710
  sock *s;
1711
  node *n;
1712

    
1713
  sock_recalc_fdsets_p = 1;
1714
  for(;;)
1715
    {
1716
      events = ev_run_list(&global_event_list);
1717
      update_times();
1718
      tout = tm_first_shot();
1719
      if (tout <= now)
1720
        {
1721
          tm_shot();
1722
          continue;
1723
        }
1724
      timo.tv_sec = events ? 0 : MIN(tout - now, 3);
1725
      timo.tv_usec = 0;
1726

    
1727
      if (sock_recalc_fdsets_p)
1728
        {
1729
          sock_recalc_fdsets_p = 0;
1730
          FD_ZERO(&rd);
1731
          FD_ZERO(&wr);
1732
        }
1733

    
1734
      hi = 0;
1735
      WALK_LIST(n, sock_list)
1736
        {
1737
          s = SKIP_BACK(sock, n, n);
1738
          if (s->rx_hook)
1739
            {
1740
              FD_SET(s->fd, &rd);
1741
              if (s->fd > hi)
1742
                hi = s->fd;
1743
            }
1744
          else
1745
            FD_CLR(s->fd, &rd);
1746
          if (s->tx_hook && s->ttx != s->tpos)
1747
            {
1748
              FD_SET(s->fd, &wr);
1749
              if (s->fd > hi)
1750
                hi = s->fd;
1751
            }
1752
          else
1753
            FD_CLR(s->fd, &wr);
1754
        }
1755

    
1756
      /*
1757
       * Yes, this is racy. But even if the signal comes before this test
1758
       * and entering select(), it gets caught on the next timer tick.
1759
       */
1760

    
1761
      if (async_config_flag)
1762
        {
1763
          async_config();
1764
          async_config_flag = 0;
1765
          continue;
1766
        }
1767
      if (async_dump_flag)
1768
        {
1769
          async_dump();
1770
          async_dump_flag = 0;
1771
          continue;
1772
        }
1773
      if (async_shutdown_flag)
1774
        {
1775
          async_shutdown();
1776
          async_shutdown_flag = 0;
1777
          continue;
1778
        }
1779

    
1780
      /* And finally enter select() to find active sockets */
1781
      hi = select(hi+1, &rd, &wr, NULL, &timo);
1782

    
1783
      if (hi < 0)
1784
        {
1785
          if (errno == EINTR || errno == EAGAIN)
1786
            continue;
1787
          die("select: %m");
1788
        }
1789
      if (hi)
1790
        {
1791
          /* guaranteed to be non-empty */
1792
          current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
1793

    
1794
          while (current_sock)
1795
            {
1796
              sock *s = current_sock;
1797
              int e;
1798
              int steps;
1799

    
1800
              steps = MAX_STEPS;
1801
              if ((s->type >= SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook)
1802
                do
1803
                  {
1804
                    steps--;
1805
                    e = sk_read(s);
1806
                    if (s != current_sock)
1807
                      goto next;
1808
                  }
1809
                while (e && s->rx_hook && steps);
1810

    
1811
              steps = MAX_STEPS;
1812
              if (FD_ISSET(s->fd, &wr))
1813
                do
1814
                  {
1815
                    steps--;
1816
                    e = sk_write(s);
1817
                    if (s != current_sock)
1818
                      goto next;
1819
                  }
1820
                while (e && steps);
1821
              current_sock = sk_next(s);
1822
            next: ;
1823
            }
1824

    
1825
          short_loops++;
1826
          if (events && (short_loops < SHORT_LOOP_MAX))
1827
            continue;
1828
          short_loops = 0;
1829

    
1830
          int count = 0;
1831
          current_sock = stored_sock;
1832
          if (current_sock == NULL)
1833
            current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
1834

    
1835
          while (current_sock && count < MAX_RX_STEPS)
1836
            {
1837
              sock *s = current_sock;
1838
              int e;
1839

    
1840
              if ((s->type < SK_MAGIC) && FD_ISSET(s->fd, &rd) && s->rx_hook)
1841
                {
1842
                  count++;
1843
                  e = sk_read(s);
1844
                  if (s != current_sock)
1845
                      goto next2;
1846
                }
1847
              current_sock = sk_next(s);
1848
            next2: ;
1849
            }
1850

    
1851
          stored_sock = current_sock;
1852
        }
1853
    }
1854
}
1855

    
1856
/*
 * test_old_bird - refuse to start when another instance is running
 * @path: path of the control socket
 *
 * Tries to connect to the UNIX-domain socket at @path; a successful
 * connection is fatal. Also dies when the probe socket cannot be created
 * or when @path does not fit into sun_path.
 */
void
test_old_bird(char *path)
{
  struct sockaddr_un sa;
  int fd = socket(AF_UNIX, SOCK_STREAM, 0);

  if (fd < 0)
    die("Cannot create socket: %m");
  if (strlen(path) >= sizeof(sa.sun_path))
    die("Socket path too long");

  bzero(&sa, sizeof(sa));
  sa.sun_family = AF_UNIX;
  strcpy(sa.sun_path, path);

  /* Somebody accepting connections on the socket is taken as a running BIRD */
  if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
    die("I found another BIRD running.");

  close(fd);
}
1874

    
1875