Statistics
| Branch: | Revision:

iof-bird-daemon / sysdep / unix / io.c @ e0835db4

History | View | Annotate | Download (48.4 KB)

1
/*
2
 *        BIRD Internet Routing Daemon -- Unix I/O
3
 *
4
 *        (c) 1998--2004 Martin Mares <mj@ucw.cz>
5
 *      (c) 2004       Ondrej Filip <feela@network.cz>
6
 *
7
 *        Can be freely distributed and used under the terms of the GNU GPL.
8
 */
9

    
10
/* Unfortunately, some glibc versions hide parts of RFC 3542 API
11
   if _GNU_SOURCE is not defined. */
12
#ifndef _GNU_SOURCE
13
#define _GNU_SOURCE
14
#endif
15

    
16
#include <stdio.h>
17
#include <stdlib.h>
18
#include <time.h>
19
#include <sys/time.h>
20
#include <sys/types.h>
21
#include <sys/socket.h>
22
#include <sys/uio.h>
23
#include <sys/un.h>
24
#include <poll.h>
25
#include <unistd.h>
26
#include <fcntl.h>
27
#include <errno.h>
28
#include <net/if.h>
29
#include <netinet/in.h>
30
#include <netinet/tcp.h>
31
#include <netinet/udp.h>
32
#include <netinet/icmp6.h>
33

    
34
#include "nest/bird.h"
35
#include "lib/lists.h"
36
#include "lib/resource.h"
37
#include "lib/socket.h"
38
#include "lib/event.h"
39
#include "lib/timer.h"
40
#include "lib/string.h"
41
#include "nest/iface.h"
42
#include "conf/conf.h"
43

    
44
#include "sysdep/unix/unix.h"
45
#include CONFIG_INCLUDE_SYSIO_H
46

    
47
/* Maximum number of calls of tx handler for one socket in one
48
 * poll iteration. Should be small enough to not monopolize CPU by
49
 * one protocol instance.
50
 */
51
#define MAX_STEPS 4
52

    
53
/* Maximum number of calls of rx handler for all sockets in one poll
54
   iteration. RX callbacks are often much more costly so we limit
55
   this to gen small latencies */
56
#define MAX_RX_STEPS 4
57

    
58

    
59
/*
60
 *        Tracked Files
61
 */
62

    
63
struct rfile {
64
  resource r;
65
  FILE *f;
66
};
67

    
68
static void
69
rf_free(resource *r)
70
{
71
  struct rfile *a = (struct rfile *) r;
72

    
73
  fclose(a->f);
74
}
75

    
76
static void
77
rf_dump(resource *r)
78
{
79
  struct rfile *a = (struct rfile *) r;
80

    
81
  debug("(FILE *%p)\n", a->f);
82
}
83

    
84
static struct resclass rf_class = {
85
  "FILE",
86
  sizeof(struct rfile),
87
  rf_free,
88
  rf_dump,
89
  NULL,
90
  NULL
91
};
92

    
93
struct rfile *
94
rf_open(pool *p, char *name, char *mode)
95
{
96
  FILE *f = fopen(name, mode);
97

    
98
  if (!f)
99
    return NULL;
100

    
101
  struct rfile *r = ralloc(p, &rf_class);
102
  r->f = f;
103
  return r;
104
}
105

    
106
void *
107
rf_file(struct rfile *f)
108
{
109
  return f->f;
110
}
111

    
112
int
113
rf_fileno(struct rfile *f)
114
{
115
  return fileno(f->f);
116
}
117

    
118

    
119
/*
120
 *        Time clock
121
 */
122

    
123
btime boot_time;
124

    
125
void
126
times_init(struct timeloop *loop)
127
{
128
  struct timespec ts;
129
  int rv;
130

    
131
  rv = clock_gettime(CLOCK_MONOTONIC, &ts);
132
  if (rv < 0)
133
    die("Monotonic clock is missing");
134

    
135
  if ((ts.tv_sec < 0) || (((s64) ts.tv_sec) > ((s64) 1 << 40)))
136
    log(L_WARN "Monotonic clock is crazy");
137

    
138
  loop->last_time = ts.tv_sec S + ts.tv_nsec NS;
139
  loop->real_time = 0;
140
}
141

    
142
void
143
times_update(struct timeloop *loop)
144
{
145
  struct timespec ts;
146
  int rv;
147

    
148
  rv = clock_gettime(CLOCK_MONOTONIC, &ts);
149
  if (rv < 0)
150
    die("clock_gettime: %m");
151

    
152
  btime new_time = ts.tv_sec S + ts.tv_nsec NS;
153

    
154
  if (new_time < loop->last_time)
155
    log(L_ERR "Monotonic clock is broken");
156

    
157
  loop->last_time = new_time;
158
  loop->real_time = 0;
159
}
160

    
161
void
162
times_update_real_time(struct timeloop *loop)
163
{
164
  struct timespec ts;
165
  int rv;
166

    
167
  rv = clock_gettime(CLOCK_REALTIME, &ts);
168
  if (rv < 0)
169
    die("clock_gettime: %m");
170

    
171
  loop->real_time = ts.tv_sec S + ts.tv_nsec NS;
172
}
173

    
174

    
175
/**
176
 * DOC: Sockets
177
 *
178
 * Socket resources represent network connections. Their data structure (&socket)
179
 * contains a lot of fields defining the exact type of the socket, the local and
180
 * remote addresses and ports, pointers to socket buffers and finally pointers to
181
 * hook functions to be called when new data have arrived to the receive buffer
182
 * (@rx_hook), when the contents of the transmit buffer have been transmitted
183
 * (@tx_hook) and when an error or connection close occurs (@err_hook).
184
 *
185
 * Freeing of sockets from inside socket hooks is perfectly safe.
186
 */
187

    
188
#ifndef SOL_IP
189
#define SOL_IP IPPROTO_IP
190
#endif
191

    
192
#ifndef SOL_IPV6
193
#define SOL_IPV6 IPPROTO_IPV6
194
#endif
195

    
196
#ifndef SOL_ICMPV6
197
#define SOL_ICMPV6 IPPROTO_ICMPV6
198
#endif
199

    
200

    
201
/*
202
 *        Sockaddr helper functions
203
 */
204

    
205
static inline int UNUSED sockaddr_length(int af)
206
{ return (af == AF_INET) ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); }
207

    
208
static inline void
209
sockaddr_fill4(struct sockaddr_in *sa, ip_addr a, uint port)
210
{
211
  memset(sa, 0, sizeof(struct sockaddr_in));
212
#ifdef HAVE_STRUCT_SOCKADDR_SA_LEN
213
  sa->sin_len = sizeof(struct sockaddr_in);
214
#endif
215
  sa->sin_family = AF_INET;
216
  sa->sin_port = htons(port);
217
  sa->sin_addr = ipa_to_in4(a);
218
}
219

    
220
static inline void
221
sockaddr_fill6(struct sockaddr_in6 *sa, ip_addr a, struct iface *ifa, uint port)
222
{
223
  memset(sa, 0, sizeof(struct sockaddr_in6));
224
#ifdef SIN6_LEN
225
  sa->sin6_len = sizeof(struct sockaddr_in6);
226
#endif
227
  sa->sin6_family = AF_INET6;
228
  sa->sin6_port = htons(port);
229
  sa->sin6_flowinfo = 0;
230
  sa->sin6_addr = ipa_to_in6(a);
231

    
232
  if (ifa && ipa_is_link_local(a))
233
    sa->sin6_scope_id = ifa->index;
234
}
235

    
236
void
237
sockaddr_fill(sockaddr *sa, int af, ip_addr a, struct iface *ifa, uint port)
238
{
239
  if (af == AF_INET)
240
    sockaddr_fill4((struct sockaddr_in *) sa, a, port);
241
  else if (af == AF_INET6)
242
    sockaddr_fill6((struct sockaddr_in6 *) sa, a, ifa, port);
243
  else
244
    bug("Unknown AF");
245
}
246

    
247
static inline void
248
sockaddr_read4(struct sockaddr_in *sa, ip_addr *a, uint *port)
249
{
250
  *port = ntohs(sa->sin_port);
251
  *a = ipa_from_in4(sa->sin_addr);
252
}
253

    
254
static inline void
255
sockaddr_read6(struct sockaddr_in6 *sa, ip_addr *a, struct iface **ifa, uint *port)
256
{
257
  *port = ntohs(sa->sin6_port);
258
  *a = ipa_from_in6(sa->sin6_addr);
259

    
260
  if (ifa && ipa_is_link_local(*a))
261
    *ifa = if_find_by_index(sa->sin6_scope_id);
262
}
263

    
264
int
265
sockaddr_read(sockaddr *sa, int af, ip_addr *a, struct iface **ifa, uint *port)
266
{
267
  if (sa->sa.sa_family != af)
268
    goto fail;
269

    
270
  if (af == AF_INET)
271
    sockaddr_read4((struct sockaddr_in *) sa, a, port);
272
  else if (af == AF_INET6)
273
    sockaddr_read6((struct sockaddr_in6 *) sa, a, ifa, port);
274
  else
275
    goto fail;
276

    
277
  return 0;
278

    
279
 fail:
280
  *a = IPA_NONE;
281
  *port = 0;
282
  return -1;
283
}
284

    
285

    
286
/*
287
 *        IPv6 multicast syscalls
288
 */
289

    
290
/* Fortunately standardized in RFC 3493 */
291

    
292
#define INIT_MREQ6(maddr,ifa) \
293
  { .ipv6mr_multiaddr = ipa_to_in6(maddr), .ipv6mr_interface = ifa->index }
294

    
295
static inline int
296
sk_setup_multicast6(sock *s)
297
{
298
  int index = s->iface->index;
299
  int ttl = s->ttl;
300
  int n = 0;
301

    
302
  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_IF, &index, sizeof(index)) < 0)
303
    ERR("IPV6_MULTICAST_IF");
304

    
305
  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_HOPS, &ttl, sizeof(ttl)) < 0)
306
    ERR("IPV6_MULTICAST_HOPS");
307

    
308
  if (setsockopt(s->fd, SOL_IPV6, IPV6_MULTICAST_LOOP, &n, sizeof(n)) < 0)
309
    ERR("IPV6_MULTICAST_LOOP");
310

    
311
  return 0;
312
}
313

    
314
static inline int
315
sk_join_group6(sock *s, ip_addr maddr)
316
{
317
  struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
318

    
319
  if (setsockopt(s->fd, SOL_IPV6, IPV6_JOIN_GROUP, &mr, sizeof(mr)) < 0)
320
    ERR("IPV6_JOIN_GROUP");
321

    
322
  return 0;
323
}
324

    
325
static inline int
326
sk_leave_group6(sock *s, ip_addr maddr)
327
{
328
  struct ipv6_mreq mr = INIT_MREQ6(maddr, s->iface);
329

    
330
  if (setsockopt(s->fd, SOL_IPV6, IPV6_LEAVE_GROUP, &mr, sizeof(mr)) < 0)
331
    ERR("IPV6_LEAVE_GROUP");
332

    
333
  return 0;
334
}
335

    
336

    
337
/*
338
 *        IPv6 packet control messages
339
 */
340

    
341
/* Also standardized, in RFC 3542 */
342

    
343
/*
344
 * RFC 2292 uses IPV6_PKTINFO for both the socket option and the cmsg
345
 * type, RFC 3542 changed the socket option to IPV6_RECVPKTINFO. If we
346
 * don't have IPV6_RECVPKTINFO we suppose the OS implements the older
347
 * RFC and we use IPV6_PKTINFO.
348
 */
349
#ifndef IPV6_RECVPKTINFO
350
#define IPV6_RECVPKTINFO IPV6_PKTINFO
351
#endif
352
/*
353
 * Same goes for IPV6_HOPLIMIT -> IPV6_RECVHOPLIMIT.
354
 */
355
#ifndef IPV6_RECVHOPLIMIT
356
#define IPV6_RECVHOPLIMIT IPV6_HOPLIMIT
357
#endif
358

    
359

    
360
#define CMSG6_SPACE_PKTINFO CMSG_SPACE(sizeof(struct in6_pktinfo))
361
#define CMSG6_SPACE_TTL CMSG_SPACE(sizeof(int))
362

    
363
static inline int
364
sk_request_cmsg6_pktinfo(sock *s)
365
{
366
  int y = 1;
367

    
368
  if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVPKTINFO, &y, sizeof(y)) < 0)
369
    ERR("IPV6_RECVPKTINFO");
370

    
371
  return 0;
372
}
373

    
374
static inline int
375
sk_request_cmsg6_ttl(sock *s)
376
{
377
  int y = 1;
378

    
379
  if (setsockopt(s->fd, SOL_IPV6, IPV6_RECVHOPLIMIT, &y, sizeof(y)) < 0)
380
    ERR("IPV6_RECVHOPLIMIT");
381

    
382
  return 0;
383
}
384

    
385
static inline void
386
sk_process_cmsg6_pktinfo(sock *s, struct cmsghdr *cm)
387
{
388
  if (cm->cmsg_type == IPV6_PKTINFO)
389
  {
390
    struct in6_pktinfo *pi = (struct in6_pktinfo *) CMSG_DATA(cm);
391
    s->laddr = ipa_from_in6(pi->ipi6_addr);
392
    s->lifindex = pi->ipi6_ifindex;
393
  }
394
}
395

    
396
static inline void
397
sk_process_cmsg6_ttl(sock *s, struct cmsghdr *cm)
398
{
399
  if (cm->cmsg_type == IPV6_HOPLIMIT)
400
    s->rcv_ttl = * (int *) CMSG_DATA(cm);
401
}
402

    
403
static inline void
404
sk_prepare_cmsgs6(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
405
{
406
  struct cmsghdr *cm;
407
  struct in6_pktinfo *pi;
408
  int controllen = 0;
409

    
410
  msg->msg_control = cbuf;
411
  msg->msg_controllen = cbuflen;
412

    
413
  cm = CMSG_FIRSTHDR(msg);
414
  cm->cmsg_level = SOL_IPV6;
415
  cm->cmsg_type = IPV6_PKTINFO;
416
  cm->cmsg_len = CMSG_LEN(sizeof(*pi));
417
  controllen += CMSG_SPACE(sizeof(*pi));
418

    
419
  pi = (struct in6_pktinfo *) CMSG_DATA(cm);
420
  pi->ipi6_ifindex = s->iface ? s->iface->index : 0;
421
  pi->ipi6_addr = ipa_to_in6(s->saddr);
422

    
423
  msg->msg_controllen = controllen;
424
}
425

    
426

    
427
/*
428
 *        Miscellaneous socket syscalls
429
 */
430

    
431
static inline int
432
sk_set_ttl4(sock *s, int ttl)
433
{
434
  if (setsockopt(s->fd, SOL_IP, IP_TTL, &ttl, sizeof(ttl)) < 0)
435
    ERR("IP_TTL");
436

    
437
  return 0;
438
}
439

    
440
static inline int
441
sk_set_ttl6(sock *s, int ttl)
442
{
443
  if (setsockopt(s->fd, SOL_IPV6, IPV6_UNICAST_HOPS, &ttl, sizeof(ttl)) < 0)
444
    ERR("IPV6_UNICAST_HOPS");
445

    
446
  return 0;
447
}
448

    
449
static inline int
450
sk_set_tos4(sock *s, int tos)
451
{
452
  if (setsockopt(s->fd, SOL_IP, IP_TOS, &tos, sizeof(tos)) < 0)
453
    ERR("IP_TOS");
454

    
455
  return 0;
456
}
457

    
458
static inline int
459
sk_set_tos6(sock *s, int tos)
460
{
461
  if (setsockopt(s->fd, SOL_IPV6, IPV6_TCLASS, &tos, sizeof(tos)) < 0)
462
    ERR("IPV6_TCLASS");
463

    
464
  return 0;
465
}
466

    
467
static inline int
468
sk_set_high_port(sock *s UNUSED)
469
{
470
  /* Port range setting is optional, ignore it if not supported */
471

    
472
#ifdef IP_PORTRANGE
473
  if (sk_is_ipv4(s))
474
  {
475
    int range = IP_PORTRANGE_HIGH;
476
    if (setsockopt(s->fd, SOL_IP, IP_PORTRANGE, &range, sizeof(range)) < 0)
477
      ERR("IP_PORTRANGE");
478
  }
479
#endif
480

    
481
#ifdef IPV6_PORTRANGE
482
  if (sk_is_ipv6(s))
483
  {
484
    int range = IPV6_PORTRANGE_HIGH;
485
    if (setsockopt(s->fd, SOL_IPV6, IPV6_PORTRANGE, &range, sizeof(range)) < 0)
486
      ERR("IPV6_PORTRANGE");
487
  }
488
#endif
489

    
490
  return 0;
491
}
492

    
493
static inline byte *
494
sk_skip_ip_header(byte *pkt, int *len)
495
{
496
  if ((*len < 20) || ((*pkt & 0xf0) != 0x40))
497
    return NULL;
498

    
499
  int hlen = (*pkt & 0x0f) * 4;
500
  if ((hlen < 20) || (hlen > *len))
501
    return NULL;
502

    
503
  *len -= hlen;
504
  return pkt + hlen;
505
}
506

    
507
byte *
508
sk_rx_buffer(sock *s, int *len)
509
{
510
  if (sk_is_ipv4(s) && (s->type == SK_IP))
511
    return sk_skip_ip_header(s->rbuf, len);
512
  else
513
    return s->rbuf;
514
}
515

    
516

    
517
/*
518
 *        Public socket functions
519
 */
520

    
521
/**
522
 * sk_setup_multicast - enable multicast for given socket
523
 * @s: socket
524
 *
525
 * Prepare transmission of multicast packets for given datagram socket.
526
 * The socket must have defined @iface.
527
 *
528
 * Result: 0 for success, -1 for an error.
529
 */
530

    
531
int
532
sk_setup_multicast(sock *s)
533
{
534
  ASSERT(s->iface);
535

    
536
  if (sk_is_ipv4(s))
537
    return sk_setup_multicast4(s);
538
  else
539
    return sk_setup_multicast6(s);
540
}
541

    
542
/**
543
 * sk_join_group - join multicast group for given socket
544
 * @s: socket
545
 * @maddr: multicast address
546
 *
547
 * Join multicast group for given datagram socket and associated interface.
548
 * The socket must have defined @iface.
549
 *
550
 * Result: 0 for success, -1 for an error.
551
 */
552

    
553
int
554
sk_join_group(sock *s, ip_addr maddr)
555
{
556
  if (sk_is_ipv4(s))
557
    return sk_join_group4(s, maddr);
558
  else
559
    return sk_join_group6(s, maddr);
560
}
561

    
562
/**
563
 * sk_leave_group - leave multicast group for given socket
564
 * @s: socket
565
 * @maddr: multicast address
566
 *
567
 * Leave multicast group for given datagram socket and associated interface.
568
 * The socket must have defined @iface.
569
 *
570
 * Result: 0 for success, -1 for an error.
571
 */
572

    
573
int
574
sk_leave_group(sock *s, ip_addr maddr)
575
{
576
  if (sk_is_ipv4(s))
577
    return sk_leave_group4(s, maddr);
578
  else
579
    return sk_leave_group6(s, maddr);
580
}
581

    
582
/**
583
 * sk_setup_broadcast - enable broadcast for given socket
584
 * @s: socket
585
 *
586
 * Allow reception and transmission of broadcast packets for given datagram
587
 * socket. The socket must have defined @iface. For transmission, packets should
588
 * be send to @brd address of @iface.
589
 *
590
 * Result: 0 for success, -1 for an error.
591
 */
592

    
593
int
594
sk_setup_broadcast(sock *s)
595
{
596
  int y = 1;
597

    
598
  if (setsockopt(s->fd, SOL_SOCKET, SO_BROADCAST, &y, sizeof(y)) < 0)
599
    ERR("SO_BROADCAST");
600

    
601
  return 0;
602
}
603

    
604
/**
605
 * sk_set_ttl - set transmit TTL for given socket
606
 * @s: socket
607
 * @ttl: TTL value
608
 *
609
 * Set TTL for already opened connections when TTL was not set before. Useful
610
 * for accepted connections when different ones should have different TTL.
611
 *
612
 * Result: 0 for success, -1 for an error.
613
 */
614

    
615
int
616
sk_set_ttl(sock *s, int ttl)
617
{
618
  s->ttl = ttl;
619

    
620
  if (sk_is_ipv4(s))
621
    return sk_set_ttl4(s, ttl);
622
  else
623
    return sk_set_ttl6(s, ttl);
624
}
625

    
626
/**
627
 * sk_set_min_ttl - set minimal accepted TTL for given socket
628
 * @s: socket
629
 * @ttl: TTL value
630
 *
631
 * Set minimal accepted TTL for given socket. Can be used for TTL security.
632
 * implementations.
633
 *
634
 * Result: 0 for success, -1 for an error.
635
 */
636

    
637
int
638
sk_set_min_ttl(sock *s, int ttl)
639
{
640
  if (sk_is_ipv4(s))
641
    return sk_set_min_ttl4(s, ttl);
642
  else
643
    return sk_set_min_ttl6(s, ttl);
644
}
645

    
646
#if 0
647
/**
648
 * sk_set_md5_auth - add / remove MD5 security association for given socket
649
 * @s: socket
650
 * @local: IP address of local side
651
 * @remote: IP address of remote side
652
 * @ifa: Interface for link-local IP address
653
 * @passwd: Password used for MD5 authentication
654
 * @setkey: Update also system SA/SP database
655
 *
656
 * In TCP MD5 handling code in kernel, there is a set of security associations
657
 * used for choosing password and other authentication parameters according to
658
 * the local and remote address. This function is useful for listening socket,
659
 * for active sockets it may be enough to set s->password field.
660
 *
661
 * When called with passwd != NULL, the new pair is added,
662
 * When called with passwd == NULL, the existing pair is removed.
663
 *
664
 * Note that while in Linux, the MD5 SAs are specific to socket, in BSD they are
665
 * stored in global SA/SP database (but the behavior also must be enabled on
666
 * per-socket basis). In case of multiple sockets to the same neighbor, the
667
 * socket-specific state must be configured for each socket while global state
668
 * just once per src-dst pair. The @setkey argument controls whether the global
669
 * state (SA/SP database) is also updated.
670
 *
671
 * Result: 0 for success, -1 for an error.
672
 */
673

674
int
675
sk_set_md5_auth(sock *s, ip_addr local, ip_addr remote, struct iface *ifa, char *passwd, int setkey)
676
{ DUMMY; }
677
#endif
678

    
679
/**
680
 * sk_set_ipv6_checksum - specify IPv6 checksum offset for given socket
681
 * @s: socket
682
 * @offset: offset
683
 *
684
 * Specify IPv6 checksum field offset for given raw IPv6 socket. After that, the
685
 * kernel will automatically fill it for outgoing packets and check it for
686
 * incoming packets. Should not be used on ICMPv6 sockets, where the position is
687
 * known to the kernel.
688
 *
689
 * Result: 0 for success, -1 for an error.
690
 */
691

    
692
int
693
sk_set_ipv6_checksum(sock *s, int offset)
694
{
695
  if (setsockopt(s->fd, SOL_IPV6, IPV6_CHECKSUM, &offset, sizeof(offset)) < 0)
696
    ERR("IPV6_CHECKSUM");
697

    
698
  return 0;
699
}
700

    
701
int
702
sk_set_icmp6_filter(sock *s, int p1, int p2)
703
{
704
  /* a bit of lame interface, but it is here only for Radv */
705
  struct icmp6_filter f;
706

    
707
  ICMP6_FILTER_SETBLOCKALL(&f);
708
  ICMP6_FILTER_SETPASS(p1, &f);
709
  ICMP6_FILTER_SETPASS(p2, &f);
710

    
711
  if (setsockopt(s->fd, SOL_ICMPV6, ICMP6_FILTER, &f, sizeof(f)) < 0)
712
    ERR("ICMP6_FILTER");
713

    
714
  return 0;
715
}
716

    
717
void
718
sk_log_error(sock *s, const char *p)
719
{
720
  log(L_ERR "%s: Socket error: %s%#m", p, s->err);
721
}
722

    
723

    
724
/*
725
 *        Actual struct birdsock code
726
 */
727

    
728
static list sock_list;
729
static struct birdsock *current_sock;
730
static struct birdsock *stored_sock;
731

    
732
static inline sock *
733
sk_next(sock *s)
734
{
735
  if (!s->n.next->next)
736
    return NULL;
737
  else
738
    return SKIP_BACK(sock, n, s->n.next);
739
}
740

    
741
static void
742
sk_alloc_bufs(sock *s)
743
{
744
  if (!s->rbuf && s->rbsize)
745
    s->rbuf = s->rbuf_alloc = xmalloc(s->rbsize);
746
  s->rpos = s->rbuf;
747
  if (!s->tbuf && s->tbsize)
748
    s->tbuf = s->tbuf_alloc = xmalloc(s->tbsize);
749
  s->tpos = s->ttx = s->tbuf;
750
}
751

    
752
static void
753
sk_free_bufs(sock *s)
754
{
755
  if (s->rbuf_alloc)
756
  {
757
    xfree(s->rbuf_alloc);
758
    s->rbuf = s->rbuf_alloc = NULL;
759
  }
760
  if (s->tbuf_alloc)
761
  {
762
    xfree(s->tbuf_alloc);
763
    s->tbuf = s->tbuf_alloc = NULL;
764
  }
765
}
766

    
767
#ifdef HAVE_LIBSSH
768
static void
769
sk_ssh_free(sock *s)
770
{
771
  struct ssh_sock *ssh = s->ssh;
772

    
773
  if (s->ssh == NULL)
774
    return;
775

    
776
  s->ssh = NULL;
777

    
778
  if (ssh->channel)
779
  {
780
    if (ssh_channel_is_open(ssh->channel))
781
      ssh_channel_close(ssh->channel);
782
    ssh_channel_free(ssh->channel);
783
    ssh->channel = NULL;
784
  }
785

    
786
  if (ssh->session)
787
  {
788
    ssh_disconnect(ssh->session);
789
    ssh_free(ssh->session);
790
    ssh->session = NULL;
791
  }
792
}
793
#endif
794

    
795
static void
796
sk_free(resource *r)
797
{
798
  sock *s = (sock *) r;
799

    
800
  sk_free_bufs(s);
801

    
802
#ifdef HAVE_LIBSSH
803
  if (s->type == SK_SSH || s->type == SK_SSH_ACTIVE)
804
    sk_ssh_free(s);
805
#endif
806

    
807
  if (s->fd < 0)
808
    return;
809

    
810
  /* FIXME: we should call sk_stop() for SKF_THREAD sockets */
811
  if (!(s->flags & SKF_THREAD))
812
  {
813
    if (s == current_sock)
814
      current_sock = sk_next(s);
815
    if (s == stored_sock)
816
      stored_sock = sk_next(s);
817
    rem_node(&s->n);
818
  }
819

    
820
  if (s->type != SK_SSH && s->type != SK_SSH_ACTIVE)
821
    close(s->fd);
822

    
823
  s->fd = -1;
824
}
825

    
826
void
827
sk_set_rbsize(sock *s, uint val)
828
{
829
  ASSERT(s->rbuf_alloc == s->rbuf);
830

    
831
  if (s->rbsize == val)
832
    return;
833

    
834
  s->rbsize = val;
835
  xfree(s->rbuf_alloc);
836
  s->rbuf_alloc = xmalloc(val);
837
  s->rpos = s->rbuf = s->rbuf_alloc;
838
}
839

    
840
void
841
sk_set_tbsize(sock *s, uint val)
842
{
843
  ASSERT(s->tbuf_alloc == s->tbuf);
844

    
845
  if (s->tbsize == val)
846
    return;
847

    
848
  byte *old_tbuf = s->tbuf;
849

    
850
  s->tbsize = val;
851
  s->tbuf = s->tbuf_alloc = xrealloc(s->tbuf_alloc, val);
852
  s->tpos = s->tbuf + (s->tpos - old_tbuf);
853
  s->ttx  = s->tbuf + (s->ttx  - old_tbuf);
854
}
855

    
856
void
857
sk_set_tbuf(sock *s, void *tbuf)
858
{
859
  s->tbuf = tbuf ?: s->tbuf_alloc;
860
  s->ttx = s->tpos = s->tbuf;
861
}
862

    
863
void
864
sk_reallocate(sock *s)
865
{
866
  sk_free_bufs(s);
867
  sk_alloc_bufs(s);
868
}
869

    
870
static void
871
sk_dump(resource *r)
872
{
873
  sock *s = (sock *) r;
874
  static char *sk_type_names[] = { "TCP<", "TCP>", "TCP", "UDP", NULL, "IP", NULL, "MAGIC", "UNIX<", "UNIX", "SSH>", "SSH", "DEL!" };
875

    
876
  debug("(%s, ud=%p, sa=%I, sp=%d, da=%I, dp=%d, tos=%d, ttl=%d, if=%s)\n",
877
        sk_type_names[s->type],
878
        s->data,
879
        s->saddr,
880
        s->sport,
881
        s->daddr,
882
        s->dport,
883
        s->tos,
884
        s->ttl,
885
        s->iface ? s->iface->name : "none");
886
}
887

    
888
static struct resclass sk_class = {
889
  "Socket",
890
  sizeof(sock),
891
  sk_free,
892
  sk_dump,
893
  NULL,
894
  NULL
895
};
896

    
897
/**
898
 * sk_new - create a socket
899
 * @p: pool
900
 *
901
 * This function creates a new socket resource. If you want to use it,
902
 * you need to fill in all the required fields of the structure and
903
 * call sk_open() to do the actual opening of the socket.
904
 *
905
 * The real function name is sock_new(), sk_new() is a macro wrapper
906
 * to avoid collision with OpenSSL.
907
 */
908
sock *
909
sock_new(pool *p)
910
{
911
  sock *s = ralloc(p, &sk_class);
912
  s->pool = p;
913
  // s->saddr = s->daddr = IPA_NONE;
914
  s->tos = s->priority = s->ttl = -1;
915
  s->fd = -1;
916
  return s;
917
}
918

    
919
static int
920
sk_setup(sock *s)
921
{
922
  int y = 1;
923
  int fd = s->fd;
924

    
925
  if (s->type == SK_SSH_ACTIVE)
926
    return 0;
927

    
928
  if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
929
    ERR("O_NONBLOCK");
930

    
931
  if (!s->af)
932
    return 0;
933

    
934
  if (ipa_nonzero(s->saddr) && !(s->flags & SKF_BIND))
935
    s->flags |= SKF_PKTINFO;
936

    
937
#ifdef CONFIG_USE_HDRINCL
938
  if (sk_is_ipv4(s) && (s->type == SK_IP) && (s->flags & SKF_PKTINFO))
939
  {
940
    s->flags &= ~SKF_PKTINFO;
941
    s->flags |= SKF_HDRINCL;
942
    if (setsockopt(fd, SOL_IP, IP_HDRINCL, &y, sizeof(y)) < 0)
943
      ERR("IP_HDRINCL");
944
  }
945
#endif
946

    
947
  if (s->vrf && !s->iface)
948
  {
949
    /* Bind socket to associated VRF interface.
950
       This is Linux-specific, but so is SO_BINDTODEVICE. */
951
#ifdef SO_BINDTODEVICE
952
    struct ifreq ifr = {};
953
    strcpy(ifr.ifr_name, s->vrf->name);
954
    if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
955
      ERR("SO_BINDTODEVICE");
956
#endif
957
  }
958

    
959
  if (s->iface)
960
  {
961
#ifdef SO_BINDTODEVICE
962
    struct ifreq ifr = {};
963
    strcpy(ifr.ifr_name, s->iface->name);
964
    if (setsockopt(s->fd, SOL_SOCKET, SO_BINDTODEVICE, &ifr, sizeof(ifr)) < 0)
965
      ERR("SO_BINDTODEVICE");
966
#endif
967

    
968
#ifdef CONFIG_UNIX_DONTROUTE
969
    if (setsockopt(s->fd, SOL_SOCKET, SO_DONTROUTE, &y, sizeof(y)) < 0)
970
      ERR("SO_DONTROUTE");
971
#endif
972
  }
973

    
974
  if (sk_is_ipv4(s))
975
  {
976
    if (s->flags & SKF_LADDR_RX)
977
      if (sk_request_cmsg4_pktinfo(s) < 0)
978
        return -1;
979

    
980
    if (s->flags & SKF_TTL_RX)
981
      if (sk_request_cmsg4_ttl(s) < 0)
982
        return -1;
983

    
984
    if ((s->type == SK_UDP) || (s->type == SK_IP))
985
      if (sk_disable_mtu_disc4(s) < 0)
986
        return -1;
987

    
988
    if (s->ttl >= 0)
989
      if (sk_set_ttl4(s, s->ttl) < 0)
990
        return -1;
991

    
992
    if (s->tos >= 0)
993
      if (sk_set_tos4(s, s->tos) < 0)
994
        return -1;
995
  }
996

    
997
  if (sk_is_ipv6(s))
998
  {
999
    if ((s->type == SK_TCP_PASSIVE) || (s->type == SK_TCP_ACTIVE) || (s->type == SK_UDP))
1000
      if (setsockopt(fd, SOL_IPV6, IPV6_V6ONLY, &y, sizeof(y)) < 0)
1001
        ERR("IPV6_V6ONLY");
1002

    
1003
    if (s->flags & SKF_LADDR_RX)
1004
      if (sk_request_cmsg6_pktinfo(s) < 0)
1005
        return -1;
1006

    
1007
    if (s->flags & SKF_TTL_RX)
1008
      if (sk_request_cmsg6_ttl(s) < 0)
1009
        return -1;
1010

    
1011
    if ((s->type == SK_UDP) || (s->type == SK_IP))
1012
      if (sk_disable_mtu_disc6(s) < 0)
1013
        return -1;
1014

    
1015
    if (s->ttl >= 0)
1016
      if (sk_set_ttl6(s, s->ttl) < 0)
1017
        return -1;
1018

    
1019
    if (s->tos >= 0)
1020
      if (sk_set_tos6(s, s->tos) < 0)
1021
        return -1;
1022
  }
1023

    
1024
  /* Must be after sk_set_tos4() as setting ToS on Linux also mangles priority */
1025
  if (s->priority >= 0)
1026
    if (sk_set_priority(s, s->priority) < 0)
1027
      return -1;
1028

    
1029
  return 0;
1030
}
1031

    
1032
static void
1033
sk_insert(sock *s)
1034
{
1035
  add_tail(&sock_list, &s->n);
1036
}
1037

    
1038
static void
1039
sk_tcp_connected(sock *s)
1040
{
1041
  sockaddr sa;
1042
  int sa_len = sizeof(sa);
1043

    
1044
  if ((getsockname(s->fd, &sa.sa, &sa_len) < 0) ||
1045
      (sockaddr_read(&sa, s->af, &s->saddr, &s->iface, &s->sport) < 0))
1046
    log(L_WARN "SOCK: Cannot get local IP address for TCP>");
1047

    
1048
  s->type = SK_TCP;
1049
  sk_alloc_bufs(s);
1050
  s->tx_hook(s);
1051
}
1052

    
1053
#ifdef HAVE_LIBSSH
1054
static void
1055
sk_ssh_connected(sock *s)
1056
{
1057
  sk_alloc_bufs(s);
1058
  s->type = SK_SSH;
1059
  s->tx_hook(s);
1060
}
1061
#endif
1062

    
1063
static int
1064
sk_passive_connected(sock *s, int type)
1065
{
1066
  sockaddr loc_sa, rem_sa;
1067
  int loc_sa_len = sizeof(loc_sa);
1068
  int rem_sa_len = sizeof(rem_sa);
1069

    
1070
  int fd = accept(s->fd, ((type == SK_TCP) ? &rem_sa.sa : NULL), &rem_sa_len);
1071
  if (fd < 0)
1072
  {
1073
    if ((errno != EINTR) && (errno != EAGAIN))
1074
      s->err_hook(s, errno);
1075
    return 0;
1076
  }
1077

    
1078
  sock *t = sk_new(s->pool);
1079
  t->type = type;
1080
  t->data = s->data;
1081
  t->af = s->af;
1082
  t->fd = fd;
1083
  t->ttl = s->ttl;
1084
  t->tos = s->tos;
1085
  t->vrf = s->vrf;
1086
  t->rbsize = s->rbsize;
1087
  t->tbsize = s->tbsize;
1088

    
1089
  if (type == SK_TCP)
1090
  {
1091
    if ((getsockname(fd, &loc_sa.sa, &loc_sa_len) < 0) ||
1092
        (sockaddr_read(&loc_sa, s->af, &t->saddr, &t->iface, &t->sport) < 0))
1093
      log(L_WARN "SOCK: Cannot get local IP address for TCP<");
1094

    
1095
    if (sockaddr_read(&rem_sa, s->af, &t->daddr, &t->iface, &t->dport) < 0)
1096
      log(L_WARN "SOCK: Cannot get remote IP address for TCP<");
1097
  }
1098

    
1099
  if (sk_setup(t) < 0)
1100
  {
1101
    /* FIXME: Call err_hook instead ? */
1102
    log(L_ERR "SOCK: Incoming connection: %s%#m", t->err);
1103

    
1104
    /* FIXME: handle it better in rfree() */
1105
    close(t->fd);
1106
    t->fd = -1;
1107
    rfree(t);
1108
    return 1;
1109
  }
1110

    
1111
  sk_insert(t);
1112
  sk_alloc_bufs(t);
1113
  s->rx_hook(t, 0);
1114
  return 1;
1115
}
1116

    
1117
#ifdef HAVE_LIBSSH
1118
/*
1119
 * Return SSH_OK or SSH_AGAIN or SSH_ERROR
1120
 */
1121
static int
1122
sk_ssh_connect(sock *s)
1123
{
1124
  s->fd = ssh_get_fd(s->ssh->session);
1125

    
1126
  /* Big fall thru automata */
1127
  switch (s->ssh->state)
1128
  {
1129
  case SK_SSH_CONNECT:
1130
  {
1131
    switch (ssh_connect(s->ssh->session))
1132
    {
1133
    case SSH_AGAIN:
1134
      /* A quick look into libSSH shows that ssh_get_fd() should return non-(-1)
1135
       * after SSH_AGAIN is returned by ssh_connect(). This is however nowhere
1136
       * documented but our code relies on that.
1137
       */
1138
      return SSH_AGAIN;
1139

    
1140
    case SSH_OK:
1141
      break;
1142

    
1143
    default:
1144
      return SSH_ERROR;
1145
    }
1146
  } /* fallthrough */
1147

    
1148
  case SK_SSH_SERVER_KNOWN:
1149
  {
1150
    s->ssh->state = SK_SSH_SERVER_KNOWN;
1151

    
1152
    if (s->ssh->server_hostkey_path)
1153
    {
1154
      int server_identity_is_ok = 1;
1155

    
1156
      /* Check server identity */
1157
      switch (ssh_is_server_known(s->ssh->session))
1158
      {
1159
#define LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s,msg,args...) log(L_WARN "SSH Identity %s@%s:%u: " msg, (s)->ssh->username, (s)->host, (s)->dport, ## args);
1160
      case SSH_SERVER_KNOWN_OK:
1161
        /* The server is known and has not changed. */
1162
        break;
1163

    
1164
      case SSH_SERVER_NOT_KNOWN:
1165
        LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server is unknown, its public key was not found in the known host file %s", s->ssh->server_hostkey_path);
1166
        break;
1167

    
1168
      case SSH_SERVER_KNOWN_CHANGED:
1169
        LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server key has changed. Either you are under attack or the administrator changed the key.");
1170
        server_identity_is_ok = 0;
1171
        break;
1172

    
1173
      case SSH_SERVER_FILE_NOT_FOUND:
1174
        LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The known host file %s does not exist", s->ssh->server_hostkey_path);
1175
        server_identity_is_ok = 0;
1176
        break;
1177

    
1178
      case SSH_SERVER_ERROR:
1179
        LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "Some error happened");
1180
        server_identity_is_ok = 0;
1181
        break;
1182

    
1183
      case SSH_SERVER_FOUND_OTHER:
1184
        LOG_WARN_ABOUT_SSH_SERVER_VALIDATION(s, "The server gave use a key of a type while we had an other type recorded. " \
1185
                                             "It is a possible attack.");
1186
        server_identity_is_ok = 0;
1187
        break;
1188
      }
1189

    
1190
      if (!server_identity_is_ok)
1191
        return SSH_ERROR;
1192
    }
1193
  } /* fallthrough */
1194

    
1195
  case SK_SSH_USERAUTH:
1196
  {
1197
    s->ssh->state = SK_SSH_USERAUTH;
1198
    switch (ssh_userauth_publickey_auto(s->ssh->session, NULL, NULL))
1199
    {
1200
    case SSH_AUTH_AGAIN:
1201
      return SSH_AGAIN;
1202

    
1203
    case SSH_AUTH_SUCCESS:
1204
      break;
1205

    
1206
    default:
1207
      return SSH_ERROR;
1208
    }
1209
  } /* fallthrough */
1210

    
1211
  case SK_SSH_CHANNEL:
1212
  {
1213
    s->ssh->state = SK_SSH_CHANNEL;
1214
    s->ssh->channel = ssh_channel_new(s->ssh->session);
1215
    if (s->ssh->channel == NULL)
1216
      return SSH_ERROR;
1217
  } /* fallthrough */
1218

    
1219
  case SK_SSH_SESSION:
1220
  {
1221
    s->ssh->state = SK_SSH_SESSION;
1222
    switch (ssh_channel_open_session(s->ssh->channel))
1223
    {
1224
    case SSH_AGAIN:
1225
      return SSH_AGAIN;
1226

    
1227
    case SSH_OK:
1228
      break;
1229

    
1230
    default:
1231
      return SSH_ERROR;
1232
    }
1233
  } /* fallthrough */
1234

    
1235
  case SK_SSH_SUBSYSTEM:
1236
  {
1237
    s->ssh->state = SK_SSH_SUBSYSTEM;
1238
    if (s->ssh->subsystem)
1239
    {
1240
      switch (ssh_channel_request_subsystem(s->ssh->channel, s->ssh->subsystem))
1241
      {
1242
      case SSH_AGAIN:
1243
        return SSH_AGAIN;
1244

    
1245
      case SSH_OK:
1246
        break;
1247

    
1248
      default:
1249
        return SSH_ERROR;
1250
      }
1251
    }
1252
  } /* fallthrough */
1253

    
1254
  case SK_SSH_ESTABLISHED:
1255
    s->ssh->state = SK_SSH_ESTABLISHED;
1256
  }
1257

    
1258
  return SSH_OK;
1259
}
1260

    
1261
/*
1262
 * Return file descriptor number if success
1263
 * Return -1 if failed
1264
 */
1265
static int
1266
sk_open_ssh(sock *s)
1267
{
1268
  if (!s->ssh)
1269
    bug("sk_open() sock->ssh is not allocated");
1270

    
1271
  ssh_session sess = ssh_new();
1272
  if (sess == NULL)
1273
    ERR2("Cannot create a ssh session");
1274
  s->ssh->session = sess;
1275

    
1276
  const int verbosity = SSH_LOG_NOLOG;
1277
  ssh_options_set(sess, SSH_OPTIONS_LOG_VERBOSITY, &verbosity);
1278
  ssh_options_set(sess, SSH_OPTIONS_HOST, s->host);
1279
  ssh_options_set(sess, SSH_OPTIONS_PORT, &(s->dport));
1280
  /* TODO: Add SSH_OPTIONS_BINDADDR */
1281
  ssh_options_set(sess, SSH_OPTIONS_USER, s->ssh->username);
1282

    
1283
  if (s->ssh->server_hostkey_path)
1284
    ssh_options_set(sess, SSH_OPTIONS_KNOWNHOSTS, s->ssh->server_hostkey_path);
1285

    
1286
  if (s->ssh->client_privkey_path)
1287
    ssh_options_set(sess, SSH_OPTIONS_IDENTITY, s->ssh->client_privkey_path);
1288

    
1289
  ssh_set_blocking(sess, 0);
1290

    
1291
  switch (sk_ssh_connect(s))
1292
  {
1293
  case SSH_AGAIN:
1294
    break;
1295

    
1296
  case SSH_OK:
1297
    sk_ssh_connected(s);
1298
    break;
1299

    
1300
  case SSH_ERROR:
1301
    ERR2(ssh_get_error(sess));
1302
    break;
1303
  }
1304

    
1305
  return ssh_get_fd(sess);
1306

    
1307
 err:
1308
  return -1;
1309
}
1310
#endif
1311

    
1312
/**
1313
 * sk_open - open a socket
1314
 * @s: socket
1315
 *
1316
 * This function takes a socket resource created by sk_new() and
1317
 * initialized by the user and binds a corresponding network connection
1318
 * to it.
1319
 *
1320
 * Result: 0 for success, -1 for an error.
1321
 */
1322
int
1323
sk_open(sock *s)
1324
{
1325
  int af = AF_UNSPEC;
1326
  int fd = -1;
1327
  int do_bind = 0;
1328
  int bind_port = 0;
1329
  ip_addr bind_addr = IPA_NONE;
1330
  sockaddr sa;
1331

    
1332
  if (s->type <= SK_IP)
1333
  {
1334
    /*
1335
     * For TCP/IP sockets, Address family (IPv4 or IPv6) can be specified either
1336
     * explicitly (SK_IPV4 or SK_IPV6) or implicitly (based on saddr, daddr).
1337
     * But the specifications have to be consistent.
1338
     */
1339

    
1340
    switch (s->subtype)
1341
    {
1342
    case 0:
1343
      ASSERT(ipa_zero(s->saddr) || ipa_zero(s->daddr) ||
1344
             (ipa_is_ip4(s->saddr) == ipa_is_ip4(s->daddr)));
1345
      af = (ipa_is_ip4(s->saddr) || ipa_is_ip4(s->daddr)) ? AF_INET : AF_INET6;
1346
      break;
1347

    
1348
    case SK_IPV4:
1349
      ASSERT(ipa_zero(s->saddr) || ipa_is_ip4(s->saddr));
1350
      ASSERT(ipa_zero(s->daddr) || ipa_is_ip4(s->daddr));
1351
      af = AF_INET;
1352
      break;
1353

    
1354
    case SK_IPV6:
1355
      ASSERT(ipa_zero(s->saddr) || !ipa_is_ip4(s->saddr));
1356
      ASSERT(ipa_zero(s->daddr) || !ipa_is_ip4(s->daddr));
1357
      af = AF_INET6;
1358
      break;
1359

    
1360
    default:
1361
      bug("Invalid subtype %d", s->subtype);
1362
    }
1363
  }
1364

    
1365
  switch (s->type)
1366
  {
1367
  case SK_TCP_ACTIVE:
1368
    s->ttx = "";                        /* Force s->ttx != s->tpos */
1369
    /* Fall thru */
1370
  case SK_TCP_PASSIVE:
1371
    fd = socket(af, SOCK_STREAM, IPPROTO_TCP);
1372
    bind_port = s->sport;
1373
    bind_addr = s->saddr;
1374
    do_bind = bind_port || ipa_nonzero(bind_addr);
1375
    break;
1376

    
1377
#ifdef HAVE_LIBSSH
1378
  case SK_SSH_ACTIVE:
1379
    s->ttx = "";                        /* Force s->ttx != s->tpos */
1380
    fd = sk_open_ssh(s);
1381
    break;
1382
#endif
1383

    
1384
  case SK_UDP:
1385
    fd = socket(af, SOCK_DGRAM, IPPROTO_UDP);
1386
    bind_port = s->sport;
1387
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1388
    do_bind = 1;
1389
    break;
1390

    
1391
  case SK_IP:
1392
    fd = socket(af, SOCK_RAW, s->dport);
1393
    bind_port = 0;
1394
    bind_addr = (s->flags & SKF_BIND) ? s->saddr : IPA_NONE;
1395
    do_bind = ipa_nonzero(bind_addr);
1396
    break;
1397

    
1398
  case SK_MAGIC:
1399
    af = 0;
1400
    fd = s->fd;
1401
    break;
1402

    
1403
  default:
1404
    bug("sk_open() called for invalid sock type %d", s->type);
1405
  }
1406

    
1407
  if (fd < 0)
1408
    ERR("socket");
1409

    
1410
  s->af = af;
1411
  s->fd = fd;
1412

    
1413
  if (sk_setup(s) < 0)
1414
    goto err;
1415

    
1416
  if (do_bind)
1417
  {
1418
    if (bind_port)
1419
    {
1420
      int y = 1;
1421

    
1422
      if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &y, sizeof(y)) < 0)
1423
        ERR2("SO_REUSEADDR");
1424

    
1425
#ifdef CONFIG_NO_IFACE_BIND
1426
      /* Workaround missing ability to bind to an iface */
1427
      if ((s->type == SK_UDP) && s->iface && ipa_zero(bind_addr))
1428
      {
1429
        if (setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &y, sizeof(y)) < 0)
1430
          ERR2("SO_REUSEPORT");
1431
      }
1432
#endif
1433
    }
1434
    else
1435
      if (s->flags & SKF_HIGH_PORT)
1436
        if (sk_set_high_port(s) < 0)
1437
          log(L_WARN "Socket error: %s%#m", s->err);
1438

    
1439
    sockaddr_fill(&sa, s->af, bind_addr, s->iface, bind_port);
1440
    if (bind(fd, &sa.sa, SA_LEN(sa)) < 0)
1441
      ERR2("bind");
1442
  }
1443

    
1444
  if (s->password)
1445
    if (sk_set_md5_auth(s, s->saddr, s->daddr, s->iface, s->password, 0) < 0)
1446
      goto err;
1447

    
1448
  switch (s->type)
1449
  {
1450
  case SK_TCP_ACTIVE:
1451
    sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
1452
    if (connect(fd, &sa.sa, SA_LEN(sa)) >= 0)
1453
      sk_tcp_connected(s);
1454
    else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS &&
1455
             errno != ECONNREFUSED && errno != EHOSTUNREACH && errno != ENETUNREACH)
1456
      ERR2("connect");
1457
    break;
1458

    
1459
  case SK_TCP_PASSIVE:
1460
    if (listen(fd, 8) < 0)
1461
      ERR2("listen");
1462
    break;
1463

    
1464
  case SK_SSH_ACTIVE:
1465
  case SK_MAGIC:
1466
    break;
1467

    
1468
  default:
1469
    sk_alloc_bufs(s);
1470
  }
1471

    
1472
  if (!(s->flags & SKF_THREAD))
1473
    sk_insert(s);
1474

    
1475
  return 0;
1476

    
1477
err:
1478
  close(fd);
1479
  s->fd = -1;
1480
  return -1;
1481
}
1482

    
1483
int
1484
sk_open_unix(sock *s, char *name)
1485
{
1486
  struct sockaddr_un sa;
1487
  int fd;
1488

    
1489
  /* We are sloppy during error (leak fd and not set s->err), but we die anyway */
1490

    
1491
  fd = socket(AF_UNIX, SOCK_STREAM, 0);
1492
  if (fd < 0)
1493
    return -1;
1494

    
1495
  if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
1496
    return -1;
1497

    
1498
  /* Path length checked in test_old_bird() */
1499
  sa.sun_family = AF_UNIX;
1500
  strcpy(sa.sun_path, name);
1501

    
1502
  if (bind(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) < 0)
1503
    return -1;
1504

    
1505
  if (listen(fd, 8) < 0)
1506
    return -1;
1507

    
1508
  s->fd = fd;
1509
  sk_insert(s);
1510
  return 0;
1511
}
1512

    
1513

    
1514
#define CMSG_RX_SPACE MAX(CMSG4_SPACE_PKTINFO+CMSG4_SPACE_TTL, \
1515
                          CMSG6_SPACE_PKTINFO+CMSG6_SPACE_TTL)
1516
#define CMSG_TX_SPACE MAX(CMSG4_SPACE_PKTINFO,CMSG6_SPACE_PKTINFO)
1517

    
1518
static void
1519
sk_prepare_cmsgs(sock *s, struct msghdr *msg, void *cbuf, size_t cbuflen)
1520
{
1521
  if (sk_is_ipv4(s))
1522
    sk_prepare_cmsgs4(s, msg, cbuf, cbuflen);
1523
  else
1524
    sk_prepare_cmsgs6(s, msg, cbuf, cbuflen);
1525
}
1526

    
1527
static void
1528
sk_process_cmsgs(sock *s, struct msghdr *msg)
1529
{
1530
  struct cmsghdr *cm;
1531

    
1532
  s->laddr = IPA_NONE;
1533
  s->lifindex = 0;
1534
  s->rcv_ttl = -1;
1535

    
1536
  for (cm = CMSG_FIRSTHDR(msg); cm != NULL; cm = CMSG_NXTHDR(msg, cm))
1537
  {
1538
    if ((cm->cmsg_level == SOL_IP) && sk_is_ipv4(s))
1539
    {
1540
      sk_process_cmsg4_pktinfo(s, cm);
1541
      sk_process_cmsg4_ttl(s, cm);
1542
    }
1543

    
1544
    if ((cm->cmsg_level == SOL_IPV6) && sk_is_ipv6(s))
1545
    {
1546
      sk_process_cmsg6_pktinfo(s, cm);
1547
      sk_process_cmsg6_ttl(s, cm);
1548
    }
1549
  }
1550
}
1551

    
1552

    
1553
static inline int
1554
sk_sendmsg(sock *s)
1555
{
1556
  struct iovec iov = {s->tbuf, s->tpos - s->tbuf};
1557
  byte cmsg_buf[CMSG_TX_SPACE];
1558
  sockaddr dst;
1559
  int flags = 0;
1560

    
1561
  sockaddr_fill(&dst, s->af, s->daddr, s->iface, s->dport);
1562

    
1563
  struct msghdr msg = {
1564
    .msg_name = &dst.sa,
1565
    .msg_namelen = SA_LEN(dst),
1566
    .msg_iov = &iov,
1567
    .msg_iovlen = 1
1568
  };
1569

    
1570
#ifdef CONFIG_DONTROUTE_UNICAST
1571
  /* FreeBSD silently changes TTL to 1 when MSG_DONTROUTE is used, therefore we
1572
     cannot use it for other cases (e.g. when TTL security is used). */
1573
  if (ipa_is_ip4(s->daddr) && ip4_is_unicast(ipa_to_ip4(s->daddr)) && (s->ttl == 1))
1574
    flags = MSG_DONTROUTE;
1575
#endif
1576

    
1577
#ifdef CONFIG_USE_HDRINCL
1578
  byte hdr[20];
1579
  struct iovec iov2[2] = { {hdr, 20}, iov };
1580

    
1581
  if (s->flags & SKF_HDRINCL)
1582
  {
1583
    sk_prepare_ip_header(s, hdr, iov.iov_len);
1584
    msg.msg_iov = iov2;
1585
    msg.msg_iovlen = 2;
1586
  }
1587
#endif
1588

    
1589
  if (s->flags & SKF_PKTINFO)
1590
    sk_prepare_cmsgs(s, &msg, cmsg_buf, sizeof(cmsg_buf));
1591

    
1592
  return sendmsg(s->fd, &msg, flags);
1593
}
1594

    
1595
static inline int
1596
sk_recvmsg(sock *s)
1597
{
1598
  struct iovec iov = {s->rbuf, s->rbsize};
1599
  byte cmsg_buf[CMSG_RX_SPACE];
1600
  sockaddr src;
1601

    
1602
  struct msghdr msg = {
1603
    .msg_name = &src.sa,
1604
    .msg_namelen = sizeof(src), // XXXX ??
1605
    .msg_iov = &iov,
1606
    .msg_iovlen = 1,
1607
    .msg_control = cmsg_buf,
1608
    .msg_controllen = sizeof(cmsg_buf),
1609
    .msg_flags = 0
1610
  };
1611

    
1612
  int rv = recvmsg(s->fd, &msg, 0);
1613
  if (rv < 0)
1614
    return rv;
1615

    
1616
  //ifdef IPV4
1617
  //  if (cf_type == SK_IP)
1618
  //    rv = ipv4_skip_header(pbuf, rv);
1619
  //endif
1620

    
1621
  sockaddr_read(&src, s->af, &s->faddr, NULL, &s->fport);
1622
  sk_process_cmsgs(s, &msg);
1623

    
1624
  if (msg.msg_flags & MSG_TRUNC)
1625
    s->flags |= SKF_TRUNCATED;
1626
  else
1627
    s->flags &= ~SKF_TRUNCATED;
1628

    
1629
  return rv;
1630
}
1631

    
1632

    
1633
static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
1634

    
1635
static int
1636
sk_maybe_write(sock *s)
1637
{
1638
  int e;
1639

    
1640
  switch (s->type)
1641
  {
1642
  case SK_TCP:
1643
  case SK_MAGIC:
1644
  case SK_UNIX:
1645
    while (s->ttx != s->tpos)
1646
    {
1647
      e = write(s->fd, s->ttx, s->tpos - s->ttx);
1648

    
1649
      if (e < 0)
1650
      {
1651
        if (errno != EINTR && errno != EAGAIN)
1652
        {
1653
          reset_tx_buffer(s);
1654
          /* EPIPE is just a connection close notification during TX */
1655
          s->err_hook(s, (errno != EPIPE) ? errno : 0);
1656
          return -1;
1657
        }
1658
        return 0;
1659
      }
1660
      s->ttx += e;
1661
    }
1662
    reset_tx_buffer(s);
1663
    return 1;
1664

    
1665
#ifdef HAVE_LIBSSH
1666
  case SK_SSH:
1667
    while (s->ttx != s->tpos)
1668
    {
1669
      e = ssh_channel_write(s->ssh->channel, s->ttx, s->tpos - s->ttx);
1670

    
1671
      if (e < 0)
1672
      {
1673
        s->err = ssh_get_error(s->ssh->session);
1674
        s->err_hook(s, ssh_get_error_code(s->ssh->session));
1675

    
1676
        reset_tx_buffer(s);
1677
        /* EPIPE is just a connection close notification during TX */
1678
        s->err_hook(s, (errno != EPIPE) ? errno : 0);
1679
        return -1;
1680
      }
1681
      s->ttx += e;
1682
    }
1683
    reset_tx_buffer(s);
1684
    return 1;
1685
#endif
1686

    
1687
  case SK_UDP:
1688
  case SK_IP:
1689
    {
1690
      if (s->tbuf == s->tpos)
1691
        return 1;
1692

    
1693
      e = sk_sendmsg(s);
1694

    
1695
      if (e < 0)
1696
      {
1697
        if (errno != EINTR && errno != EAGAIN)
1698
        {
1699
          reset_tx_buffer(s);
1700
          s->err_hook(s, errno);
1701
          return -1;
1702
        }
1703

    
1704
        if (!s->tx_hook)
1705
          reset_tx_buffer(s);
1706
        return 0;
1707
      }
1708
      reset_tx_buffer(s);
1709
      return 1;
1710
    }
1711

    
1712
  default:
1713
    bug("sk_maybe_write: unknown socket type %d", s->type);
1714
  }
1715
}
1716

    
1717
int
1718
sk_rx_ready(sock *s)
1719
{
1720
  int rv;
1721
  struct pollfd pfd = { .fd = s->fd };
1722
  pfd.events |= POLLIN;
1723

    
1724
 redo:
1725
  rv = poll(&pfd, 1, 0);
1726

    
1727
  if ((rv < 0) && (errno == EINTR || errno == EAGAIN))
1728
    goto redo;
1729

    
1730
  return rv;
1731
}
1732

    
1733
/**
1734
 * sk_send - send data to a socket
1735
 * @s: socket
1736
 * @len: number of bytes to send
1737
 *
1738
 * This function sends @len bytes of data prepared in the
1739
 * transmit buffer of the socket @s to the network connection.
1740
 * If the packet can be sent immediately, it does so and returns
1741
 * 1, else it queues the packet for later processing, returns 0
1742
 * and calls the @tx_hook of the socket when the tranmission
1743
 * takes place.
1744
 */
1745
int
1746
sk_send(sock *s, unsigned len)
1747
{
1748
  s->ttx = s->tbuf;
1749
  s->tpos = s->tbuf + len;
1750
  return sk_maybe_write(s);
1751
}
1752

    
1753
/**
1754
 * sk_send_to - send data to a specific destination
1755
 * @s: socket
1756
 * @len: number of bytes to send
1757
 * @addr: IP address to send the packet to
1758
 * @port: port to send the packet to
1759
 *
1760
 * This is a sk_send() replacement for connection-less packet sockets
1761
 * which allows destination of the packet to be chosen dynamically.
1762
 * Raw IP sockets should use 0 for @port.
1763
 */
1764
int
1765
sk_send_to(sock *s, unsigned len, ip_addr addr, unsigned port)
1766
{
1767
  s->daddr = addr;
1768
  if (port)
1769
    s->dport = port;
1770

    
1771
  s->ttx = s->tbuf;
1772
  s->tpos = s->tbuf + len;
1773
  return sk_maybe_write(s);
1774
}
1775

    
1776
/*
1777
int
1778
sk_send_full(sock *s, unsigned len, struct iface *ifa,
1779
             ip_addr saddr, ip_addr daddr, unsigned dport)
1780
{
1781
  s->iface = ifa;
1782
  s->saddr = saddr;
1783
  s->daddr = daddr;
1784
  s->dport = dport;
1785
  s->ttx = s->tbuf;
1786
  s->tpos = s->tbuf + len;
1787
  return sk_maybe_write(s);
1788
}
1789
*/
1790

    
1791
static void
1792
call_rx_hook(sock *s, int size)
1793
{
1794
  if (s->rx_hook(s, size))
1795
  {
1796
    /* We need to be careful since the socket could have been deleted by the hook */
1797
    if (current_sock == s)
1798
      s->rpos = s->rbuf;
1799
  }
1800
}
1801

    
1802
#ifdef HAVE_LIBSSH
1803
static int
1804
sk_read_ssh(sock *s)
1805
{
1806
  ssh_channel rchans[2] = { s->ssh->channel, NULL };
1807
  struct timeval timev = { 1, 0 };
1808

    
1809
  if (ssh_channel_select(rchans, NULL, NULL, &timev) == SSH_EINTR)
1810
    return 1; /* Try again */
1811

    
1812
  if (ssh_channel_is_eof(s->ssh->channel) != 0)
1813
  {
1814
    /* The remote side is closing the connection */
1815
    s->err_hook(s, 0);
1816
    return 0;
1817
  }
1818

    
1819
  if (rchans[0] == NULL)
1820
    return 0; /* No data is available on the socket */
1821

    
1822
  const uint used_bytes = s->rpos - s->rbuf;
1823
  const int read_bytes = ssh_channel_read_nonblocking(s->ssh->channel, s->rpos, s->rbsize - used_bytes, 0);
1824
  if (read_bytes > 0)
1825
  {
1826
    /* Received data */
1827
    s->rpos += read_bytes;
1828
    call_rx_hook(s, used_bytes + read_bytes);
1829
    return 1;
1830
  }
1831
  else if (read_bytes == 0)
1832
  {
1833
    if (ssh_channel_is_eof(s->ssh->channel) != 0)
1834
    {
1835
        /* The remote side is closing the connection */
1836
        s->err_hook(s, 0);
1837
    }
1838
  }
1839
  else
1840
  {
1841
    s->err = ssh_get_error(s->ssh->session);
1842
    s->err_hook(s, ssh_get_error_code(s->ssh->session));
1843
  }
1844

    
1845
  return 0; /* No data is available on the socket */
1846
}
1847
#endif
1848

    
1849
 /* sk_read() and sk_write() are called from BFD's event loop */
1850

    
1851
int
1852
sk_read(sock *s, int revents)
1853
{
1854
  switch (s->type)
1855
  {
1856
  case SK_TCP_PASSIVE:
1857
    return sk_passive_connected(s, SK_TCP);
1858

    
1859
  case SK_UNIX_PASSIVE:
1860
    return sk_passive_connected(s, SK_UNIX);
1861

    
1862
  case SK_TCP:
1863
  case SK_UNIX:
1864
    {
1865
      int c = read(s->fd, s->rpos, s->rbuf + s->rbsize - s->rpos);
1866

    
1867
      if (c < 0)
1868
      {
1869
        if (errno != EINTR && errno != EAGAIN)
1870
          s->err_hook(s, errno);
1871
        else if (errno == EAGAIN && !(revents & POLLIN))
1872
        {
1873
          log(L_ERR "Got EAGAIN from read when revents=%x (without POLLIN)", revents);
1874
          s->err_hook(s, 0);
1875
        }
1876
      }
1877
      else if (!c)
1878
        s->err_hook(s, 0);
1879
      else
1880
      {
1881
        s->rpos += c;
1882
        call_rx_hook(s, s->rpos - s->rbuf);
1883
        return 1;
1884
      }
1885
      return 0;
1886
    }
1887

    
1888
#ifdef HAVE_LIBSSH
1889
  case SK_SSH:
1890
    return sk_read_ssh(s);
1891
#endif
1892

    
1893
  case SK_MAGIC:
1894
    return s->rx_hook(s, 0);
1895

    
1896
  default:
1897
    {
1898
      int e = sk_recvmsg(s);
1899

    
1900
      if (e < 0)
1901
      {
1902
        if (errno != EINTR && errno != EAGAIN)
1903
          s->err_hook(s, errno);
1904
        return 0;
1905
      }
1906

    
1907
      s->rpos = s->rbuf + e;
1908
      s->rx_hook(s, e);
1909
      return 1;
1910
    }
1911
  }
1912
}
1913

    
1914
int
1915
sk_write(sock *s)
1916
{
1917
  switch (s->type)
1918
  {
1919
  case SK_TCP_ACTIVE:
1920
    {
1921
      sockaddr sa;
1922
      sockaddr_fill(&sa, s->af, s->daddr, s->iface, s->dport);
1923

    
1924
      if (connect(s->fd, &sa.sa, SA_LEN(sa)) >= 0 || errno == EISCONN)
1925
        sk_tcp_connected(s);
1926
      else if (errno != EINTR && errno != EAGAIN && errno != EINPROGRESS)
1927
        s->err_hook(s, errno);
1928
      return 0;
1929
    }
1930

    
1931
#ifdef HAVE_LIBSSH
1932
  case SK_SSH_ACTIVE:
1933
    {
1934
      switch (sk_ssh_connect(s))
1935
      {
1936
        case SSH_OK:
1937
          sk_ssh_connected(s);
1938
          break;
1939

    
1940
        case SSH_AGAIN:
1941
          return 1;
1942

    
1943
        case SSH_ERROR:
1944
          s->err = ssh_get_error(s->ssh->session);
1945
          s->err_hook(s, ssh_get_error_code(s->ssh->session));
1946
          break;
1947
      }
1948
      return 0;
1949
    }
1950
#endif
1951

    
1952
  default:
1953
    if (s->ttx != s->tpos && sk_maybe_write(s) > 0)
1954
    {
1955
      if (s->tx_hook)
1956
        s->tx_hook(s);
1957
      return 1;
1958
    }
1959
    return 0;
1960
  }
1961
}
1962

    
1963
int sk_is_ipv4(sock *s)
1964
{ return s->af == AF_INET; }
1965

    
1966
int sk_is_ipv6(sock *s)
1967
{ return s->af == AF_INET6; }
1968

    
1969
void
1970
sk_err(sock *s, int revents)
1971
{
1972
  int se = 0, sse = sizeof(se);
1973
  if ((s->type != SK_MAGIC) && (revents & POLLERR))
1974
    if (getsockopt(s->fd, SOL_SOCKET, SO_ERROR, &se, &sse) < 0)
1975
    {
1976
      log(L_ERR "IO: Socket error: SO_ERROR: %m");
1977
      se = 0;
1978
    }
1979

    
1980
  s->err_hook(s, se);
1981
}
1982

    
1983
void
1984
sk_dump_all(void)
1985
{
1986
  node *n;
1987
  sock *s;
1988

    
1989
  debug("Open sockets:\n");
1990
  WALK_LIST(n, sock_list)
1991
  {
1992
    s = SKIP_BACK(sock, n, n);
1993
    debug("%p ", s);
1994
    sk_dump(&s->r);
1995
  }
1996
  debug("\n");
1997
}
1998

    
1999

    
2000
/*
2001
 *        Internal event log and watchdog
2002
 */
2003

    
2004
#define EVENT_LOG_LENGTH 32
2005

    
2006
struct event_log_entry
2007
{
2008
  void *hook;
2009
  void *data;
2010
  btime timestamp;
2011
  btime duration;
2012
};
2013

    
2014
static struct event_log_entry event_log[EVENT_LOG_LENGTH];
2015
static struct event_log_entry *event_open;
2016
static int event_log_pos, event_log_num, watchdog_active;
2017
static btime last_time;
2018
static btime loop_time;
2019

    
2020
static void
2021
io_update_time(void)
2022
{
2023
  struct timespec ts;
2024
  int rv;
2025

    
2026
  /*
2027
   * This is third time-tracking procedure (after update_times() above and
2028
   * times_update() in BFD), dedicated to internal event log and latency
2029
   * tracking. Hopefully, we consolidate these sometimes.
2030
   */
2031

    
2032
  rv = clock_gettime(CLOCK_MONOTONIC, &ts);
2033
  if (rv < 0)
2034
    die("clock_gettime: %m");
2035

    
2036
  last_time = ts.tv_sec S + ts.tv_nsec NS;
2037

    
2038
  if (event_open)
2039
  {
2040
    event_open->duration = last_time - event_open->timestamp;
2041

    
2042
    if (event_open->duration > config->latency_limit)
2043
      log(L_WARN "Event 0x%p 0x%p took %d ms",
2044
          event_open->hook, event_open->data, (int) (event_open->duration TO_MS));
2045

    
2046
    event_open = NULL;
2047
  }
2048
}
2049

    
2050
/**
2051
 * io_log_event - mark approaching event into event log
2052
 * @hook: event hook address
2053
 * @data: event data address
2054
 *
2055
 * Store info (hook, data, timestamp) about the following internal event into
2056
 * a circular event log (@event_log). When latency tracking is enabled, the log
2057
 * entry is kept open (in @event_open) so the duration can be filled later.
2058
 */
2059
void
2060
io_log_event(void *hook, void *data)
2061
{
2062
  if (config->latency_debug)
2063
    io_update_time();
2064

    
2065
  struct event_log_entry *en = event_log + event_log_pos;
2066

    
2067
  en->hook = hook;
2068
  en->data = data;
2069
  en->timestamp = last_time;
2070
  en->duration = 0;
2071

    
2072
  event_log_num++;
2073
  event_log_pos++;
2074
  event_log_pos %= EVENT_LOG_LENGTH;
2075

    
2076
  event_open = config->latency_debug ? en : NULL;
2077
}
2078

    
2079
static inline void
2080
io_close_event(void)
2081
{
2082
  if (event_open)
2083
    io_update_time();
2084
}
2085

    
2086
void
2087
io_log_dump(void)
2088
{
2089
  int i;
2090

    
2091
  log(L_DEBUG "Event log:");
2092
  for (i = 0; i < EVENT_LOG_LENGTH; i++)
2093
  {
2094
    struct event_log_entry *en = event_log + (event_log_pos + i) % EVENT_LOG_LENGTH;
2095
    if (en->hook)
2096
      log(L_DEBUG "  Event 0x%p 0x%p at %8d for %d ms", en->hook, en->data,
2097
          (int) ((last_time - en->timestamp) TO_MS), (int) (en->duration TO_MS));
2098
  }
2099
}
2100

    
2101
void
2102
watchdog_sigalrm(int sig UNUSED)
2103
{
2104
  /* Update last_time and duration, but skip latency check */
2105
  config->latency_limit = 0xffffffff;
2106
  io_update_time();
2107

    
2108
  /* We want core dump */
2109
  abort();
2110
}
2111

    
2112
static inline void
2113
watchdog_start1(void)
2114
{
2115
  io_update_time();
2116

    
2117
  loop_time = last_time;
2118
}
2119

    
2120
static inline void
2121
watchdog_start(void)
2122
{
2123
  io_update_time();
2124

    
2125
  loop_time = last_time;
2126
  event_log_num = 0;
2127

    
2128
  if (config->watchdog_timeout)
2129
  {
2130
    alarm(config->watchdog_timeout);
2131
    watchdog_active = 1;
2132
  }
2133
}
2134

    
2135
static inline void
2136
watchdog_stop(void)
2137
{
2138
  io_update_time();
2139

    
2140
  if (watchdog_active)
2141
  {
2142
    alarm(0);
2143
    watchdog_active = 0;
2144
  }
2145

    
2146
  btime duration = last_time - loop_time;
2147
  if (duration > config->watchdog_warning)
2148
    log(L_WARN "I/O loop cycle took %d ms for %d events",
2149
        (int) (duration TO_MS), event_log_num);
2150
}
2151

    
2152

    
2153
/*
2154
 *        Main I/O Loop
2155
 */
2156

    
2157
volatile int async_config_flag;                /* Asynchronous reconfiguration/dump scheduled */
2158
volatile int async_dump_flag;
2159
volatile int async_shutdown_flag;
2160

    
2161
void
2162
io_init(void)
2163
{
2164
  init_list(&sock_list);
2165
  init_list(&global_event_list);
2166
  krt_io_init();
2167
  // XXX init_times();
2168
  // XXX update_times();
2169
  boot_time = current_time();
2170

    
2171
  u64 now = (u64) current_real_time();
2172
  srandom((uint) (now ^ (now >> 32)));
2173
}
2174

    
2175
static int short_loops = 0;
2176
#define SHORT_LOOP_MAX 10
2177

    
2178
void
2179
io_loop(void)
2180
{
2181
  int poll_tout, timeout;
2182
  int nfds, events, pout;
2183
  timer *t;
2184
  sock *s;
2185
  node *n;
2186
  int fdmax = 256;
2187
  struct pollfd *pfd = xmalloc(fdmax * sizeof(struct pollfd));
2188

    
2189
  watchdog_start1();
2190
  for(;;)
2191
    {
2192
      times_update(&main_timeloop);
2193
      events = ev_run_list(&global_event_list);
2194
      timers_fire(&main_timeloop);
2195
      io_close_event();
2196

    
2197
      // FIXME
2198
      poll_tout = (events ? 0 : 3000); /* Time in milliseconds */
2199
      if (t = timers_first(&main_timeloop))
2200
      {
2201
        times_update(&main_timeloop);
2202
        timeout = (tm_remains(t) TO_MS) + 1;
2203
        poll_tout = MIN(poll_tout, timeout);
2204
      }
2205

    
2206
      nfds = 0;
2207
      WALK_LIST(n, sock_list)
2208
        {
2209
          pfd[nfds] = (struct pollfd) { .fd = -1 }; /* everything other set to 0 by this */
2210
          s = SKIP_BACK(sock, n, n);
2211
          if (s->rx_hook)
2212
            {
2213
              pfd[nfds].fd = s->fd;
2214
              pfd[nfds].events |= POLLIN;
2215
            }
2216
          if (s->tx_hook && s->ttx != s->tpos)
2217
            {
2218
              pfd[nfds].fd = s->fd;
2219
              pfd[nfds].events |= POLLOUT;
2220
            }
2221
          if (pfd[nfds].fd != -1)
2222
            {
2223
              s->index = nfds;
2224
              nfds++;
2225
            }
2226
          else
2227
            s->index = -1;
2228

    
2229
          if (nfds >= fdmax)
2230
            {
2231
              fdmax *= 2;
2232
              pfd = xrealloc(pfd, fdmax * sizeof(struct pollfd));
2233
            }
2234
        }
2235

    
2236
      /*
2237
       * Yes, this is racy. But even if the signal comes before this test
2238
       * and entering poll(), it gets caught on the next timer tick.
2239
       */
2240

    
2241
      if (async_config_flag)
2242
        {
2243
          io_log_event(async_config, NULL);
2244
          async_config();
2245
          async_config_flag = 0;
2246
          continue;
2247
        }
2248
      if (async_dump_flag)
2249
        {
2250
          io_log_event(async_dump, NULL);
2251
          async_dump();
2252
          async_dump_flag = 0;
2253
          continue;
2254
        }
2255
      if (async_shutdown_flag)
2256
        {
2257
          io_log_event(async_shutdown, NULL);
2258
          async_shutdown();
2259
          async_shutdown_flag = 0;
2260
          continue;
2261
        }
2262

    
2263
      /* And finally enter poll() to find active sockets */
2264
      watchdog_stop();
2265
      pout = poll(pfd, nfds, poll_tout);
2266
      watchdog_start();
2267

    
2268
      if (pout < 0)
2269
        {
2270
          if (errno == EINTR || errno == EAGAIN)
2271
            continue;
2272
          die("poll: %m");
2273
        }
2274
      if (pout)
2275
        {
2276
          times_update(&main_timeloop);
2277

    
2278
          /* guaranteed to be non-empty */
2279
          current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
2280

    
2281
          while (current_sock)
2282
            {
2283
              sock *s = current_sock;
2284
              if (s->index == -1)
2285
                {
2286
                  current_sock = sk_next(s);
2287
                  goto next;
2288
                }
2289

    
2290
              int e;
2291
              int steps;
2292

    
2293
              steps = MAX_STEPS;
2294
              if (s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
2295
                do
2296
                  {
2297
                    steps--;
2298
                    io_log_event(s->rx_hook, s->data);
2299
                    e = sk_read(s, pfd[s->index].revents);
2300
                    if (s != current_sock)
2301
                      goto next;
2302
                  }
2303
                while (e && s->rx_hook && steps);
2304

    
2305
              steps = MAX_STEPS;
2306
              if (pfd[s->index].revents & POLLOUT)
2307
                do
2308
                  {
2309
                    steps--;
2310
                    io_log_event(s->tx_hook, s->data);
2311
                    e = sk_write(s);
2312
                    if (s != current_sock)
2313
                      goto next;
2314
                  }
2315
                while (e && steps);
2316

    
2317
              current_sock = sk_next(s);
2318
            next: ;
2319
            }
2320

    
2321
          short_loops++;
2322
          if (events && (short_loops < SHORT_LOOP_MAX))
2323
            continue;
2324
          short_loops = 0;
2325

    
2326
          int count = 0;
2327
          current_sock = stored_sock;
2328
          if (current_sock == NULL)
2329
            current_sock = SKIP_BACK(sock, n, HEAD(sock_list));
2330

    
2331
          while (current_sock && count < MAX_RX_STEPS)
2332
            {
2333
              sock *s = current_sock;
2334
              if (s->index == -1)
2335
                {
2336
                  current_sock = sk_next(s);
2337
                  goto next2;
2338
                }
2339

    
2340
              if (!s->fast_rx && (pfd[s->index].revents & POLLIN) && s->rx_hook)
2341
                {
2342
                  count++;
2343
                  io_log_event(s->rx_hook, s->data);
2344
                  sk_read(s, pfd[s->index].revents);
2345
                  if (s != current_sock)
2346
                    goto next2;
2347
                }
2348

    
2349
              if (pfd[s->index].revents & (POLLHUP | POLLERR))
2350
                {
2351
                  sk_err(s, pfd[s->index].revents);
2352
                  if (s != current_sock)
2353
                    goto next2;
2354
                }
2355

    
2356
              current_sock = sk_next(s);
2357
            next2: ;
2358
            }
2359

    
2360

    
2361
          stored_sock = current_sock;
2362
        }
2363
    }
2364
}
2365

    
2366
void
2367
test_old_bird(char *path)
2368
{
2369
  int fd;
2370
  struct sockaddr_un sa;
2371

    
2372
  fd = socket(AF_UNIX, SOCK_STREAM, 0);
2373
  if (fd < 0)
2374
    die("Cannot create socket: %m");
2375
  if (strlen(path) >= sizeof(sa.sun_path))
2376
    die("Socket path too long");
2377
  bzero(&sa, sizeof(sa));
2378
  sa.sun_family = AF_UNIX;
2379
  strcpy(sa.sun_path, path);
2380
  if (connect(fd, (struct sockaddr *) &sa, SUN_LEN(&sa)) == 0)
2381
    die("I found another BIRD running.");
2382
  close(fd);
2383
}