TFO: better detection of client fast-open connections
[exim.git] / src / src / ip.c
... / ...
CommitLineData
1/*************************************************
2* Exim - an Internet mail transport agent *
3*************************************************/
4
5/* Copyright (c) University of Cambridge 1995 - 2017 */
6/* See the file NOTICE for conditions of use and distribution. */
7
8/* Functions for doing things with sockets. With the advent of IPv6 this has
9got messier, so that it's worth pulling out the code into separate functions
10that other parts of Exim can call, especially as there are now several
11different places in the code where sockets are used. */
12
13
14#include "exim.h"
15
16
17/*************************************************
18* Create a socket *
19*************************************************/
20
21/* Socket creation happens in a number of places so it's packaged here for
22convenience.
23
24Arguments:
25 type SOCK_DGRAM or SOCK_STREAM
26 af AF_INET or AF_INET6
27
28Returns: socket number or -1 on failure
29*/
30
31int
32ip_socket(int type, int af)
33{
34int sock = socket(af, type, 0);
35if (sock < 0)
36 log_write(0, LOG_MAIN, "IPv%c socket creation failed: %s",
37 (af == AF_INET6)? '6':'4', strerror(errno));
38return sock;
39}
40
41
42
43
44#if HAVE_IPV6
45/*************************************************
46* Convert printing address to numeric *
47*************************************************/
48
/* Convert the textual form of an IPv6 address into a struct sockaddr_in6.

By default the conversion is done with getaddrinfo(), constrained by hints to
accept only a numeric IPv6 address; it can (apparently) cope with more
elaborate forms, such as addresses carrying a scope, than inet_pton() can in
some environments. For systems or libraries that lack getaddrinfo(), building
with IPV6_USE_INET_PTON selects inet_pton() instead (no scope support); in
that case only the address and family fields of the result are written.

Arguments:
  address     textual form of the address
  saddr       where to copy back the answer

Returns:      nothing - failure provokes a panic-die
*/

static void
ip_addrinfo(const uschar *address, struct sockaddr_in6 *saddr)
{
#ifdef IPV6_USE_INET_PTON

  if (inet_pton(AF_INET6, CCS address, &saddr->sin6_addr) != 1)
    log_write(0, LOG_MAIN|LOG_PANIC_DIE, "unable to parse \"%s\" as an "
      "IP address", address);
  saddr->sin6_family = AF_INET6;

#else

  struct addrinfo *found;
  struct addrinfo want;
  int gai_rc;

  /* Numeric IPv6 addresses only; no name resolution wanted. */
  memset(&want, 0, sizeof(want));
  want.ai_flags = AI_NUMERICHOST;
  want.ai_socktype = SOCK_STREAM;
  want.ai_family = AF_INET6;

  gai_rc = getaddrinfo(CCS address, NULL, &want, &found);
  if (gai_rc != 0 || found == NULL)
    log_write(0, LOG_MAIN|LOG_PANIC_DIE, "unable to parse \"%s\" as an "
      "IP address: %s", address,
      (gai_rc == 0)? "NULL result returned" : gai_strerror(gai_rc));

  memcpy(saddr, found->ai_addr, found->ai_addrlen);
  freeaddrinfo(found);

#endif
}
93#endif /* HAVE_IPV6 */
94
95
96/*************************************************
97* Bind socket to interface and port *
98*************************************************/
99
100int
101ip_addr(void * sin_, int af, const uschar * address, int port)
102{
103union sockaddr_46 * sin = sin_;
104memset(sin, 0, sizeof(*sin));
105
106/* Setup code when using an IPv6 socket. The wildcard address is ":", to
107ensure an IPv6 socket is used. */
108
109#if HAVE_IPV6
110if (af == AF_INET6)
111 {
112 if (address[0] == ':' && address[1] == 0)
113 {
114 sin->v6.sin6_family = AF_INET6;
115 sin->v6.sin6_addr = in6addr_any;
116 }
117 else
118 ip_addrinfo(address, &sin->v6); /* Panic-dies on error */
119 sin->v6.sin6_port = htons(port);
120 return sizeof(sin->v6);
121 }
122else
123#else /* HAVE_IPv6 */
124af = af; /* Avoid compiler warning */
125#endif /* HAVE_IPV6 */
126
127/* Setup code when using IPv4 socket. The wildcard address is "". */
128
129 {
130 sin->v4.sin_family = AF_INET;
131 sin->v4.sin_port = htons(port);
132 sin->v4.sin_addr.s_addr = address[0] == 0
133 ? (S_ADDR_TYPE)INADDR_ANY
134 : (S_ADDR_TYPE)inet_addr(CS address);
135 return sizeof(sin->v4);
136 }
137}
138
139
140
141/* This function binds a socket to a local interface address and port. For a
142wildcard IPv6 bind, the address is ":".
143
144Arguments:
145 sock the socket
146 af AF_INET or AF_INET6 - the socket type
147 address the IP address, in text form
148 port the IP port (host order)
149
150Returns: the result of bind()
151*/
152
153int
154ip_bind(int sock, int af, uschar *address, int port)
155{
156union sockaddr_46 sin;
157int s_len = ip_addr(&sin, af, address, port);
158return bind(sock, (struct sockaddr *)&sin, s_len);
159}
160
161
162
163/*************************************************
164* Connect socket to remote host *
165*************************************************/
166
/* Connect a socket to a remote address and port. The socket may or may not
already be bound to a local interface. It is never closed here, not even on
failure; that is left to the caller that created it.

As side effects, callout_address is set to "[address]:port" and sigalrm_seen
is cleared before the attempt.

Arguments:
  sock        the socket
  af          AF_INET6 or AF_INET for the socket type
  address     the remote address, in text form
  port        the remote port
  timeout     a timeout (zero for indefinite timeout)
  fastopen    non-null iff TCP_FASTOPEN can be used; may indicate early-data to
                be sent in SYN segment

Returns:      0 on success; -1 on failure, with errno set (ETIMEDOUT when the
                attempt was interrupted by the alarm)
*/

int
ip_connect(int sock, int af, const uschar *address, int port, int timeout,
  const blob * fastopen)
{
struct sockaddr_in peer4;
struct sockaddr *peer;
int peer_len, rc, saved_errno;
int plain_connect = 1;          /* cleared while a fast-open attempt stands */

/* An IPv6 address needs an IPv6 sockaddr structure. */

#if HAVE_IPV6
struct sockaddr_in6 peer6;
if (af == AF_INET6)
  {
  memset(&peer6, 0, sizeof(peer6));
  ip_addrinfo(address, &peer6);         /* Panic-dies on error */
  peer6.sin6_port = htons(port);
  peer = (struct sockaddr *)&peer6;
  peer_len = sizeof(peer6);
  }
else
#else   /* HAVE_IPV6 */
af = af;   /* Avoid compiler warning */
#endif  /* HAVE_IPV6 */

/* An IPv4 address gets an IPv4 sockaddr structure, even on a system with
IPv6 support. */

  {
  memset(&peer4, 0, sizeof(peer4));
  peer4.sin_family = AF_INET;
  peer4.sin_port = htons(port);
  peer4.sin_addr.s_addr = (S_ADDR_TYPE)inet_addr(CCS address);
  peer = (struct sockaddr *)&peer4;
  peer_len = sizeof(peer4);
  }

/* The alarm is only armed for a positive timeout; otherwise the inbuilt OS
timeout is left to operate. */

callout_address = string_sprintf("[%s]:%d", address, port);
sigalrm_seen = FALSE;
if (timeout > 0) alarm(timeout);

#if defined(TCP_FASTOPEN) && defined(MSG_FASTOPEN)
/* TCP Fast Open: if the system holds a cookie from an earlier call to this
peer, data can travel in the SYN packet, and the peer can send data before it
gets our ACK of its SYN,ACK - useful for the SMTP banner. Non-SMTP users of
TCP may be able to use data-on-SYN, so that is supported too. */

if (fastopen)
  {
  plain_connect = 0;
  rc = sendto(sock, fastopen->data, fastopen->len,
              MSG_FASTOPEN | MSG_DONTWAIT, peer, peer_len);

  if (rc >= 0)
    {
    DEBUG(D_transport|D_v)
      debug_printf("TCP_FASTOPEN mode connection, with data\n");
    tcp_out_fastopen = TRUE;
    }
  else if (errno == EINPROGRESS)        /* expected for nonready peer */
    {
    if (!fastopen->data)
      {
      DEBUG(D_transport|D_v)
        debug_printf("TCP_FASTOPEN mode connection, no data\n");
      tcp_out_fastopen = TRUE;
      rc = 0;
      }
    else
      {
      rc = send(sock, fastopen->data, fastopen->len, 0);
      if (rc < 0 && errno == EINPROGRESS)       /* expected for nonready peer */
        rc = 0;
      }
    }
  else if (errno == EOPNOTSUPP)
    {
    DEBUG(D_transport)
      debug_printf("Tried TCP Fast Open but apparently not enabled by sysctl\n");
    plain_connect = 1;                  /* fall back to an ordinary connect */
    }
  }
#endif

/* Ordinary connect(), followed by any early data as a normal send. */

if (plain_connect)
  {
  rc = connect(sock, peer, peer_len);
  if (  rc >= 0
     && fastopen && fastopen->data && fastopen->len
     && send(sock, fastopen->data, fastopen->len, 0) < 0)
    rc = -1;
  }

saved_errno = errno;
alarm(0);

/* Testing facility for simulating a connection timeout: a connection refused
is turned into a timeout when the timeout is set to 999999. */

if (running_in_test_harness && saved_errno == ECONNREFUSED && timeout == 999999)
  {
  rc = -1;
  saved_errno = EINTR;
  sigalrm_seen = TRUE;
  }

if (rc >= 0)
  return 0;             /* Success */

/* "Interrupted system call" is really an externally applied timeout if the
signal handler has been run. */

if (saved_errno == EINTR && sigalrm_seen)
  errno = ETIMEDOUT;
else
  errno = saved_errno;
return -1;
}
299
300
301
302/*************************************************
303* Create connected socket to remote host *
304*************************************************/
305
306/* Create a socket and connect to host (name or number, ipv6 ok)
307 at one of port-range.
308
309Arguments:
310 type SOCK_DGRAM or SOCK_STREAM
311 af AF_INET6 or AF_INET for the socket type
312 address the remote address, in text form
313 portlo,porthi the remote port range
314 timeout a timeout
315 connhost if not NULL, host_item filled in with connection details
316 errstr pointer for allocated string on error
317XXX could add early-data support
318
319Return:
320 socket fd, or -1 on failure (having allocated an error string)
321*/
322int
323ip_connectedsocket(int type, const uschar * hostname, int portlo, int porthi,
324 int timeout, host_item * connhost, uschar ** errstr)
325{
326int namelen, port;
327host_item shost;
328host_item *h;
329int af = 0, fd, fd4 = -1, fd6 = -1;
330blob * fastopen = tcp_fastopen_ok && type == SOCK_STREAM
331 ? &tcp_fastopen_nodata : NULL;
332
333shost.next = NULL;
334shost.address = NULL;
335shost.port = portlo;
336shost.mx = -1;
337
338namelen = Ustrlen(hostname);
339
340/* Anything enclosed in [] must be an IP address. */
341
342if (hostname[0] == '[' &&
343 hostname[namelen - 1] == ']')
344 {
345 uschar * host = string_copyn(hostname+1, namelen-2);
346 if (string_is_ip_address(host, NULL) == 0)
347 {
348 *errstr = string_sprintf("malformed IP address \"%s\"", hostname);
349 return -1;
350 }
351 shost.name = shost.address = host;
352 }
353
354/* Otherwise check for an unadorned IP address */
355
356else if (string_is_ip_address(hostname, NULL) != 0)
357 shost.name = shost.address = string_copyn(hostname, namelen);
358
359/* Otherwise lookup IP address(es) from the name */
360
361else
362 {
363 shost.name = string_copyn(hostname, namelen);
364 if (host_find_byname(&shost, NULL, HOST_FIND_QUALIFY_SINGLE,
365 NULL, FALSE) != HOST_FOUND)
366 {
367 *errstr = string_sprintf("no IP address found for host %s", shost.name);
368 return -1;
369 }
370 }
371
372/* Try to connect to the server - test each IP till one works */
373
374for (h = &shost; h; h = h->next)
375 {
376 fd = Ustrchr(h->address, ':') != 0
377 ? fd6 < 0 ? (fd6 = ip_socket(type, af = AF_INET6)) : fd6
378 : fd4 < 0 ? (fd4 = ip_socket(type, af = AF_INET )) : fd4;
379
380 if (fd < 0)
381 {
382 *errstr = string_sprintf("failed to create socket: %s", strerror(errno));
383 goto bad;
384 }
385
386 for(port = portlo; port <= porthi; port++)
387 if (ip_connect(fd, af, h->address, port, timeout, fastopen) == 0)
388 {
389 if (fd != fd6) close(fd6);
390 if (fd != fd4) close(fd4);
391 if (connhost)
392 {
393 h->port = port;
394 *connhost = *h;
395 connhost->next = NULL;
396 }
397 return fd;
398 }
399 }
400
401*errstr = string_sprintf("failed to connect to any address for %s: %s",
402 hostname, strerror(errno));
403
404bad:
405 close(fd4); close(fd6); return -1;
406}
407
408
409int
410ip_tcpsocket(const uschar * hostport, uschar ** errstr, int tmo)
411{
412int scan;
413uschar hostname[256];
414unsigned int portlow, porthigh;
415
416/* extract host and port part */
417scan = sscanf(CS hostport, "%255s %u-%u", hostname, &portlow, &porthigh);
418if (scan != 3)
419 {
420 if (scan != 2)
421 {
422 *errstr = string_sprintf("invalid socket '%s'", hostport);
423 return -1;
424 }
425 porthigh = portlow;
426 }
427
428return ip_connectedsocket(SOCK_STREAM, hostname, portlow, porthigh,
429 tmo, NULL, errstr);
430}
431
432int
433ip_unixsocket(const uschar * path, uschar ** errstr)
434{
435int sock;
436struct sockaddr_un server;
437
438if ((sock = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
439 {
440 *errstr = US"can't open UNIX socket.";
441 return -1;
442 }
443
444callout_address = string_copy(path);
445server.sun_family = AF_UNIX;
446Ustrncpy(server.sun_path, path, sizeof(server.sun_path)-1);
447server.sun_path[sizeof(server.sun_path)-1] = '\0';
448if (connect(sock, (struct sockaddr *) &server, sizeof(server)) < 0)
449 {
450 int err = errno;
451 (void)close(sock);
452 *errstr = string_sprintf("unable to connect to UNIX socket (%s): %s",
453 path, strerror(err));
454 return -1;
455 }
456return sock;
457}
458
459int
460ip_streamsocket(const uschar * spec, uschar ** errstr, int tmo)
461{
462return *spec == '/'
463 ? ip_unixsocket(spec, errstr) : ip_tcpsocket(spec, errstr, tmo);
464}
465
466/*************************************************
467* Set keepalive on a socket *
468*************************************************/
469
470/* Can be called for both incoming and outgoing sockets.
471
472Arguments:
473 sock the socket
474 address the remote host address, for failure logging
475 torf true for outgoing connection, false for incoming
476
477Returns: nothing
478*/
479
480void
481ip_keepalive(int sock, const uschar *address, BOOL torf)
482{
483int fodder = 1;
484if (setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
485 US (&fodder), sizeof(fodder)) != 0)
486 log_write(0, LOG_MAIN, "setsockopt(SO_KEEPALIVE) on connection %s %s "
487 "failed: %s", torf? "to":"from", address, strerror(errno));
488}
489
490
491
492/*************************************************
493* Receive from a socket with timeout *
494*************************************************/
495
496/*
497Arguments:
498 fd the file descriptor
499 timeout the timeout, seconds
500Returns: TRUE => ready for i/o
501 FALSE => timed out, or other error
502*/
503BOOL
504fd_ready(int fd, int timeout)
505{
506fd_set select_inset;
507time_t start_recv = time(NULL);
508int time_left = timeout;
509int rc;
510
511if (time_left <= 0)
512 {
513 errno = ETIMEDOUT;
514 return FALSE;
515 }
516/* Wait until the socket is ready */
517
518do
519 {
520 struct timeval tv = { .tv_sec = time_left, .tv_usec = 0 };
521 FD_ZERO (&select_inset);
522 FD_SET (fd, &select_inset);
523
524 /*DEBUG(D_transport) debug_printf("waiting for data on fd\n");*/
525 rc = select(fd + 1, (SELECT_ARG2_TYPE *)&select_inset, NULL, NULL, &tv);
526
527 /* If some interrupt arrived, just retry. We presume this to be rare,
528 but it can happen (e.g. the SIGUSR1 signal sent by exiwhat causes
529 select() to exit).
530
531 Aug 2004: Somebody set up a cron job that ran exiwhat every 2 minutes, making
532 the interrupt not at all rare. Since the timeout is typically more than 2
533 minutes, the effect was to block the timeout completely. To prevent this
534 happening again, we do an explicit time test and adjust the timeout
535 accordingly */
536
537 if (rc < 0 && errno == EINTR)
538 {
539 DEBUG(D_transport) debug_printf("EINTR while waiting for socket data\n");
540
541 /* Watch out, 'continue' jumps to the condition, not to the loops top */
542 time_left = timeout - (time(NULL) - start_recv);
543 if (time_left > 0) continue;
544 }
545
546 if (rc <= 0)
547 {
548 errno = ETIMEDOUT;
549 return FALSE;
550 }
551
552 /* Checking the FD_ISSET is not enough, if we're interrupted, the
553 select_inset may still contain the 'input'. */
554 }
555while (rc < 0 || !FD_ISSET(fd, &select_inset));
556return TRUE;
557}
558
559/* The timeout is implemented using select(), and we loop to cover select()
560getting interrupted, and the possibility of select() returning with a positive
561result but no ready descriptor. Is this in fact possible?
562
563Arguments:
564 sock the socket
565 buffer to read into
566 bufsize the buffer size
567 timeout the timeout
568
569Returns: > 0 => that much data read
570 <= 0 on error or EOF; errno set - zero for EOF
571*/
572
573int
574ip_recv(int sock, uschar *buffer, int buffsize, int timeout)
575{
576int rc;
577
578if (!fd_ready(sock, timeout))
579 return -1;
580
581/* The socket is ready, read from it (via TLS if it's active). On EOF (i.e.
582close down of the connection), set errno to zero; otherwise leave it alone. */
583
584#ifdef SUPPORT_TLS
585if (tls_out.active == sock)
586 rc = tls_read(FALSE, buffer, buffsize);
587else if (tls_in.active == sock)
588 rc = tls_read(TRUE, buffer, buffsize);
589else
590#endif
591 rc = recv(sock, buffer, buffsize, 0);
592
593if (rc > 0) return rc;
594if (rc == 0) errno = 0;
595return -1;
596}
597
598
599
600
601/*************************************************
602* Lookup address family of potential socket *
603*************************************************/
604
/* Find out whether a file descriptor is a socket and, if it is, which
address family it belongs to; this distinguishes IPv4 from IPv6. The answer
comes from getsockname().

The value 0 is typically AF_UNSPEC, which should not be seen on a connected
fd. When -1 is returned, errno is the one left by getsockname(); probably
ENOTSOCK or ECONNRESET.

Arguments:     socket-or-not fd
Returns:       address family or -1
*/

int
ip_get_address_family(int fd)
{
struct sockaddr_storage local;
socklen_t local_len = sizeof(local);

return getsockname(fd, (struct sockaddr *) &local, &local_len) < 0
  ? -1
  : (int) local.ss_family;
}
628
629
630
631
632/*************************************************
633* Lookup DSCP settings for a socket *
634*************************************************/
635
636struct dscp_name_tableentry {
637 const uschar *name;
638 int value;
639};
640/* Keep both of these tables sorted! */
641static struct dscp_name_tableentry dscp_table[] = {
642#ifdef IPTOS_DSCP_AF11
643 { CUS"af11", IPTOS_DSCP_AF11 },
644 { CUS"af12", IPTOS_DSCP_AF12 },
645 { CUS"af13", IPTOS_DSCP_AF13 },
646 { CUS"af21", IPTOS_DSCP_AF21 },
647 { CUS"af22", IPTOS_DSCP_AF22 },
648 { CUS"af23", IPTOS_DSCP_AF23 },
649 { CUS"af31", IPTOS_DSCP_AF31 },
650 { CUS"af32", IPTOS_DSCP_AF32 },
651 { CUS"af33", IPTOS_DSCP_AF33 },
652 { CUS"af41", IPTOS_DSCP_AF41 },
653 { CUS"af42", IPTOS_DSCP_AF42 },
654 { CUS"af43", IPTOS_DSCP_AF43 },
655 { CUS"ef", IPTOS_DSCP_EF },
656#endif
657#ifdef IPTOS_LOWCOST
658 { CUS"lowcost", IPTOS_LOWCOST },
659#endif
660 { CUS"lowdelay", IPTOS_LOWDELAY },
661#ifdef IPTOS_MINCOST
662 { CUS"mincost", IPTOS_MINCOST },
663#endif
664 { CUS"reliability", IPTOS_RELIABILITY },
665 { CUS"throughput", IPTOS_THROUGHPUT }
666};
667static int dscp_table_size =
668 sizeof(dscp_table) / sizeof(struct dscp_name_tableentry);
669
670/* DSCP values change by protocol family, and so do the options used for
671setsockopt(); this utility does all the lookups. It takes an unexpanded
672option string, expands it, strips off affix whitespace, then checks if it's
673a number. If all of what's left is a number, then that's how the option will
674be parsed and success/failure is a range check. If it's not all a number,
675then it must be a supported keyword.
676
677Arguments:
678 dscp_name a string, so far unvalidated
679 af address_family in use
680 level setsockopt level to use
681 optname setsockopt name to use
682 dscp_value value for dscp_name
683
684Returns: TRUE if okay to setsockopt(), else FALSE
685
686*level and *optname may be set even if FALSE is returned
687*/
688
689BOOL
690dscp_lookup(const uschar *dscp_name, int af,
691 int *level, int *optname, int *dscp_value)
692{
693uschar *dscp_lookup, *p;
694int first, last;
695long rawlong;
696
697if (af == AF_INET)
698 {
699 *level = IPPROTO_IP;
700 *optname = IP_TOS;
701 }
702#if HAVE_IPV6 && defined(IPV6_TCLASS)
703else if (af == AF_INET6)
704 {
705 *level = IPPROTO_IPV6;
706 *optname = IPV6_TCLASS;
707 }
708#endif
709else
710 {
711 DEBUG(D_transport)
712 debug_printf("Unhandled address family %d in dscp_lookup()\n", af);
713 return FALSE;
714 }
715if (!dscp_name)
716 {
717 DEBUG(D_transport)
718 debug_printf("[empty DSCP]\n");
719 return FALSE;
720 }
721dscp_lookup = expand_string(US dscp_name);
722if (dscp_lookup == NULL || *dscp_lookup == '\0')
723 return FALSE;
724
725p = dscp_lookup + Ustrlen(dscp_lookup) - 1;
726while (isspace(*p)) *p-- = '\0';
727while (isspace(*dscp_lookup) && dscp_lookup < p) dscp_lookup++;
728if (*dscp_lookup == '\0')
729 return FALSE;
730
731rawlong = Ustrtol(dscp_lookup, &p, 0);
732if (p != dscp_lookup && *p == '\0')
733 {
734 /* We have six bits available, which will end up shifted to fit in 0xFC mask.
735 RFC 2597 defines the values unshifted. */
736 if (rawlong < 0 || rawlong > 0x3F)
737 {
738 DEBUG(D_transport)
739 debug_printf("DSCP value %ld out of range, ignored.\n", rawlong);
740 return FALSE;
741 }
742 *dscp_value = rawlong << 2;
743 return TRUE;
744 }
745
746first = 0;
747last = dscp_table_size;
748while (last > first)
749 {
750 int middle = (first + last)/2;
751 int c = Ustrcmp(dscp_lookup, dscp_table[middle].name);
752 if (c == 0)
753 {
754 *dscp_value = dscp_table[middle].value;
755 return TRUE;
756 }
757 else if (c > 0)
758 first = middle + 1;
759 else
760 last = middle;
761 }
762return FALSE;
763}
764
765void
766dscp_list_to_stream(FILE *stream)
767{
768int i;
769for (i=0; i < dscp_table_size; ++i)
770 fprintf(stream, "%s\n", dscp_table[i].name);
771}
772
773
774/* End of ip.c */
775/* vi: aw ai sw=2
776*/