Fix too-often retry bug after 4xx with more hosts than hosts_max_try.
[exim.git] / src / src / retry.c
1 /* $Cambridge: exim/src/src/retry.c,v 1.11 2006/07/03 15:39:06 ph10 Exp $ */
2
3 /*************************************************
4 * Exim - an Internet mail transport agent *
5 *************************************************/
6
7 /* Copyright (c) University of Cambridge 1995 - 2006 */
8 /* See the file NOTICE for conditions of use and distribution. */
9
10 /* Functions concerned with retrying unsuccessful deliveries. */
11
12
13 #include "exim.h"
14
15
16
17 /*************************************************
18 * Check the ultimate address timeout *
19 *************************************************/
20
21 /* This function tests whether a message has been on the queue longer than
22 the maximum retry time for a particular host.
23
24 Arguments:
25 host_key the key to look up a host retry rule
26 domain the domain to look up a domain retry rule
27 basic_errno a specific error number, or zero if none
28 more_errno additional data for the error
29 now the time
30
31 Returns: TRUE if the ultimate timeout has been reached
32 */
33
34 static BOOL
35 ultimate_address_timeout(uschar *host_key, uschar *domain, int basic_errno,
36 int more_errno, time_t now)
37 {
38 BOOL address_timeout = TRUE; /* no rule => timed out */
39
40 retry_config *retry =
41 retry_find_config(host_key+2, domain, basic_errno, more_errno);
42
43 if (retry != NULL && retry->rules != NULL)
44 {
45 retry_rule *last_rule;
46 for (last_rule = retry->rules;
47 last_rule->next != NULL;
48 last_rule = last_rule->next);
49 DEBUG(D_transport|D_retry)
50 debug_printf(" received_time=%d diff=%d timeout=%d\n",
51 received_time, (int)(now - received_time), last_rule->timeout);
52 address_timeout = (now - received_time > last_rule->timeout);
53 }
54 else
55 {
56 DEBUG(D_transport|D_retry)
57 debug_printf("no retry rule found: assume timed out\n");
58 }
59
60 return address_timeout;
61 }
62
63
64
65 /*************************************************
66 * Set status of a host+address item *
67 *************************************************/
68
69 /* This function is passed a host_item which contains a host name and an
70 IP address string. Its job is to set the status of the address if it is not
71 already set (indicated by hstatus_unknown). The possible values are:
72
73 hstatus_usable the address is not listed in the unusable tree, and does
74 not have a retry record, OR the time is past the next
75 try time, OR the message has been on the queue for more
76 than the maximum retry time for a failing host
77
78 hstatus_unusable the address is listed in the unusable tree, or does have
79 a retry record, and the time is not yet at the next retry
80 time.
81
82 hstatus_unusable_expired as above, but also the retry time has expired
83 for this address.
84
85 The reason a delivery is permitted when a message has been around for a very
86 long time is to allow the ultimate address timeout to operate after a delivery
87 failure. Otherwise some messages may stick around without being tried for too
88 long.
89
90 If a host retry record is retrieved from the hints database, the time of last
91 trying is filled into the last_try field of the host block. If a host is
92 generally usable, a check is made to see if there is a retry delay on this
93 specific message at this host.
94
95 If a non-standard port is being used, it is added to the retry key.
96
97 Arguments:
98 domain the address domain
99 host pointer to a host item
100 portstring "" for standard port, ":xxxx" for a non-standard port
101 include_ip_address TRUE to include the address in the key - this is
102 usual, but sometimes is not wanted
103 retry_host_key where to put a pointer to the key for the host-specific
104 retry record, if one is read and the host is usable
105 retry_message_key where to put a pointer to the key for the message+host
106 retry record, if one is read and the host is usable
107
108 Returns: TRUE if the host has expired but is usable because
109 its retry time has come
110 */
111
112 BOOL
113 retry_check_address(uschar *domain, host_item *host, uschar *portstring,
114 BOOL include_ip_address, uschar **retry_host_key, uschar **retry_message_key)
115 {
116 BOOL yield = FALSE;
117 time_t now = time(NULL);
118 uschar *host_key, *message_key;
119 open_db dbblock;
120 open_db *dbm_file;
121 tree_node *node;
122 dbdata_retry *host_retry_record, *message_retry_record;
123
124 *retry_host_key = *retry_message_key = NULL;
125
126 DEBUG(D_transport|D_retry) debug_printf("checking status of %s\n", host->name);
127
128 /* Do nothing if status already set; otherwise initialize status as usable. */
129
130 if (host->status != hstatus_unknown) return FALSE;
131 host->status = hstatus_usable;
132
133 /* Generate the host key for the unusable tree and the retry database. Ensure
134 host names are lower cased (that's what %S does). */
135
136 host_key = include_ip_address?
137 string_sprintf("T:%S:%s%s", host->name, host->address, portstring) :
138 string_sprintf("T:%S%s", host->name, portstring);
139
140 /* Generate the message-specific key */
141
142 message_key = string_sprintf("%s:%s", host_key, message_id);
143
144 /* Search the tree of unusable IP addresses. This is filled in when deliveries
145 fail, because the retry database itself is not updated until the end of all
146 deliveries (so as to do it all in one go). The tree records addresses that have
147 become unusable during this delivery process (i.e. those that will get put into
148 the retry database when it is updated). */
149
150 node = tree_search(tree_unusable, host_key);
151 if (node != NULL)
152 {
153 DEBUG(D_transport|D_retry) debug_printf("found in tree of unusables\n");
154 host->status = (node->data.val > 255)?
155 hstatus_unusable_expired : hstatus_unusable;
156 host->why = node->data.val & 255;
157 return FALSE;
158 }
159
160 /* Open the retry database, giving up if there isn't one. Otherwise, search for
161 the retry records, and then close the database again. */
162
163 if ((dbm_file = dbfn_open(US"retry", O_RDONLY, &dbblock, FALSE)) == NULL)
164 {
165 DEBUG(D_deliver|D_retry|D_hints_lookup)
166 debug_printf("no retry data available\n");
167 return FALSE;
168 }
169 host_retry_record = dbfn_read(dbm_file, host_key);
170 message_retry_record = dbfn_read(dbm_file, message_key);
171 dbfn_close(dbm_file);
172
173 /* Ignore the data if it is too old - too long since it was written */
174
175 if (host_retry_record == NULL)
176 {
177 DEBUG(D_transport|D_retry) debug_printf("no host retry record\n");
178 }
179 else if (now - host_retry_record->time_stamp > retry_data_expire)
180 {
181 host_retry_record = NULL;
182 DEBUG(D_transport|D_retry) debug_printf("host retry record too old\n");
183 }
184
185 if (message_retry_record == NULL)
186 {
187 DEBUG(D_transport|D_retry) debug_printf("no message retry record\n");
188 }
189 else if (now - message_retry_record->time_stamp > retry_data_expire)
190 {
191 message_retry_record = NULL;
192 DEBUG(D_transport|D_retry) debug_printf("message retry record too old\n");
193 }
194
195 /* If there's a host-specific retry record, check for reaching the retry
196 time (or forcing). If not, and the host is not expired, check for the message
197 having been around for longer than the maximum retry time for this host or
198 address. Allow the delivery if it has. Otherwise set the appropriate unusable
199 flag and return FALSE. Otherwise arrange to return TRUE if this is an expired
200 host. */
201
202 if (host_retry_record != NULL)
203 {
204 *retry_host_key = host_key;
205
206 /* We have not reached the next try time. Check for the ultimate address
207 timeout if the host has not expired. */
208
209 if (now < host_retry_record->next_try && !deliver_force)
210 {
211 DEBUG(D_transport|D_retry)
212 {
213 debug_printf("host retry time not reached: checking ultimate address "
214 "timeout\n");
215 debug_printf(" now=%d first_failed=%d next_try=%d expired=%d\n",
216 (int)now, (int)host_retry_record->first_failed,
217 (int)host_retry_record->next_try,
218 host_retry_record->expired);
219 }
220
221 if (!host_retry_record->expired &&
222 ultimate_address_timeout(host_key, domain,
223 host_retry_record->basic_errno, host_retry_record->more_errno, now))
224 {
225 DEBUG(D_transport|D_retry)
226 debug_printf("on queue longer than maximum retry for "
227 "address - allowing delivery\n");
228 return FALSE;
229 }
230
231 /* We have not hit the ultimate address timeout; host is unusable. */
232
233 host->status = (host_retry_record->expired)?
234 hstatus_unusable_expired : hstatus_unusable;
235 host->why = hwhy_retry;
236 host->last_try = host_retry_record->last_try;
237 return FALSE;
238 }
239
240 /* Host is usable; set return TRUE if expired. */
241
242 yield = host_retry_record->expired;
243 }
244
245 /* It's OK to try the host. If there's a message-specific retry record, check
246 for reaching its retry time (or forcing). If not, mark the host unusable,
247 unless the ultimate address timeout has been reached. */
248
249 if (message_retry_record != NULL)
250 {
251 *retry_message_key = message_key;
252 if (now < message_retry_record->next_try && !deliver_force)
253 {
254 DEBUG(D_transport|D_retry)
255 {
256 debug_printf("host+message retry time not reached: checking ultimate "
257 "address timeout\n");
258 debug_printf(" now=%d first_failed=%d next_try=%d expired=%d\n",
259 (int)now, (int)message_retry_record->first_failed,
260 (int)message_retry_record->next_try, message_retry_record->expired);
261 }
262 if (!ultimate_address_timeout(host_key, domain, 0, 0, now))
263 {
264 host->status = hstatus_unusable;
265 host->why = hwhy_retry;
266 }
267 else
268 {
269 DEBUG(D_transport|D_retry)
270 debug_printf("on queue longer than maximum retry for "
271 "address - allowing delivery\n");
272 }
273 return FALSE;
274 }
275 }
276
277 return yield;
278 }
279
280
281
282
283 /*************************************************
284 * Add a retry item to an address *
285 *************************************************/
286
287 /* Retry items are chained onto an address when it is deferred either by router
288 or by a transport, or if it succeeds or fails and there was a previous retry
289 item that now needs to be deleted. Sometimes there can be both kinds of item:
290 for example, if routing was deferred but then succeeded, and delivery then
291 deferred. In that case there is a delete item for the routing retry, and an
292 updating item for the delivery.
293
294 (But note that that is only visible at the outer level, because in remote
295 delivery subprocesses, the address starts "clean", with no retry items carried
296 in.)
297
298 These items are used at the end of a delivery attempt to update the retry
299 database. The keys start R: for routing delays and T: for transport delays.
300
301 Arguments:
302 addr the address block onto which to hang the item
303 key the retry key
304 flags delete, host, and message flags, copied into the block
305
306 Returns: nothing
307 */
308
309 void
310 retry_add_item(address_item *addr, uschar *key, int flags)
311 {
312 retry_item *rti = store_get(sizeof(retry_item));
313 rti->next = addr->retries;
314 addr->retries = rti;
315 rti->key = key;
316 rti->basic_errno = addr->basic_errno;
317 rti->more_errno = addr->more_errno;
318 rti->message = addr->message;
319 rti->flags = flags;
320
321 DEBUG(D_transport|D_retry)
322 {
323 int letter = rti->more_errno & 255;
324 debug_printf("added retry item for %s: errno=%d more_errno=", rti->key,
325 rti->basic_errno);
326 if (letter == 'A' || letter == 'M')
327 debug_printf("%d,%c", (rti->more_errno >> 8) & 255, letter);
328 else
329 debug_printf("%d", rti->more_errno);
330 debug_printf(" flags=%d\n", flags);
331 }
332 }
333
334
335
336 /*************************************************
337 * Find retry configuration data *
338 *************************************************/
339
340 /* Search the in-store retry information for the first retry item that applies
341 to a given destination. If the key contains an @ we are probably handling a
342 local delivery and have a complete address to search for; this happens when
343 retry_use_local_part is set on a router. Otherwise, the key is likely to be a
344 host name for a remote delivery, or a domain name for a local delivery. We
345 prepend *@ on the front of it so that it will match a retry item whose address
346 item pattern is independent of the local part. The alternate key, if set, is
347 always just a domain, so we treat it likewise.
348
349 Arguments:
350 key key for which retry info is wanted
351 alternate alternative key, always just a domain
352 basic_errno specific error predicate on the retry rule, or zero
353 more_errno additional data for errno predicate
354
355 Returns: pointer to retry rule, or NULL
356 */
357
358 retry_config *
359 retry_find_config(uschar *key, uschar *alternate, int basic_errno,
360 int more_errno)
361 {
362 int replace = 0;
363 uschar *use_key, *use_alternate;
364 uschar *colon = Ustrchr(key, ':');
365 retry_config *yield;
366
367 /* If there's a colon in the key, there are two possibilities:
368
369 (1) This is a key for a host, ip address, and possibly port, in the format
370
371 hostname:ip+port
372
373 In this case, we temporarily replace the colon with a zero, to terminate
374 the string after the host name.
375
376 (2) This is a key for a pipe, file, or autoreply delivery, in the format
377
378 pipe-or-file-or-auto:x@y
379
380 where x@y is the original address that provoked the delivery. The pipe or
381 file or auto will start with | or / or >, whereas a host name will start
382 with a letter or a digit. In this case we want to use the original address
383 to search for a retry rule. */
384
385 if (colon != NULL)
386 {
387 if (isalnum(*key))
388 replace = ':';
389 else
390 key = Ustrrchr(key, ':') + 1; /* Take from the last colon */
391 }
392
393 if (replace == 0) colon = key + Ustrlen(key);
394 *colon = 0;
395
396 /* Sort out the keys */
397
398 use_key = (Ustrchr(key, '@') != NULL)? key : string_sprintf("*@%s", key);
399 use_alternate = (alternate == NULL)? NULL : string_sprintf("*@%s", alternate);
400
401 /* Scan the configured retry items. */
402
403 for (yield = retries; yield != NULL; yield = yield->next)
404 {
405 uschar *plist = yield->pattern;
406 uschar *slist = yield->senders;
407
408 /* If a specific error is set for this item, check that we are handling that
409 specific error, and if so, check any additional error information if
410 required. */
411
412 if (yield->basic_errno != 0)
413 {
414 /* Special code is required for quota errors, as these can either be system
415 quota errors, or Exim's own quota imposition, which has a different error
416 number. Full partitions are also treated in the same way as quota errors.
417 */
418
419 if (yield->basic_errno == ERRNO_EXIMQUOTA)
420 {
421 if ((basic_errno != ERRNO_EXIMQUOTA && basic_errno != errno_quota &&
422 basic_errno != ENOSPC) ||
423 (yield->more_errno != 0 && yield->more_errno > more_errno))
424 continue;
425 }
426
427 /* The TLSREQUIRED error also covers TLSFAILURE. These are subtly different
428 errors, but not worth separating at this level. */
429
430 else if (yield->basic_errno == ERRNO_TLSREQUIRED)
431 {
432 if (basic_errno != ERRNO_TLSREQUIRED && basic_errno != ERRNO_TLSFAILURE)
433 continue;
434 }
435
436 /* Handle 4xx responses to MAIL, RCPT, or DATA. The code that was received
437 is in the 2nd least significant byte of more_errno (with 400 subtracted).
438 The required value is coded in the 2nd least significant byte of the
439 yield->more_errno field as follows:
440
441 255 => any 4xx code
442 >= 100 => the decade must match the value less 100
443 < 100 => the exact value must match
444 */
445
446 else if (yield->basic_errno == ERRNO_MAIL4XX ||
447 yield->basic_errno == ERRNO_RCPT4XX ||
448 yield->basic_errno == ERRNO_DATA4XX)
449 {
450 int wanted;
451 if (basic_errno != yield->basic_errno) continue;
452 wanted = (yield->more_errno >> 8) & 255;
453 if (wanted != 255)
454 {
455 int evalue = (more_errno >> 8) & 255;
456 if (wanted >= 100)
457 {
458 if ((evalue/10)*10 != wanted - 100) continue;
459 }
460 else if (evalue != wanted) continue;
461 }
462 }
463
464 /* There are some special cases for timeouts */
465
466 else if (yield->basic_errno == ETIMEDOUT)
467 {
468 if (basic_errno != ETIMEDOUT) continue;
469
470 /* Just RTEF_CTOUT in the rule => don't care about 'A'/'M' addresses */
471 if (yield->more_errno == RTEF_CTOUT)
472 {
473 if ((more_errno & RTEF_CTOUT) == 0) continue;
474 }
475
476 else if (yield->more_errno != 0)
477 {
478 int cf_errno = more_errno;
479 if ((yield->more_errno & RTEF_CTOUT) == 0) cf_errno &= ~RTEF_CTOUT;
480 if (yield->more_errno != cf_errno) continue;
481 }
482 }
483
484 /* Default checks for exact match */
485
486 else
487 {
488 if (yield->basic_errno != basic_errno ||
489 (yield->more_errno != 0 && yield->more_errno != more_errno))
490 continue;
491 }
492 }
493
494 /* If the "senders" condition is set, check it. Note that sender_address may
495 be null during -brt checking, in which case we do not use this rule. */
496
497 if (slist != NULL && (sender_address == NULL ||
498 match_address_list(sender_address, TRUE, TRUE, &slist, NULL, -1, 0,
499 NULL) != OK))
500 continue;
501
502 /* Check for a match between the address list item at the start of this retry
503 rule and either the main or alternate keys. */
504
505 if (match_address_list(use_key, TRUE, TRUE, &plist, NULL, -1, UCHAR_MAX+1,
506 NULL) == OK ||
507 (use_alternate != NULL &&
508 match_address_list(use_alternate, TRUE, TRUE, &plist, NULL, -1,
509 UCHAR_MAX+1, NULL) == OK))
510 break;
511 }
512
513 *colon = replace;
514 return yield;
515 }
516
517
518
519
520 /*************************************************
521 * Update retry database *
522 *************************************************/
523
524 /* Update the retry data for any directing/routing/transporting that was
525 deferred, or delete it for those that succeeded after a previous defer. This is
526 done all in one go to minimize opening/closing/locking of the database file.
527
528 Note that, because SMTP delivery involves a list of destinations to try, there
529 may be defer-type retry information for some of them even when the message was
530 successfully delivered. Likewise if it eventually failed.
531
532 This function may move addresses from the defer to the failed queue if the
533 ultimate retry time has expired.
534
535 Arguments:
536 addr_defer queue of deferred addresses
537 addr_failed queue of failed addresses
538 addr_succeed queue of successful addresses
539
540 Returns: nothing
541 */
542
543 void
544 retry_update(address_item **addr_defer, address_item **addr_failed,
545 address_item **addr_succeed)
546 {
547 open_db dbblock;
548 open_db *dbm_file = NULL;
549 time_t now = time(NULL);
550 int i;
551
552 DEBUG(D_retry) debug_printf("Processing retry items\n");
553
554 /* Three-times loop to handle succeeded, failed, and deferred addresses.
555 Deferred addresses must be handled after failed ones, because some may be moved
556 to the failed chain if they have timed out. */
557
558 for (i = 0; i < 3; i++)
559 {
560 address_item *endaddr, *addr;
561 address_item *last_first = NULL;
562 address_item **paddr = (i==0)? addr_succeed :
563 (i==1)? addr_failed : addr_defer;
564 address_item **saved_paddr = NULL;
565
566 DEBUG(D_retry) debug_printf("%s addresses:\n", (i == 0)? "Succeeded" :
567 (i == 1)? "Failed" : "Deferred");
568
569 /* Loop for each address on the chain. For deferred addresses, the whole
570 address times out unless one of its retry addresses has a retry rule that
571 hasn't yet timed out. Deferred addresses should not be requesting deletion
572 of retry items, but just in case they do by accident, treat that case
573 as "not timed out".
574
575 As well as handling the addresses themselves, we must also process any
576 retry items for any parent addresses - these are typically "delete" items,
577 because the parent must have succeeded in order to generate the child. */
578
579 while ((endaddr = *paddr) != NULL)
580 {
581 BOOL timed_out = FALSE;
582 retry_item *rti;
583
584 for (addr = endaddr; addr != NULL; addr = addr->parent)
585 {
586 int update_count = 0;
587 int timedout_count = 0;
588
589 DEBUG(D_retry) debug_printf("%s%s\n", addr->address, (addr->retries == NULL)?
590 ": no retry items" : "");
591
592 /* Loop for each retry item. */
593
594 for (rti = addr->retries; rti != NULL; rti = rti->next)
595 {
596 uschar *message;
597 int message_length, message_space, failing_interval, next_try;
598 retry_rule *rule, *final_rule;
599 retry_config *retry;
600 dbdata_retry *retry_record;
601
602 /* Open the retry database if it is not already open; failure to open
603 the file is logged, but otherwise ignored - deferred addresses will
604 get retried at the next opportunity. Not opening earlier than this saves
605 opening if no addresses have retry items - common when none have yet
606 reached their retry next try time. */
607
608 if (dbm_file == NULL)
609 dbm_file = dbfn_open(US"retry", O_RDWR, &dbblock, TRUE);
610
611 if (dbm_file == NULL)
612 {
613 DEBUG(D_deliver|D_retry|D_hints_lookup)
614 debug_printf("retry database not available for updating\n");
615 return;
616 }
617
618 /* If there are no deferred addresses, that is, if this message is
619 completing, and the retry item is for a message-specific SMTP error,
620 force it to be deleted, because there's no point in keeping data for
621 no-longer-existing messages. This situation can occur when a domain has
622 two hosts and a message-specific error occurs for the first of them,
623 but the address gets delivered to the second one. This optimization
624 doesn't succeed in cleaning out all the dead entries, but it helps. */
625
626 if (*addr_defer == NULL && (rti->flags & rf_message) != 0)
627 rti->flags |= rf_delete;
628
629 /* Handle the case of a request to delete the retry info for this
630 destination. */
631
632 if ((rti->flags & rf_delete) != 0)
633 {
634 (void)dbfn_delete(dbm_file, rti->key);
635 DEBUG(D_retry)
636 debug_printf("deleted retry information for %s\n", rti->key);
637 continue;
638 }
639
640 /* Count the number of non-delete retry items. This is so that we
641 can compare it to the count of timed_out ones, to check whether
642 all are timed out. */
643
644 update_count++;
645
646 /* Get the retry information for this destination and error code, if
647 any. If this item is for a remote host with ip address, then pass
648 the domain name as an alternative to search for. If no retry
649 information is found, we can't generate a retry time, so there is
650 no point updating the database. This retry item is timed out. */
651
652 if ((retry = retry_find_config(rti->key + 2,
653 ((rti->flags & rf_host) != 0)? addr->domain : NULL,
654 rti->basic_errno, rti->more_errno)) == NULL)
655 {
656 DEBUG(D_retry) debug_printf("No configured retry item for %s%s%s\n",
657 rti->key,
658 ((rti->flags & rf_host) != 0)? US" or " : US"",
659 ((rti->flags & rf_host) != 0)? addr->domain : US"");
660 if (addr == endaddr) timedout_count++;
661 continue;
662 }
663
664 DEBUG(D_retry)
665 {
666 if ((rti->flags & rf_host) != 0)
667 debug_printf("retry for %s (%s) = %s %d %d\n", rti->key,
668 addr->domain, retry->pattern, retry->basic_errno,
669 retry->more_errno);
670 else
671 debug_printf("retry for %s = %s %d %d\n", rti->key, retry->pattern,
672 retry->basic_errno, retry->more_errno);
673 }
674
675 /* Set up the message for the database retry record. Because DBM
676 records have a maximum data length, we enforce a limit. There isn't
677 much point in keeping a huge message here, anyway. */
678
679 message = (rti->basic_errno > 0)? US strerror(rti->basic_errno) :
680 (rti->message == NULL)?
681 US"unknown error" : string_printing(rti->message);
682 message_length = Ustrlen(message);
683 if (message_length > 150) message_length = 150;
684
685 /* Read a retry record from the database or construct a new one.
686 Ignore an old one if it is too old since it was last updated. */
687
688 retry_record = dbfn_read(dbm_file, rti->key);
689 if (retry_record != NULL &&
690 now - retry_record->time_stamp > retry_data_expire)
691 retry_record = NULL;
692
693 if (retry_record == NULL)
694 {
695 retry_record = store_get(sizeof(dbdata_retry) + message_length);
696 message_space = message_length;
697 retry_record->first_failed = now;
698 retry_record->last_try = now;
699 retry_record->next_try = now;
700 retry_record->expired = FALSE;
701 retry_record->text[0] = 0; /* just in case */
702 }
703 else message_space = Ustrlen(retry_record->text);
704
705 /* Compute how long this destination has been failing */
706
707 failing_interval = now - retry_record->first_failed;
708 DEBUG(D_retry) debug_printf("failing_interval=%d message_age=%d\n",
709 failing_interval, message_age);
710
711 /* For a non-host error, if the message has been on the queue longer
712 than the recorded time of failure, use the message's age instead. This
713 can happen when some messages can be delivered and others cannot; a
714 successful delivery will reset the first_failed time, and this can lead
715 to a failing message being retried too often. */
716
717 if ((rti->flags & rf_host) == 0 && message_age > failing_interval)
718 failing_interval = message_age;
719
720 /* Search for the current retry rule. The cutoff time of the
721 last rule is handled differently to the others. The rule continues
722 to operate for ever (the global maximum interval will eventually
723 limit the gaps) but its cutoff time determines when an individual
724 destination times out. If there are no retry rules, the destination
725 always times out, but we can't compute a retry time. */
726
727 final_rule = NULL;
728 for (rule = retry->rules; rule != NULL; rule = rule->next)
729 {
730 if (failing_interval <= rule->timeout) break;
731 final_rule = rule;
732 }
733
734 /* If there's an un-timed out rule, the destination has not
735 yet timed out, so the address as a whole has not timed out (but we are
736 interested in this only for the end address). Make sure the expired
737 flag is false (can be forced via fixdb from outside, but ensure it is
738 consistent with the rules whenever we go through here). */
739
740 if (rule != NULL)
741 {
742 retry_record->expired = FALSE;
743 }
744
745 /* Otherwise, set the retry timeout expired, and set the final rule
746 as the one from which to compute the next retry time. Subsequent
747 messages will fail immediately until the retry time is reached (unless
748 there are other, still active, retries). */
749
750 else
751 {
752 rule = final_rule;
753 retry_record->expired = TRUE;
754 if (addr == endaddr) timedout_count++;
755 }
756
757 /* There is a special case to consider when some messages get through
758 to a destination and others don't. This can happen locally when a
759 large message pushes a user over quota, and it can happen remotely
760 when a machine is on a dodgy Internet connection. The messages that
761 get through wipe the retry information, causing those that don't to
762 stay on the queue longer than the final retry time. In order to
763 avoid this, we check, using the time of arrival of the message, to
764 see if it has been on the queue for more than the final cutoff time,
765 and if so, cause this retry item to time out, and the retry time to
766 be set to "now" so that any subsequent messages in the same condition
767 also get tried. We search for the last rule onwards from the one that
768 is in use. If there are no retry rules for the item, rule will be null
769 and timedout_count will already have been updated.
770
771 This implements "timeout this rule if EITHER the host (or routing or
772 directing) has been failing for more than the maximum time, OR if the
773 message has been on the queue for more than the maximum time."
774
775 February 2006: It is possible that this code is no longer needed
776 following the change to the retry calculation to use the message age if
777 it is larger than the time since first failure. It may be that the
778 expired flag is always set when the other conditions are met. However,
779 this is a small bit of code, and it does no harm to leave it in place,
780 just in case. */
781
782 if (received_time <= retry_record->first_failed &&
783 addr == endaddr && !retry_record->expired && rule != NULL)
784 {
785 retry_rule *last_rule;
786 for (last_rule = rule;
787 last_rule->next != NULL;
788 last_rule = last_rule->next);
789 if (now - received_time > last_rule->timeout)
790 {
791 DEBUG(D_retry) debug_printf("on queue longer than maximum retry\n");
792 timedout_count++;
793 rule = NULL;
794 }
795 }
796
797 /* Compute the next try time from the rule, subject to the global
798 maximum, and update the retry database. If rule == NULL it means
799 there were no rules at all (and the timeout will be set expired),
800 or we have a message that is older than the final timeout. In this
801 case set the next retry time to now, so that one delivery attempt
802 happens for subsequent messages. */
803
804 if (rule == NULL) next_try = now; else
805 {
806 if (rule->rule == 'F') next_try = now + rule->p1;
807 else /* rule = 'G' or 'H' */
808 {
809 int last_predicted_gap =
810 retry_record->next_try - retry_record->last_try;
811 int last_actual_gap = now - retry_record->last_try;
812 int lastgap = (last_predicted_gap < last_actual_gap)?
813 last_predicted_gap : last_actual_gap;
814 int next_gap = (lastgap * rule->p2)/1000;
815 if (rule->rule == 'G')
816 {
817 next_try = now + ((lastgap < rule->p1)? rule->p1 : next_gap);
818 }
819 else /* The 'H' rule */
820 {
821 next_try = now + rule->p1;
822 if (next_gap > rule->p1)
823 next_try += random_number(next_gap - rule->p1)/2 +
824 (next_gap - rule->p1)/2;
825 }
826 }
827 }
828
829 /* Impose a global retry max */
830
831 if (next_try - now > retry_interval_max)
832 next_try = now + retry_interval_max;
833
834 /* If the new message length is greater than the previous one, we
835 have to copy the record first. */
836
837 if (message_length > message_space)
838 {
839 dbdata_retry *newr = store_get(sizeof(dbdata_retry) + message_length);
840 memcpy(newr, retry_record, sizeof(dbdata_retry));
841 retry_record = newr;
842 }
843
844 /* Set up the retry record; message_length may be less than the string
845 length for very long error strings. */
846
847 retry_record->last_try = now;
848 retry_record->next_try = next_try;
849 retry_record->basic_errno = rti->basic_errno;
850 retry_record->more_errno = rti->more_errno;
851 Ustrncpy(retry_record->text, message, message_length);
852 retry_record->text[message_length] = 0;
853
854 DEBUG(D_retry)
855 {
856 int letter = retry_record->more_errno & 255;
857 debug_printf("Writing retry data for %s\n", rti->key);
858 debug_printf(" first failed=%d last try=%d next try=%d expired=%d\n",
859 (int)retry_record->first_failed, (int)retry_record->last_try,
860 (int)retry_record->next_try, retry_record->expired);
861 debug_printf(" errno=%d more_errno=", retry_record->basic_errno);
862 if (letter == 'A' || letter == 'M')
863 debug_printf("%d,%c", (retry_record->more_errno >> 8) & 255,
864 letter);
865 else
866 debug_printf("%d", retry_record->more_errno);
867 debug_printf(" %s\n", retry_record->text);
868 }
869
870 (void)dbfn_write(dbm_file, rti->key, retry_record,
871 sizeof(dbdata_retry) + message_length);
872 } /* Loop for each retry item */
873
874 /* If all the non-delete retry items are timed out, the address is
875 timed out, provided that we didn't skip any hosts because their retry
876 time was not reached (or because of hosts_max_try). */
877
878 if (update_count > 0 && update_count == timedout_count)
879 {
880 if (!testflag(endaddr, af_retry_skipped))
881 {
882 DEBUG(D_retry) debug_printf("timed out: all retries expired\n");
883 timed_out = TRUE;
884 }
885 else
886 {
887 DEBUG(D_retry)
888 debug_printf("timed out but some hosts were skipped\n");
889 }
890 }
891 } /* Loop for an address and its parents */
892
893 /* If this is a deferred address, and retry processing was requested by
894 means of one or more retry items, and they all timed out, move the address
895 to the failed queue, and restart this loop without updating paddr.
896
897 If there were several addresses batched in the same remote delivery, only
898 the original top one will have host retry items attached to it, but we want
899 to handle all the same. Each will have a pointer back to its "top" address,
900 and they will now precede the item with the retries because addresses are
901 inverted when added to these final queues. We have saved information about
902 them in passing (below) so they can all be cut out at once. */
903
904 if (i == 2) /* Handling defers */
905 {
906 if (endaddr->retries != NULL && timed_out)
907 {
908 if (last_first == endaddr) paddr = saved_paddr;
909 addr = *paddr;
910 *paddr = endaddr->next;
911
912 endaddr->next = *addr_failed;
913 *addr_failed = addr;
914
915 for (;; addr = addr->next)
916 {
917 setflag(addr, af_retry_timedout);
918 addr->message = (addr->message == NULL)? US"retry timeout exceeded" :
919 string_sprintf("%s: retry timeout exceeded", addr->message);
920 addr->user_message = (addr->user_message == NULL)?
921 US"retry timeout exceeded" :
922 string_sprintf("%s: retry timeout exceeded", addr->user_message);
923 log_write(0, LOG_MAIN, "** %s%s%s%s: retry timeout exceeded",
924 addr->address,
925 (addr->parent == NULL)? US"" : US" <",
926 (addr->parent == NULL)? US"" : addr->parent->address,
927 (addr->parent == NULL)? US"" : US">");
928
929 if (addr == endaddr) break;
930 }
931
932 continue; /* Restart from changed *paddr */
933 }
934
935 /* This address is to remain on the defer chain. If it has a "first"
936 pointer, save the pointer to it in case we want to fail the set of
937 addresses when we get to the first one. */
938
939 if (endaddr->first != last_first)
940 {
941 last_first = endaddr->first;
942 saved_paddr = paddr;
943 }
944 }
945
946 /* All cases (succeed, fail, defer left on queue) */
947
948 paddr = &(endaddr->next); /* Advance to next address */
949 } /* Loop for all addresses */
950 } /* Loop for succeed, fail, defer */
951
952 /* Close and unlock the database */
953
954 if (dbm_file != NULL) dbfn_close(dbm_file);
955
956 DEBUG(D_retry) debug_printf("end of retry processing\n");
957 }
958
959 /* End of retry.c */