Performance: workaround Linux kernel bug
[exim.git] / src / src / rfc2047.c
CommitLineData
059ec3d9
PH
/*************************************************
*     Exim - an Internet mail transport agent    *
*************************************************/

/* Copyright (c) University of Cambridge 1995 - 2018 */
/* Copyright (c) The Exim Maintainers 2020 */
/* See the file NOTICE for conditions of use and distribution. */

/* This file contains a function for decoding message header lines that may
contain encoded "words" according to the rules described in

  RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt

The function is a rewritten version of code created by Norihisa Washitake.
The original could be used both inside Exim (as part of a patch) or in a
freestanding form. The original contained some built-in code conversions; I
have chosen only to do code conversions if iconv() is supported by the OS.
Because there were quite a lot of hacks to be done, for a variety of reasons,
I rewrote the code.

You can find the latest version of the original library at

  http://washitake.com/mail/exim/mime/

The code below is almost completely unlike the original. */
26
27
28#include "exim.h"
29
30
31/*************************************************
32* Do a QP conversion *
33*************************************************/
34
35/* This function decodes "quoted printable" into bytes.
36
37Arguments:
38 string the string that includes QP escapes
39 ptrptr where to return pointer to the decoded string
40
41Returns: the length of the decoded string, or -1 on failure
42*/
43
44static int
45rfc2047_qpdecode(uschar *string, uschar **ptrptr)
46{
47int len = 0;
48uschar *ptr;
49
f3ebb786 50ptr = *ptrptr = store_get(Ustrlen(string) + 1, is_tainted(string)); /* No longer than this */
059ec3d9
PH
51
52while (*string != 0)
53 {
f4bb363f 54 int ch = *string++;
059ec3d9
PH
55
56 if (ch == '_') *ptr++ = ' ';
57 else if (ch == '=')
58 {
59 int a = *string;
60 int b = (a == 0)? 0 : string[1];
61 if (!isxdigit(a) || !isxdigit(b)) return -1; /* Bad QP string */
62 *ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
63 Ustrchr(hex_digits, tolower(b)) - hex_digits;
64 string += 2;
65 }
66 else if (ch == ' ' || ch == '\t') return -1; /* Whitespace is illegal */
67 else *ptr++ = ch;
68
69 len++;
70 }
71
72*ptr = 0;
73return len;
74}
75
76
77
78/*************************************************
79* Decode next MIME word *
80*************************************************/
81
82/* Scan a string to see if a MIME word exists; pass back the separator
83points in the string.
84
85Arguments:
86 string subject string
87 lencheck TRUE to enforce maximum length check
88 q1ptr pass back address of first question mark
89 q2ptr pass back address of second question mark
90 endptr pass back address of final ?=
91 dlenptr pass back length of decoded string
92 dptrptr pass back pointer to decoded string
93
94Returns: address of =? or NULL if not present
95*/
96
97static uschar *
98decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
99 uschar **endptr, size_t *dlenptr, uschar **dptrptr)
100{
101uschar *mimeword;
102for (;; string = mimeword + 2)
103 {
104 int encoding;
105 int dlen = -1;
106
107 if ((mimeword = Ustrstr(string, "=?")) == NULL ||
108 (*q1ptr = Ustrchr(mimeword+2, '?')) == NULL ||
109 (*q2ptr = Ustrchr(*q1ptr+1, '?')) == NULL ||
110 (*endptr = Ustrstr(*q2ptr+1, "?=")) == NULL) return NULL;
111
112 /* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
113 length, and that the second field is just one character long. If not,
114 continue the loop to search again. We must start just after the initial =?
115 because we might have found =?xxx=?xxx?xxx?xxx?=. */
116
117 if ((lencheck && *endptr - mimeword > 73) || *q2ptr - *q1ptr != 2) continue;
118
119 /* Get the encoding letter, and decode the data string. */
120
121 encoding = toupper((*q1ptr)[1]);
122 **endptr = 0;
123 if (encoding == 'B')
f4d091fb 124 dlen = b64decode(*q2ptr+1, dptrptr);
059ec3d9
PH
125 else if (encoding == 'Q')
126 dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
127 **endptr = '?'; /* restore */
128
129 /* If the decoding succeeded, we are done. Set the length of the decoded
130 string, and pass back the initial pointer. Otherwise, the loop continues. */
131
132 if (dlen >= 0)
133 {
134 *dlenptr = (size_t)dlen;
135 return mimeword;
136 }
137 }
138
139/* Control should never actually get here */
140}
141
142
143
144/*************************************************
145* Decode and convert an RFC 2047 string *
146*************************************************/
147
148/* There are two functions defined here. The original one was rfc2047_decode()
149and it was documented in the local_scan() interface. I needed to add an extra
150argument for use by expand_string(), so I created rfc2047_decode2() for that
151purpose. The original function became a stub that just supplies NULL for the
152new argument (sizeptr).
153
154An RFC 2047-encoded string may contain one or more "words", each of the
155form =?...?.?...?= with the first ... specifying the character code, the
156second being Q (for quoted printable) or B for Base64 encoding. The third ...
157is the actual data.
158
159This function first decodes each "word" into bytes from the Q or B encoding.
160Then, if provided with the name of a charset encoding, and if iconv() is
161available, it attempts to translate the result to the named character set.
162If this fails, the binary string is returned with an error message.
163
164If a binary zero is encountered in the decoded string, it is replaced by the
165contents of the zeroval argument. For use with Exim headers, the value must not
166be 0 because they are handled as zero-terminated strings. When zeroval==0,
167lenptr should not be NULL.
168
169Arguments:
170 string the subject string
171 lencheck TRUE to enforce maximum MIME word length
172 target the name of the target encoding for MIME words, or NULL for
173 no charset translation
174 zeroval the value to use for binary zero bytes
175 lenptr if not NULL, the length of the result is returned via
176 this variable
177 sizeptr if not NULL, the length of a new store block in which the
178 result is built is placed here; if no new store is obtained,
179 the value is not changed
180 error for error messages; NULL if no problem; this can be set
181 when the yield is non-NULL if there was a charset
182 translation problem
183
184Returns: the decoded, converted string, or NULL on error; if there are
185 no MIME words in the string, the original string is returned
186*/
187
188uschar *
189rfc2047_decode2(uschar *string, BOOL lencheck, uschar *target, int zeroval,
190 int *lenptr, int *sizeptr, uschar **error)
191{
059ec3d9
PH
192int size = Ustrlen(string);
193size_t dlen;
acec9514
JH
194uschar *dptr;
195gstring *yield;
059ec3d9
PH
196uschar *mimeword, *q1, *q2, *endword;
197
198*error = NULL;
199mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
200
f4bb363f 201if (!mimeword)
059ec3d9 202 {
f4bb363f 203 if (lenptr) *lenptr = size;
059ec3d9
PH
204 return string;
205 }
206
207/* Scan through the string, decoding MIME words and copying intermediate text,
208building the result as we go. The result may be longer than the input if it is
209translated into a multibyte code such as UTF-8. That's why we use the dynamic
210string building code. */
211
f3ebb786 212yield = store_get(sizeof(gstring) + ++size, is_tainted(string));
acec9514
JH
213yield->size = size;
214yield->ptr = 0;
215yield->s = US(yield + 1);
059ec3d9 216
f4bb363f 217while (mimeword)
059ec3d9
PH
218 {
219
220 #if HAVE_ICONV
221 iconv_t icd = (iconv_t)(-1);
222 #endif
223
224 if (mimeword != string)
acec9514 225 yield = string_catn(yield, string, mimeword - string);
f3ebb786 226/*XXX that might have to convert an untainted string to a tainted one */
059ec3d9
PH
227
228 /* Do a charset translation if required. This is supported only on hosts
229 that have the iconv() function. Translation errors set error, but carry on,
230 using the untranslated data. If there is more than one error, the message
231 passed back refers to the final one. We use a loop to cater for the case
232 of long strings - the RFC puts limits on the length, but it's best to be
233 robust. */
234
235 #if HAVE_ICONV
236 *q1 = 0;
e851856f
JH
237 if (target && strcmpic(target, mimeword+2) != 0)
238 if ((icd = iconv_open(CS target, CS(mimeword+2))) == (iconv_t)-1)
059ec3d9
PH
239 *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
240 target, mimeword+2, strerror(errno),
241 (errno == EINVAL)? " (maybe unsupported conversion)" : "");
059ec3d9
PH
242 *q1 = '?';
243 #endif
244
245 while (dlen > 0)
246 {
247 uschar *tptr = NULL; /* Stops compiler warning */
248 int tlen = -1;
249
250 #if HAVE_ICONV
251 uschar tbuffer[256];
252 uschar *outptr = tbuffer;
253 size_t outleft = sizeof(tbuffer);
254
255 /* If translation is required, go for it. */
256
257 if (icd != (iconv_t)(-1))
258 {
259 (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);
260
261 /* If outptr has been adjusted, there is some output. Set up to add it to
262 the output buffer. The function will have adjusted dptr and dlen. If
263 iconv() stopped because of an error, we'll pick it up next time when
264 there's no output.
265
266 If there is no output, we expect there to have been a translation
267 error, because we know there was at least one input byte. We leave the
268 value of tlen as -1, which causes the rest of the input to be copied
269 verbatim. */
270
271 if (outptr > tbuffer)
272 {
273 tptr = tbuffer;
274 tlen = outptr - tbuffer;
275 }
276 else
277 {
278 DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
73a46702 279 "%s\n", (int)(endword + 2 - mimeword), mimeword, target, strerror(errno));
059ec3d9
PH
280 }
281 }
282
283 #endif
284
285 /* No charset translation is happening or there was a translation error;
286 just set up the original as the string to be added, and mark it all used.
287 */
288
289 if (tlen == -1)
290 {
291 tptr = dptr;
292 tlen = dlen;
293 dlen = 0;
294 }
295
296 /* Deal with zero values; convert them if requested. */
297
298 if (zeroval != 0)
d7978c0f 299 for (int i = 0; i < tlen; i++)
059ec3d9 300 if (tptr[i] == 0) tptr[i] = zeroval;
059ec3d9
PH
301
302 /* Add the new string onto the result */
303
acec9514 304 yield = string_catn(yield, tptr, tlen);
059ec3d9
PH
305 }
306
307 #if HAVE_ICONV
308 if (icd != (iconv_t)(-1)) iconv_close(icd);
309 #endif
310
311 /* Update string past the MIME word; skip any white space if the next thing
312 is another MIME word. */
313
314 string = endword + 2;
315 mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
f4bb363f 316 if (mimeword)
059ec3d9
PH
317 {
318 uschar *s = string;
319 while (isspace(*s)) s++;
320 if (s == mimeword) string = s;
321 }
322 }
323
324/* Copy the remaining characters of the string, zero-terminate it, and return
325the length as well if requested. */
326
acec9514
JH
327yield = string_cat(yield, string);
328
329if (lenptr) *lenptr = yield->ptr;
330if (sizeptr) *sizeptr = yield->size;
331return string_from_gstring(yield);
059ec3d9
PH
332}
333
334
335/* This is the stub that provides the original interface without the sizeptr
336argument. */
337
338uschar *
339rfc2047_decode(uschar *string, BOOL lencheck, uschar *target, int zeroval,
340 int *lenptr, uschar **error)
341{
342return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
343}
344
/* End of rfc2047.c */