Merge from master into 4.next
[exim.git] / src / src / rfc2047.c
1 /*************************************************
2 * Exim - an Internet mail transport agent *
3 *************************************************/
4
5 /* Copyright (c) University of Cambridge 1995 - 2015 */
6 /* See the file NOTICE for conditions of use and distribution. */
7
8 /* This file contains a function for decoding message header lines that may
9 contain encoded "words" according to the rules described in
10
11 RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt
12
13 The function is a rewritten version of code created by Norihisa Washitake.
14 The original could be used either inside Exim (as part of a patch) or in a
15 freestanding form. The original contained some built-in code conversions; I
16 have chosen only to do code conversions if iconv() is supported by the OS.
17 Because there were quite a lot of hacks to be done, for a variety of reasons,
18 I rewrote the code.
19
20 You can find the latest version of the original library at
21
22 http://washitake.com/mail/exim/mime/
23
24 The code below is almost completely unlike the original. */
25
26
27 #include "exim.h"
28
29
30 /*************************************************
31 * Do a QP conversion *
32 *************************************************/
33
34 /* This function decodes "quoted printable" into bytes.
35
36 Arguments:
37 string the string that includes QP escapes
38 ptrptr where to return pointer to the decoded string
39
40 Returns: the length of the decoded string, or -1 on failure
41 */
42
43 static int
44 rfc2047_qpdecode(uschar *string, uschar **ptrptr)
45 {
46 int len = 0;
47 uschar *ptr;
48
49 ptr = *ptrptr = store_get(Ustrlen(string) + 1); /* No longer than this */
50
51 while (*string != 0)
52 {
53 register int ch = *string++;
54
55 if (ch == '_') *ptr++ = ' ';
56 else if (ch == '=')
57 {
58 int a = *string;
59 int b = (a == 0)? 0 : string[1];
60 if (!isxdigit(a) || !isxdigit(b)) return -1; /* Bad QP string */
61 *ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
62 Ustrchr(hex_digits, tolower(b)) - hex_digits;
63 string += 2;
64 }
65 else if (ch == ' ' || ch == '\t') return -1; /* Whitespace is illegal */
66 else *ptr++ = ch;
67
68 len++;
69 }
70
71 *ptr = 0;
72 return len;
73 }
74
75
76
77 /*************************************************
78 * Decode next MIME word *
79 *************************************************/
80
81 /* Scan a string to see if a MIME word exists; pass back the separator
82 points in the string.
83
84 Arguments:
85 string subject string
86 lencheck TRUE to enforce maximum length check
87 q1ptr pass back address of first question mark
88 q2ptr pass back address of second question mark
89 endptr pass back address of final ?=
90 dlenptr pass back length of decoded string
91 dptrptr pass back pointer to decoded string
92
93 Returns: address of =? or NULL if not present
94 */
95
96 static uschar *
97 decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
98 uschar **endptr, size_t *dlenptr, uschar **dptrptr)
99 {
100 uschar *mimeword;
101 for (;; string = mimeword + 2)
102 {
103 int encoding;
104 int dlen = -1;
105
106 if ((mimeword = Ustrstr(string, "=?")) == NULL ||
107 (*q1ptr = Ustrchr(mimeword+2, '?')) == NULL ||
108 (*q2ptr = Ustrchr(*q1ptr+1, '?')) == NULL ||
109 (*endptr = Ustrstr(*q2ptr+1, "?=")) == NULL) return NULL;
110
111 /* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
112 length, and that the second field is just one character long. If not,
113 continue the loop to search again. We must start just after the initial =?
114 because we might have found =?xxx=?xxx?xxx?xxx?=. */
115
116 if ((lencheck && *endptr - mimeword > 73) || *q2ptr - *q1ptr != 2) continue;
117
118 /* Get the encoding letter, and decode the data string. */
119
120 encoding = toupper((*q1ptr)[1]);
121 **endptr = 0;
122 if (encoding == 'B')
123 dlen = b64decode(*q2ptr+1, dptrptr);
124 else if (encoding == 'Q')
125 dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
126 **endptr = '?'; /* restore */
127
128 /* If the decoding succeeded, we are done. Set the length of the decoded
129 string, and pass back the initial pointer. Otherwise, the loop continues. */
130
131 if (dlen >= 0)
132 {
133 *dlenptr = (size_t)dlen;
134 return mimeword;
135 }
136 }
137
138 /* Control should never actually get here */
139 }
140
141
142
143 /*************************************************
144 * Decode and convert an RFC 2047 string *
145 *************************************************/
146
147 /* There are two functions defined here. The original one was rfc2047_decode()
148 and it was documented in the local_scan() interface. I needed to add an extra
149 argument for use by expand_string(), so I created rfc2047_decode2() for that
150 purpose. The original function became a stub that just supplies NULL for the
151 new argument (sizeptr).
152
153 An RFC 2047-encoded string may contain one or more "words", each of the
154 form =?...?.?...?= with the first ... specifying the character code, the
155 second being Q (for quoted printable) or B for Base64 encoding. The third ...
156 is the actual data.
157
158 This function first decodes each "word" into bytes from the Q or B encoding.
159 Then, if provided with the name of a charset encoding, and if iconv() is
160 available, it attempts to translate the result to the named character set.
161 If this fails, the binary string is returned with an error message.
162
163 If a binary zero is encountered in the decoded string, it is replaced by the
164 contents of the zeroval argument. For use with Exim headers, the value must not
165 be 0 because they are handled as zero-terminated strings. When zeroval==0,
166 lenptr should not be NULL.
167
168 Arguments:
169 string the subject string
170 lencheck TRUE to enforce maximum MIME word length
171 target the name of the target encoding for MIME words, or NULL for
172 no charset translation
173 zeroval the value to use for binary zero bytes
174 lenptr if not NULL, the length of the result is returned via
175 this variable
176 sizeptr if not NULL, the length of a new store block in which the
177 result is built is placed here; if no new store is obtained,
178 the value is not changed
179 error for error messages; NULL if no problem; this can be set
180 when the yield is non-NULL if there was a charset
181 translation problem
182
183 Returns: the decoded, converted string, or NULL on error; if there are
184 no MIME words in the string, the original string is returned
185 */
186
187 uschar *
188 rfc2047_decode2(uschar *string, BOOL lencheck, uschar *target, int zeroval,
189 int *lenptr, int *sizeptr, uschar **error)
190 {
191 int ptr = 0;
192 int size = Ustrlen(string);
193 size_t dlen;
194 uschar *dptr, *yield;
195 uschar *mimeword, *q1, *q2, *endword;
196
197 *error = NULL;
198 mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
199
200 if (mimeword == NULL)
201 {
202 if (lenptr != NULL) *lenptr = size;
203 return string;
204 }
205
206 /* Scan through the string, decoding MIME words and copying intermediate text,
207 building the result as we go. The result may be longer than the input if it is
208 translated into a multibyte code such as UTF-8. That's why we use the dynamic
209 string building code. */
210
211 yield = store_get(++size);
212
213 while (mimeword != NULL)
214 {
215
216 #if HAVE_ICONV
217 iconv_t icd = (iconv_t)(-1);
218 #endif
219
220 if (mimeword != string)
221 yield = string_catn(yield, &size, &ptr, string, mimeword - string);
222
223 /* Do a charset translation if required. This is supported only on hosts
224 that have the iconv() function. Translation errors set error, but carry on,
225 using the untranslated data. If there is more than one error, the message
226 passed back refers to the final one. We use a loop to cater for the case
227 of long strings - the RFC puts limits on the length, but it's best to be
228 robust. */
229
230 #if HAVE_ICONV
231 *q1 = 0;
232 if (target != NULL && strcmpic(target, mimeword+2) != 0)
233 {
234 icd = iconv_open(CS target, CS(mimeword+2));
235
236 if (icd == (iconv_t)(-1))
237 {
238 *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
239 target, mimeword+2, strerror(errno),
240 (errno == EINVAL)? " (maybe unsupported conversion)" : "");
241 }
242 }
243 *q1 = '?';
244 #endif
245
246 while (dlen > 0)
247 {
248 uschar *tptr = NULL; /* Stops compiler warning */
249 int tlen = -1;
250
251 #if HAVE_ICONV
252 uschar tbuffer[256];
253 uschar *outptr = tbuffer;
254 size_t outleft = sizeof(tbuffer);
255
256 /* If translation is required, go for it. */
257
258 if (icd != (iconv_t)(-1))
259 {
260 (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);
261
262 /* If outptr has been adjusted, there is some output. Set up to add it to
263 the output buffer. The function will have adjusted dptr and dlen. If
264 iconv() stopped because of an error, we'll pick it up next time when
265 there's no output.
266
267 If there is no output, we expect there to have been a translation
268 error, because we know there was at least one input byte. We leave the
269 value of tlen as -1, which causes the rest of the input to be copied
270 verbatim. */
271
272 if (outptr > tbuffer)
273 {
274 tptr = tbuffer;
275 tlen = outptr - tbuffer;
276 }
277 else
278 {
279 DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
280 "%s\n", (int)(endword + 2 - mimeword), mimeword, target, strerror(errno));
281 }
282 }
283
284 #endif
285
286 /* No charset translation is happening or there was a translation error;
287 just set up the original as the string to be added, and mark it all used.
288 */
289
290 if (tlen == -1)
291 {
292 tptr = dptr;
293 tlen = dlen;
294 dlen = 0;
295 }
296
297 /* Deal with zero values; convert them if requested. */
298
299 if (zeroval != 0)
300 {
301 int i;
302 for (i = 0; i < tlen; i++)
303 if (tptr[i] == 0) tptr[i] = zeroval;
304 }
305
306 /* Add the new string onto the result */
307
308 yield = string_catn(yield, &size, &ptr, tptr, tlen);
309 }
310
311 #if HAVE_ICONV
312 if (icd != (iconv_t)(-1)) iconv_close(icd);
313 #endif
314
315 /* Update string past the MIME word; skip any white space if the next thing
316 is another MIME word. */
317
318 string = endword + 2;
319 mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
320 if (mimeword != NULL)
321 {
322 uschar *s = string;
323 while (isspace(*s)) s++;
324 if (s == mimeword) string = s;
325 }
326 }
327
328 /* Copy the remaining characters of the string, zero-terminate it, and return
329 the length as well if requested. */
330
331 yield = string_cat(yield, &size, &ptr, string);
332 yield[ptr] = 0;
333 if (lenptr != NULL) *lenptr = ptr;
334 if (sizeptr != NULL) *sizeptr = size;
335 return yield;
336 }
337
338
339 /* This is the stub that provides the original interface without the sizeptr
340 argument. */
341
342 uschar *
343 rfc2047_decode(uschar *string, BOOL lencheck, uschar *target, int zeroval,
344 int *lenptr, uschar **error)
345 {
346 return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
347 }
348
349 /* End of rfc2047.c */