[exim.git] / src / src / rfc2047.c

/*************************************************
*     Exim - an Internet mail transport agent    *
*************************************************/

/* Copyright (c) University of Cambridge 1995 - 2018 */
/* See the file NOTICE for conditions of use and distribution. */

/* This file contains a function for decoding message header lines that may
contain encoded "words" according to the rules described in

  RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt

The function is a rewritten version of code created by Norihisa Washitake.
The original could be used both inside Exim (as part of a patch) or in a
freestanding form. The original contained some built-in code conversions; I
have chosen only to do code conversions if iconv() is supported by the OS.
Because there were quite a lot of hacks to be done, for a variety of reasons,
I rewrote the code.

You can find the latest version of the original library at

  http://washitake.com/mail/exim/mime/

The code below is almost completely unlike the original. */


#include "exim.h"


/*************************************************
*                Do a QP conversion              *
*************************************************/

/* This function decodes "quoted printable" into bytes.

Arguments:
  string      the string that includes QP escapes
  ptrptr      where to return pointer to the decoded string

Returns:      the length of the decoded string, or -1 on failure
*/

static int
rfc2047_qpdecode(uschar *string, uschar **ptrptr)
{
int len = 0;
uschar *ptr;

/* Each input construct ('_', "=XX", or a literal byte) produces exactly one
output byte, so the input length plus one for the terminator is always enough
room for the result. */

ptr = *ptrptr = store_get(Ustrlen(string) + 1);

while (*string != 0)
  {
  int ch = *string++;

  if (ch == '_') *ptr++ = ' ';              /* '_' stands for a space */
  else if (ch == '=')
    {
    int a = *string;
    int b = (a == 0)? 0 : string[1];        /* Don't read past the terminator */
    int hi, lo;

    if (!isxdigit(a) || !isxdigit(b)) return -1;  /* Bad QP string */

    /* Turn each hex digit into its value by looking up its position in
    hex_digits (defined elsewhere; this relies on it listing the digits in
    value order, with lower-case letters). Both nibbles are reduced to plain
    integers before being combined: the previous unparenthesised form added
    the shifted high nibble to a pointer first, which formed a pointer far
    beyond the end of the hex_digits array - undefined behaviour in C, even
    though the final subtraction happened to bring the value back. */

    hi = (int)(Ustrchr(hex_digits, tolower(a)) - hex_digits);
    lo = (int)(Ustrchr(hex_digits, tolower(b)) - hex_digits);
    *ptr++ = (uschar)((hi << 4) + lo);
    string += 2;
    }
  else if (ch == ' ' || ch == '\t') return -1;    /* Whitespace is illegal */
  else *ptr++ = ch;

  len++;
  }

/* Terminate the result for the benefit of callers that treat it as a string;
the terminator is not counted in the returned length. */

*ptr = 0;
return len;
}


/*************************************************
*            Decode next MIME word               *
*************************************************/

/* Scan a string to see if a MIME word exists; pass back the separator
points in the string.

Arguments:
  string     subject string
  lencheck   TRUE to enforce maximum length check
  q1ptr      pass back address of first question mark
  q2ptr      pass back address of second question mark
  endptr     pass back address of final ?=
  dlenptr    pass back length of decoded string
  dptrptr    pass back pointer to decoded string

Returns:     address of =? or NULL if not present
*/

static uschar *
decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
  uschar **endptr, size_t *dlenptr, uschar **dptrptr)
{
uschar *start;

/* Each pass looks at the next "=?" candidate. As soon as any one of the four
delimiters cannot be found there can be no further MIME word, so give up. */

while ((start = Ustrstr(string, "=?")) != NULL)
  {
  int decoded_len = -1;

  if ((*q1ptr = Ustrchr(start + 2, '?')) == NULL) break;
  if ((*q2ptr = Ustrchr(*q1ptr + 1, '?')) == NULL) break;
  if ((*endptr = Ustrstr(*q2ptr + 1, "?=")) == NULL) break;

  /* If this candidate is rejected, the next search resumes just after its
  initial "=?", because the text may look like =?xxx=?xxx?xxx?xxx?= with the
  real word starting inside the rejected one. */

  string = start + 2;

  /* Only attempt a decode when the encoding field is exactly one character
  long and, if requested, the word is within the length limit. */

  if (*q2ptr - *q1ptr == 2 && (!lencheck || *endptr - start <= 73))
    {
    uschar *data = *q2ptr + 1;

    /* Temporarily terminate the data at the closing "?=" so that the
    decoders see a zero-terminated string; put the '?' back afterwards. */

    **endptr = 0;
    switch (toupper((*q1ptr)[1]))
      {
      case 'B':
        decoded_len = b64decode(data, dptrptr);
        break;

      case 'Q':
        decoded_len = rfc2047_qpdecode(data, dptrptr);
        break;

      default:        /* Unknown encoding letter: leave decoded_len negative */
        break;
      }
    **endptr = '?';
    }

  /* A non-negative length means the decode worked: pass back the length and
  the address of the "=?" that starts the word. */

  if (decoded_len >= 0)
    {
    *dlenptr = (size_t)decoded_len;
    return start;
    }
  }

return NULL;
}


/*************************************************
*    Decode and convert an RFC 2047 string       *
*************************************************/

/* There are two functions defined here. The original one was rfc2047_decode()
and it was documented in the local_scan() interface. I needed to add an extra
argument for use by expand_string(), so I created rfc2047_decode2() for that
purpose. The original function became a stub that just supplies NULL for the
new argument (sizeptr).

An RFC 2047-encoded string may contain one or more "words", each of the
form  =?...?.?...?=  with the first ... specifying the character code, the
second being Q (for quoted printable) or B for Base64 encoding. The third ...
is the actual data.

This function first decodes each "word" into bytes from the Q or B encoding.
Then, if provided with the name of a charset encoding, and if iconv() is
available, it attempts to translate the result to the named character set.
If this fails, the binary string is returned with an error message.

If a binary zero is encountered in the decoded string, it is replaced by the
contents of the zeroval argument. For use with Exim headers, the value must not
be 0 because they are handled as zero-terminated strings. When zeroval==0,
lenptr should not be NULL.

Arguments:
    string       the subject string
    lencheck     TRUE to enforce maximum MIME word length
    target       the name of the target encoding for MIME words, or NULL for
                   no charset translation
    zeroval      the value to use for binary zero bytes
    lenptr       if not NULL, the length of the result is returned via
                   this variable
    sizeptr      if not NULL, the length of a new store block in which the
                   result is built is placed here; if no new store is obtained,
                   the value is not changed
    error        for error messages; NULL if no problem; this can be set
                   when the yield is non-NULL if there was a charset
                   translation problem

Returns:         the decoded, converted string, or NULL on error; if there are
                   no MIME words in the string, the original string is returned
*/

uschar *
rfc2047_decode2(uschar *string, BOOL lencheck, uschar *target, int zeroval,
  int *lenptr, int *sizeptr, uschar **error)
{
/* size starts as the input length and is later reused (plus one) as the
initial capacity of the result buffer. dlen and dptr are filled in by
decode_mimeword() and describe the decoded bytes of the current MIME word. */

int size = Ustrlen(string);
size_t dlen;
uschar *dptr;
gstring *yield;
uschar *mimeword, *q1, *q2, *endword;

*error = NULL;
mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);

/* No decodable MIME word anywhere: hand back the caller's own string. No new
store is obtained on this path, so sizeptr is not touched. */

if (!mimeword)
  {
  if (lenptr) *lenptr = size;
  return string;
  }

/* Scan through the string, decoding MIME words and copying intermediate text,
building the result as we go. The result may be longer than the input if it is
translated into a multibyte code such as UTF-8. That's why we use the dynamic
string building code. */

/* The growable string is set up by hand: one store block holds the gstring
header followed directly by the character buffer, whose initial capacity is
the input length plus one. */

yield = store_get(sizeof(gstring) + ++size);
yield->size = size;
yield->ptr = 0;
yield->s = US(yield + 1);

/* Each pass of this loop emits the literal text that precedes one MIME word,
followed by the (possibly translated) decoded content of that word. */

while (mimeword)
  {

  #if HAVE_ICONV
  iconv_t icd = (iconv_t)(-1);
  #endif

  /* Copy any plain text lying between the current position and the word. */

  if (mimeword != string)
    yield = string_catn(yield, string, mimeword - string);

  /* Do a charset translation if required. This is supported only on hosts
  that have the iconv() function. Translation errors set error, but carry on,
  using the untranslated data. If there is more than one error, the message
  passed back refers to the final one. We use a loop to cater for the case
  of long strings - the RFC puts limits on the length, but it's best to be
  robust. */

  /* NOTE(review): in the code below only a failure of iconv_open() stores a
  message in *error; a failure of iconv() itself is reported solely through
  debug output. Confirm whether that difference from the comment above is
  intended. */

  #if HAVE_ICONV

  /* Temporarily overwrite the first '?' with a zero so that mimeword+2 is a
  zero-terminated charset name; the '?' is restored once the name has been
  used. No conversion descriptor is opened when there is no target or when
  strcmpic() reports the word's charset as equal to the target (presumably a
  case-independent comparison - strcmpic is defined elsewhere). */

  *q1 = 0;
  if (target != NULL && strcmpic(target, mimeword+2) != 0)
    {
    icd = iconv_open(CS target, CS(mimeword+2));

    if (icd == (iconv_t)(-1))
      {
      *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
        target, mimeword+2, strerror(errno),
        (errno == EINVAL)? " (maybe unsupported conversion)" : "");
      }
    }
  *q1 = '?';
  #endif

  /* Emit the decoded bytes of this word. With translation active, each pass
  produces at most one tbuffer-full of output; the loop repeats until all the
  input has been consumed (dlen reaches zero). */

  while (dlen > 0)
    {
    uschar *tptr = NULL;   /* Stops compiler warning */
    int tlen = -1;

    #if HAVE_ICONV
    uschar tbuffer[256];
    uschar *outptr = tbuffer;
    size_t outleft = sizeof(tbuffer);

    /* If translation is required, go for it. */

    if (icd != (iconv_t)(-1))
      {
      (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);

      /* If outptr has been adjusted, there is some output. Set up to add it to
      the output buffer. The function will have adjusted dptr and dlen. If
      iconv() stopped because of an error, we'll pick it up next time when
      there's no output.

      If there is no output, we expect there to have been a translation
      error, because we know there was at least one input byte. We leave the
      value of tlen as -1, which causes the rest of the input to be copied
      verbatim. */

      if (outptr > tbuffer)
        {
        tptr = tbuffer;
        tlen = outptr - tbuffer;
        }
      else
        {
        DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
        "%s\n", (int)(endword + 2 - mimeword), mimeword, target, strerror(errno));
        }
      }

    #endif

    /* No charset translation is happening or there was a translation error;
    just set up the original as the string to be added, and mark it all used.
    */

    /* Note that dlen is a size_t and is narrowed to int on assignment to
    tlen here. Setting dlen to zero ends the enclosing loop after this pass. */

    if (tlen == -1)
      {
      tptr = dptr;
      tlen = dlen;
      dlen = 0;
      }

    /* Deal with zero values; convert them if requested. */

    /* The replacement is done in place, in whichever buffer tptr refers to
    (tbuffer or the decoded data), before the bytes are appended. */

    if (zeroval != 0)
      for (int i = 0; i < tlen; i++)
        if (tptr[i] == 0) tptr[i] = zeroval;

    /* Add the new string onto the result */

    yield = string_catn(yield, tptr, tlen);
    }

  #if HAVE_ICONV
  if (icd != (iconv_t)(-1))  iconv_close(icd);
  #endif

  /* Update string past the MIME word; skip any white space if the next thing
  is another MIME word. */

  /* White space is dropped only when nothing but white space separates this
  word from the next one; otherwise string stays just after the current word
  and the intervening text is copied at the top of the next pass. */

  string = endword + 2;
  mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
  if (mimeword)
    {
    uschar *s = string;
    while (isspace(*s)) s++;
    if (s == mimeword) string = s;
    }
  }

/* Copy the remaining characters of the string, zero-terminate it, and return
the length as well if requested. */

yield = string_cat(yield, string);

/* Report the number of bytes in the result and, separately, the current
capacity of the buffer in which it was built. */

if (lenptr) *lenptr = yield->ptr;
if (sizeptr) *sizeptr = yield->size;
return string_from_gstring(yield);
}


/* This is the stub that provides the original interface without the sizeptr
argument. */

uschar *
rfc2047_decode(uschar *string, BOOL lencheck, uschar *target, int zeroval,
  int *lenptr, uschar **error)
{
/* The original interface has no way to report the size of the store block,
so the extended function is told not to return it. */

int *no_sizeptr = NULL;

return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, no_sizeptr,
  error);
}

/* End of rfc2047.c */
Commit	Line	Data
059ec3d9 PH	1	/*************************************************
	2	* Exim - an Internet mail transport agent *
	3	*************************************************/
	4
f9ba5e22	5	/* Copyright (c) University of Cambridge 1995 - 2018 */
059ec3d9 PH	6	/* See the file NOTICE for conditions of use and distribution. */
	7
	8	/* This file contains a function for decoding message header lines that may
	9	contain encoded "words" according to the rules described in
	10
	11	RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt
	12
	13	The function is a rewritten version of code created by Norihisa Washitake.
	14	The original could be used both inside Exim (as part of a patch) or in a
	15	freestanding form. The original contained some built-in code conversions; I
	16	have chosen only to do code conversions if iconv() is supported by the OS.
	17	Because there were quite a lot of hacks to be done, for a variety of reasons,
	18	I rewrote the code.
	19
	20	You can find the latest version of the original library at
	21
	22	http://washitake.com/mail/exim/mime/
	23
	24	The code below is almost completely unlike the original. */
	25
	26
	27	#include "exim.h"
	28
	29
	30	/*************************************************
	31	* Do a QP conversion *
	32	*************************************************/
	33
	34	/* This function decodes "quoted printable" into bytes.
	35
	36	Arguments:
	37	string the string that includes QP escapes
	38	ptrptr where to return pointer to the decoded string
	39
	40	Returns: the length of the decoded string, or -1 on failure
	41	*/
	42
	43	static int
	44	rfc2047_qpdecode(uschar string, uschar *ptrptr)
	45	{
	46	int len = 0;
	47	uschar *ptr;
	48
	49	ptr = ptrptr = store_get(Ustrlen(string) + 1); / No longer than this */
	50
	51	while (*string != 0)
	52	{
f4bb363f	53	int ch = *string++;
059ec3d9 PH	54
	55	if (ch == '_') *ptr++ = ' ';
	56	else if (ch == '=')
	57	{
	58	int a = *string;
	59	int b = (a == 0)? 0 : string[1];
	60	if (!isxdigit(a) \|\| !isxdigit(b)) return -1; /* Bad QP string */
	61	*ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
	62	Ustrchr(hex_digits, tolower(b)) - hex_digits;
	63	string += 2;
	64	}
	65	else if (ch == ' ' \|\| ch == '\t') return -1; /* Whitespace is illegal */
	66	else *ptr++ = ch;
	67
	68	len++;
	69	}
	70
	71	*ptr = 0;
	72	return len;
	73	}
	74
	75
	76
	77	/*************************************************
	78	* Decode next MIME word *
	79	*************************************************/
	80
	81	/* Scan a string to see if a MIME word exists; pass back the separator
	82	points in the string.
	83
	84	Arguments:
	85	string subject string
	86	lencheck TRUE to enforce maximum length check
	87	q1ptr pass back address of first question mark
	88	q2ptr pass back address of second question mark
	89	endptr pass back address of final ?=
	90	dlenptr pass back length of decoded string
	91	dptrptr pass back pointer to decoded string
	92
	93	Returns: address of =? or NULL if not present
	94	*/
	95
	96	static uschar *
	97	decode_mimeword(uschar string, BOOL lencheck, uschar q1ptr, uschar *q2ptr,
	98	uschar *endptr, size_t dlenptr, uschar **dptrptr)
	99	{
	100	uschar *mimeword;
	101	for (;; string = mimeword + 2)
	102	{
	103	int encoding;
	104	int dlen = -1;
	105
	106	if ((mimeword = Ustrstr(string, "=?")) == NULL \|\|
	107	(*q1ptr = Ustrchr(mimeword+2, '?')) == NULL \|\|
	108	(q2ptr = Ustrchr(q1ptr+1, '?')) == NULL \|\|
	109	(endptr = Ustrstr(q2ptr+1, "?=")) == NULL) return NULL;
	110
	111	/* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
	112	length, and that the second field is just one character long. If not,
	113	continue the loop to search again. We must start just after the initial =?
	114	because we might have found =?xxx=?xxx?xxx?xxx?=. */
	115
	116	if ((lencheck && endptr - mimeword > 73) \|\| q2ptr - *q1ptr != 2) continue;
	117
118	/* Get the encoding letter, and decode the data string. */
119
120	encoding = toupper((*q1ptr)[1]);
121	**endptr = 0;
122	if (encoding == 'B')
f4d091fb	123	dlen = b64decode(*q2ptr+1, dptrptr);
059ec3d9 PH	124	else if (encoding == 'Q')
	125	dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
	126	*endptr = '?'; / restore */
	127
	128	/* If the decoding succeeded, we are done. Set the length of the decoded
	129	string, and pass back the initial pointer. Otherwise, the loop continues. */
	130
	131	if (dlen >= 0)
	132	{
	133	*dlenptr = (size_t)dlen;
	134	return mimeword;
	135	}
	136	}
	137
	138	/* Control should never actually get here */
	139	}
	140
	141
	142
	143	/*************************************************
	144	* Decode and convert an RFC 2047 string *
	145	*************************************************/
	146
	147	/* There are two functions defined here. The original one was rfc2047_decode()
	148	and it was documented in the local_scan() interface. I needed to add an extra
	149	argument for use by expand_string(), so I created rfc2047_decode2() for that
	150	purpose. The original function became a stub that just supplies NULL for the
	151	new argument (sizeptr).
	152
	153	An RFC 2047-encoded string may contain one or more "words", each of the
	154	form =?...?.?...?= with the first ... specifying the character code, the
	155	second being Q (for quoted printable) or B for Base64 encoding. The third ...
	156	is the actual data.
	157
	158	This function first decodes each "word" into bytes from the Q or B encoding.
	159	Then, if provided with the name of a charset encoding, and if iconv() is
	160	available, it attempts to translate the result to the named character set.
	161	If this fails, the binary string is returned with an error message.
	162
	163	If a binary zero is encountered in the decoded string, it is replaced by the
	164	contents of the zeroval argument. For use with Exim headers, the value must not
	165	be 0 because they are handled as zero-terminated strings. When zeroval==0,
	166	lenptr should not be NULL.
	167
	168	Arguments:
	169	string the subject string
	170	lencheck TRUE to enforce maximum MIME word length
	171	target the name of the target encoding for MIME words, or NULL for
	172	no charset translation
	173	zeroval the value to use for binary zero bytes
	174	lenptr if not NULL, the length of the result is returned via
	175	this variable
	176	sizeptr if not NULL, the length of a new store block in which the
	177	result is built is placed here; if no new store is obtained,
	178	the value is not changed
	179	error for error messages; NULL if no problem; this can be set
	180	when the yield is non-NULL if there was a charset
	181	translation problem
	182
	183	Returns: the decoded, converted string, or NULL on error; if there are
	184	no MIME words in the string, the original string is returned
	185	*/
	186
	187	uschar *
188	rfc2047_decode2(uschar string, BOOL lencheck, uschar target, int zeroval,
189	int lenptr, int sizeptr, uschar **error)
190	{
059ec3d9 PH	191	int size = Ustrlen(string);
059ec3d9 PH	192	size_t dlen;
acec9514 JH	193	uschar *dptr;
acec9514 JH	194	gstring *yield;
059ec3d9 PH	195	uschar mimeword, q1, q2, endword;
	196
	197	*error = NULL;
	198	mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
	199
f4bb363f	200	if (!mimeword)
059ec3d9	201	{
f4bb363f	202	if (lenptr) *lenptr = size;
059ec3d9 PH	203	return string;
	204	}
	205
	206	/* Scan through the string, decoding MIME words and copying intermediate text,
	207	building the result as we go. The result may be longer than the input if it is
	208	translated into a multibyte code such as UTF-8. That's why we use the dynamic
	209	string building code. */
	210
acec9514 JH	211	yield = store_get(sizeof(gstring) + ++size);
	212	yield->size = size;
	213	yield->ptr = 0;
	214	yield->s = US(yield + 1);
059ec3d9	215
f4bb363f	216	while (mimeword)
059ec3d9 PH	217	{
	218
	219	#if HAVE_ICONV
	220	iconv_t icd = (iconv_t)(-1);
	221	#endif
	222
	223	if (mimeword != string)
acec9514	224	yield = string_catn(yield, string, mimeword - string);
059ec3d9 PH	225
	226	/* Do a charset translation if required. This is supported only on hosts
	227	that have the iconv() function. Translation errors set error, but carry on,
	228	using the untranslated data. If there is more than one error, the message
	229	passed back refers to the final one. We use a loop to cater for the case
	230	of long strings - the RFC puts limits on the length, but it's best to be
	231	robust. */
	232
	233	#if HAVE_ICONV
	234	*q1 = 0;
	235	if (target != NULL && strcmpic(target, mimeword+2) != 0)
	236	{
	237	icd = iconv_open(CS target, CS(mimeword+2));
	238
	239	if (icd == (iconv_t)(-1))
	240	{
	241	*error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
	242	target, mimeword+2, strerror(errno),
	243	(errno == EINVAL)? " (maybe unsupported conversion)" : "");
	244	}
	245	}
	246	*q1 = '?';
	247	#endif
	248
	249	while (dlen > 0)
	250	{
	251	uschar tptr = NULL; / Stops compiler warning */
	252	int tlen = -1;
	253
	254	#if HAVE_ICONV
	255	uschar tbuffer[256];
	256	uschar *outptr = tbuffer;
	257	size_t outleft = sizeof(tbuffer);
	258
	259	/* If translation is required, go for it. */
	260
	261	if (icd != (iconv_t)(-1))
	262	{
	263	(void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);
	264
	265	/* If outptr has been adjusted, there is some output. Set up to add it to
	266	the output buffer. The function will have adjusted dptr and dlen. If
	267	iconv() stopped because of an error, we'll pick it up next time when
	268	there's no output.
	269
	270	If there is no output, we expect there to have been a translation
	271	error, because we know there was at least one input byte. We leave the
	272	value of tlen as -1, which causes the rest of the input to be copied
	273	verbatim. */
	274
	275	if (outptr > tbuffer)
	276	{
	277	tptr = tbuffer;
	278	tlen = outptr - tbuffer;
	279	}
	280	else
	281	{
	282	DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
73a46702	283	"%s\n", (int)(endword + 2 - mimeword), mimeword, target, strerror(errno));
059ec3d9 PH	284	}
	285	}
	286
	287	#endif
	288
	289	/* No charset translation is happening or there was a translation error;
	290	just set up the original as the string to be added, and mark it all used.
	291	*/
	292
	293	if (tlen == -1)
	294	{
	295	tptr = dptr;
	296	tlen = dlen;
	297	dlen = 0;
	298	}
	299
	300	/* Deal with zero values; convert them if requested. */
	301
	302	if (zeroval != 0)
d7978c0f	303	for (int i = 0; i < tlen; i++)
059ec3d9	304	if (tptr[i] == 0) tptr[i] = zeroval;
059ec3d9 PH	305
	306	/* Add the new string onto the result */
	307
acec9514	308	yield = string_catn(yield, tptr, tlen);
059ec3d9 PH	309	}
	310
	311	#if HAVE_ICONV
	312	if (icd != (iconv_t)(-1)) iconv_close(icd);
	313	#endif
	314
	315	/* Update string past the MIME word; skip any white space if the next thing
	316	is another MIME word. */
	317
	318	string = endword + 2;
	319	mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
f4bb363f	320	if (mimeword)
059ec3d9 PH	321	{
	322	uschar *s = string;
	323	while (isspace(*s)) s++;
	324	if (s == mimeword) string = s;
	325	}
	326	}
	327
	328	/* Copy the remaining characters of the string, zero-terminate it, and return
	329	the length as well if requested. */
	330
acec9514 JH	331	yield = string_cat(yield, string);
	332
	333	if (lenptr) *lenptr = yield->ptr;
	334	if (sizeptr) *sizeptr = yield->size;
	335	return string_from_gstring(yield);
059ec3d9 PH	336	}
	337
	338
	339	/* This is the stub that provides the original interface without the sizeptr
	340	argument. */
	341
	342	uschar *
	343	rfc2047_decode(uschar string, BOOL lencheck, uschar target, int zeroval,
	344	int lenptr, uschar *error)
	345	{
	346	return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
	347	}
	348
	349	/* End of rfc2047.c */