/* exim.git: src/src/rfc2047.c */

/* $Cambridge: exim/src/src/rfc2047.c,v 1.2 2005/01/04 10:00:42 ph10 Exp $ */

/*************************************************
*     Exim - an Internet mail transport agent    *
*************************************************/

/* Copyright (c) University of Cambridge 1995 - 2005 */
/* See the file NOTICE for conditions of use and distribution. */

/* This file contains a function for decoding message header lines that may
contain encoded "words" according to the rules described in

  RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt

The function is a rewritten version of code created by Norihisa Washitake.
The original could be used both inside Exim (as part of a patch) or in a
freestanding form. The original contained some built-in code conversions; I
have chosen only to do code conversions if iconv() is supported by the OS.
Because there were quite a lot of hacks to be done, for a variety of reasons,
I rewrote the code.

You can find the latest version of the original library at

  http://washitake.com/mail/exim/mime/

The code below is almost completely unlike the original. */


#include "exim.h"


/*************************************************
*                Do a QP conversion              *
*************************************************/

/* Decode the "Q" (quoted-printable style) data of an encoded word into raw
bytes. An underscore stands for a space, "=XX" (two hex digits, in either case)
stands for the byte with that value, and any other character is copied
unchanged. Literal spaces and tabs are not permitted in the data.

Arguments:
  string      the zero-terminated string that includes QP escapes
  ptrptr      where to return pointer to the decoded string; the block comes
                from store_get() and the pointer is set even when decoding
                subsequently fails

Returns:      the length of the decoded string, or -1 on failure (a malformed
                "=" escape, or white space in the data)
*/

static int
rfc2047_qpdecode(uschar *string, uschar **ptrptr)
{
int len = 0;
uschar *ptr;

/* Every input construct produces at most one output byte, so the input length
plus one for the terminating zero is always sufficient. */

ptr = *ptrptr = store_get(Ustrlen(string) + 1);

while (*string != 0)
  {
  int ch = *string++;

  if (ch == '_') *ptr++ = ' ';
  else if (ch == '=')
    {
    int a = *string;
    int b = (a == 0)? 0 : string[1];   /* Do not look past the terminator */
    if (!isxdigit(a) || !isxdigit(b)) return -1;  /* Bad QP string */

    /* The value of each digit is taken to be its offset within hex_digits
    (the lookup is done in lower case). The second difference is parenthesized
    so that both operands of "+" are integers; without the parentheses the
    shifted value is first added to a pointer, which forms an address well
    outside the hex_digits string before the subtraction brings it back. */

    *ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
              (Ustrchr(hex_digits, tolower(b)) - hex_digits);
    string += 2;
    }
  else if (ch == ' ' || ch == '\t') return -1;    /* Whitespace is illegal */
  else *ptr++ = ch;

  len++;
  }

*ptr = 0;
return len;
}


/*************************************************
*            Decode next MIME word               *
*************************************************/

/* Look for the next well-formed MIME word (=?charset?X?data?=) at or after the
given point, decode its data, and report where its separators are. Candidates
that are too long (when lencheck is set), whose encoding field is not exactly
one character, whose encoding letter is neither B nor Q, or whose data does not
decode, are passed over and the search resumes two characters after the
rejected "=?".

Arguments:
  string     subject string
  lencheck   TRUE to enforce maximum length check
  q1ptr      pass back address of first question mark
  q2ptr      pass back address of second question mark
  endptr     pass back address of final ?=
  dlenptr    pass back length of decoded string
  dptrptr    pass back pointer to decoded string

Returns:     address of =? or NULL if not present

The separator pointers are written while scanning, so they may have been
altered even when NULL is returned. The subject string is modified briefly
(a zero is planted at the final ?= while the data is decoded) and then put
back as it was.
*/

static uschar *
decode_mimeword(uschar *string, BOOL lencheck, uschar **q1ptr, uschar **q2ptr,
  uschar **endptr, size_t *dlenptr, uschar **dptrptr)
{
for (;;)
  {
  uschar *start;
  uschar *data;
  uschar *tail;
  int decoded_len;

  /* Locate the four separators in turn; if any one is missing there cannot be
  a MIME word anywhere in the rest of the string. */

  start = Ustrstr(string, "=?");
  if (start == NULL) return NULL;

  *q1ptr = Ustrchr(start + 2, '?');
  if (*q1ptr == NULL) return NULL;

  *q2ptr = Ustrchr(*q1ptr + 1, '?');
  if (*q2ptr == NULL) return NULL;

  *endptr = Ustrstr(*q2ptr + 1, "?=");
  if (*endptr == NULL) return NULL;

  /* Should this candidate be rejected, the next search must begin just after
  its initial =? because what was found might be =?xxx=?xxx?xxx?xxx?=. */

  string = start + 2;

  /* The encoding field must be a single character, and the whole word must
  respect the length limit when that check is requested. */

  if (*q2ptr - *q1ptr != 2 || (lencheck && *endptr - start > 73)) continue;

  /* Terminate the data temporarily so that the decoders see just this word,
  decode according to the encoding letter, then restore the question mark. */

  data = *q2ptr + 1;
  tail = *endptr;
  *tail = 0;

  switch (toupper((*q1ptr)[1]))
    {
    case 'B':
    decoded_len = auth_b64decode(data, dptrptr);
    break;

    case 'Q':
    decoded_len = rfc2047_qpdecode(data, dptrptr);
    break;

    default:
    decoded_len = -1;     /* Unknown encoding letter */
    break;
    }

  *tail = '?';

  /* A failed decode just means this was not a real MIME word; try again. */

  if (decoded_len < 0) continue;

  *dlenptr = (size_t)decoded_len;
  return start;
  }

/* Control should never actually get here */
}


/*************************************************
*    Decode and convert an RFC 2047 string       *
*************************************************/

/* There are two functions defined here. The original one was rfc2047_decode()
and it was documented in the local_scan() interface. I needed to add an extra
argument for use by expand_string(), so I created rfc2047_decode2() for that
purpose. The original function became a stub that just supplies NULL for the
new argument (sizeptr).

An RFC 2047-encoded string may contain one or more "words", each of the
form  =?...?.?...?=  with the first ... specifying the character code, the
second being Q (for quoted printable) or B for Base64 encoding. The third ...
is the actual data.

This function first decodes each "word" into bytes from the Q or B encoding.
Then, if provided with the name of a charset encoding, and if iconv() is
available, it attempts to translate the result to the named character set.
If the translation cannot be set up, or fails part of the way through, the
untranslated bytes are used for the remainder of that word. Only a failure of
iconv_open() is reported via the error argument; a failure of iconv() itself
is just written to the debugging output.

If a binary zero is encountered in the decoded string, it is replaced by the
contents of the zeroval argument. For use with Exim headers, the value must not
be 0 because they are handled as zero-terminated strings. When zeroval==0,
lenptr should not be NULL.

White space that lies between two adjacent MIME words is dropped; all other
text outside MIME words is copied unchanged.

Arguments:
    string       the subject string
    lencheck     TRUE to enforce maximum MIME word length
    target       the name of the target encoding for MIME words, or NULL for
                   no charset translation
    zeroval      the value to use for binary zero bytes
    lenptr       if not NULL, the length of the result is returned via
                   this variable
    sizeptr      if not NULL, the length of a new store block in which the
                   result is built is placed here; if no new store is obtained,
                   the value is not changed
    error        for error messages; always set, to NULL if no problem; it can
                   be non-NULL when the yield is non-NULL if iconv_open()
                   failed for one of the words

Returns:         the decoded, converted string; if there are no MIME words in
                   the string, the original string is returned. The interface
                   reserves NULL for errors, though no path in this function
                   itself yields NULL.
*/

uschar *
rfc2047_decode2(uschar *string, BOOL lencheck, uschar *target, int zeroval,
  int *lenptr, int *sizeptr, uschar **error)
{
int ptr = 0;
int size = Ustrlen(string);
size_t dlen;
uschar *dptr, *yield;
uschar *mimeword, *q1, *q2, *endword;

*error = NULL;
mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);

/* If there is nothing to decode, hand back the caller's own string; no new
store is obtained, so sizeptr is deliberately left alone. */

if (mimeword == NULL)
  {
  if (lenptr != NULL) *lenptr = size;
  return string;
  }

/* Scan through the string, decoding MIME words and copying intermediate text,
building the result as we go. The result may be longer than the input if it is
translated into a multibyte code such as UTF-8. That's why we use the dynamic
string building code. The initial block allows for the input length plus a
terminating zero. */

yield = store_get(++size);

while (mimeword != NULL)
  {

  #if HAVE_ICONV
  iconv_t icd = (iconv_t)(-1);
  #endif

  /* Copy any ordinary text that precedes this MIME word. */

  if (mimeword != string)
    yield = string_cat(yield, &size, &ptr, string, mimeword - string);

  /* Do a charset translation if required. This is supported only on hosts
  that have the iconv() function. A failure to open a conversion sets error,
  but we carry on, using the untranslated data. If there is more than one such
  failure, the message passed back refers to the final one. The charset name is
  terminated temporarily at the first question mark so that it can be used as a
  string, and no conversion is opened if it is the same (ignoring case) as the
  target. */

  #if HAVE_ICONV
  *q1 = 0;
  if (target != NULL && strcmpic(target, mimeword+2) != 0)
    {
    icd = iconv_open(CS target, CS(mimeword+2));

    if (icd == (iconv_t)(-1))
      {
      *error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
        target, mimeword+2, strerror(errno),
        (errno == EINVAL)? " (maybe unsupported conversion)" : "");
      }
    }
  *q1 = '?';
  #endif

  /* Add the decoded bytes to the result. We use a loop to cater for the case
  of long strings, which have to be translated in pieces through a fixed-size
  buffer - the RFC puts limits on the length, but it's best to be robust. */

  while (dlen > 0)
    {
    uschar *tptr = NULL;   /* Stops compiler warning */
    int tlen = -1;

    #if HAVE_ICONV
    uschar tbuffer[256];
    uschar *outptr = tbuffer;
    size_t outleft = sizeof(tbuffer);

    /* If translation is required, go for it. */

    if (icd != (iconv_t)(-1))
      {
      (void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);

      /* If outptr has been adjusted, there is some output. Set up to add it to
      the output buffer. The function will have adjusted dptr and dlen. If
      iconv() stopped because of an error, we'll pick it up next time when
      there's no output.

      If there is no output, we expect there to have been a translation
      error, because we know there was at least one input byte. We leave the
      value of tlen as -1, which causes the rest of the input to be copied
      verbatim. The error is only logged for debugging. Note that the precision
      for %.*s has to be an int, so the pointer difference is cast. */

      if (outptr > tbuffer)
        {
        tptr = tbuffer;
        tlen = outptr - tbuffer;
        }
      else
        {
        DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
        "%s\n", (int)(endword + 2 - mimeword), mimeword, target,
        strerror(errno));
        }
      }

    #endif

    /* No charset translation is happening or there was a translation error;
    just set up the original as the string to be added, and mark it all used.
    */

    if (tlen == -1)
      {
      tptr = dptr;
      tlen = dlen;
      dlen = 0;
      }

    /* Deal with zero values; convert them if requested. */

    if (zeroval != 0)
      {
      int i;
      for (i = 0; i < tlen; i++)
        if (tptr[i] == 0) tptr[i] = zeroval;
      }

    /* Add the new string onto the result */

    yield = string_cat(yield, &size, &ptr, tptr, tlen);
    }

  #if HAVE_ICONV
  if (icd != (iconv_t)(-1))  iconv_close(icd);
  #endif

  /* Update string past the MIME word; skip any white space if the next thing
  is another MIME word. If anything other than white space intervenes, string
  is left where it is, so that text is copied on the next pass. */

  string = endword + 2;
  mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
  if (mimeword != NULL)
    {
    uschar *s = string;
    while (isspace(*s)) s++;
    if (s == mimeword) string = s;
    }
  }

/* Copy the remaining characters of the string, zero-terminate it, and return
the length and the store block size as well if requested. */

yield = string_cat(yield, &size, &ptr, string, Ustrlen(string));
yield[ptr] = 0;
if (lenptr != NULL) *lenptr = ptr;
if (sizeptr != NULL) *sizeptr = size;
return yield;
}


/* The original entry point, without the sizeptr argument. It passes all its
arguments straight on to rfc2047_decode2(), supplying NULL for sizeptr, and
returns whatever that function returns. */

uschar *
rfc2047_decode(uschar *string, BOOL lencheck, uschar *target, int zeroval,
  int *lenptr, uschar **error)
{
int *no_sizeptr = NULL;    /* This interface does not report the block size */
uschar *decoded;

decoded = rfc2047_decode2(string, lencheck, target, zeroval, lenptr,
  no_sizeptr, error);
return decoded;
}

/* End of rfc2047.c */
Commit	Line	Data
	1	/* $Cambridge: exim/src/src/rfc2047.c,v 1.2 2005/01/04 10:00:42 ph10 Exp $ */
	2
	3	/*************************************************
	4	* Exim - an Internet mail transport agent *
	5	*************************************************/
	6
	7	/* Copyright (c) University of Cambridge 1995 - 2005 */
	8	/* See the file NOTICE for conditions of use and distribution. */
	9
	10	/* This file contains a function for decoding message header lines that may
	11	contain encoded "words" according to the rules described in
	12
	13	RFC-2047 at http://www.ietf.org/rfc/rfc2047.txt
	14
	15	The function is a rewritten version of code created by Norihisa Washitake.
	16	The original could be used both inside Exim (as part of a patch) or in a
	17	freestanding form. The original contained some built-in code conversions; I
	18	have chosen only to do code conversions if iconv() is supported by the OS.
	19	Because there were quite a lot of hacks to be done, for a variety of reasons,
	20	I rewrote the code.
	21
	22	You can find the latest version of the original library at
	23
	24	http://washitake.com/mail/exim/mime/
	25
	26	The code below is almost completely unlike the original. */
	27
	28
	29	#include "exim.h"
	30
	31
	32	/*************************************************
	33	* Do a QP conversion *
	34	*************************************************/
	35
	36	/* This function decodes "quoted printable" into bytes.
	37
	38	Arguments:
	39	string the string that includes QP escapes
	40	ptrptr where to return pointer to the decoded string
	41
	42	Returns: the length of the decoded string, or -1 on failure
	43	*/
	44
	45	static int
	46	rfc2047_qpdecode(uschar string, uschar *ptrptr)
	47	{
	48	int len = 0;
	49	uschar *ptr;
	50
	51	ptr = ptrptr = store_get(Ustrlen(string) + 1); / No longer than this */
	52
	53	while (*string != 0)
	54	{
	55	register int ch = *string++;
	56
	57	if (ch == '_') *ptr++ = ' ';
	58	else if (ch == '=')
	59	{
	60	int a = *string;
	61	int b = (a == 0)? 0 : string[1];
	62	if (!isxdigit(a) \|\| !isxdigit(b)) return -1; /* Bad QP string */
	63	*ptr++ = ((Ustrchr(hex_digits, tolower(a)) - hex_digits) << 4) +
	64	Ustrchr(hex_digits, tolower(b)) - hex_digits;
	65	string += 2;
	66	}
	67	else if (ch == ' ' \|\| ch == '\t') return -1; /* Whitespace is illegal */
	68	else *ptr++ = ch;
	69
	70	len++;
	71	}
	72
	73	*ptr = 0;
	74	return len;
	75	}
	76
	77
	78
	79	/*************************************************
	80	* Decode next MIME word *
	81	*************************************************/
	82
	83	/* Scan a string to see if a MIME word exists; pass back the separator
	84	points in the string.
	85
	86	Arguments:
	87	string subject string
	88	lencheck TRUE to enforce maximum length check
	89	q1ptr pass back address of first question mark
	90	q2ptr pass back address of second question mark
	91	endptr pass back address of final ?=
	92	dlenptr pass back length of decoded string
	93	dptrptr pass back pointer to decoded string
	94
	95	Returns: address of =? or NULL if not present
	96	*/
	97
	98	static uschar *
	99	decode_mimeword(uschar string, BOOL lencheck, uschar q1ptr, uschar *q2ptr,
	100	uschar *endptr, size_t dlenptr, uschar **dptrptr)
	101	{
	102	uschar *mimeword;
	103	for (;; string = mimeword + 2)
	104	{
	105	int encoding;
	106	int dlen = -1;
	107
	108	if ((mimeword = Ustrstr(string, "=?")) == NULL \|\|
	109	(*q1ptr = Ustrchr(mimeword+2, '?')) == NULL \|\|
	110	(q2ptr = Ustrchr(q1ptr+1, '?')) == NULL \|\|
	111	(endptr = Ustrstr(q2ptr+1, "?=")) == NULL) return NULL;
	112
	113	/* We have found =?xxx?xxx?xxx?= in the string. Optionally check the
	114	length, and that the second field is just one character long. If not,
	115	continue the loop to search again. We must start just after the initial =?
	116	because we might have found =?xxx=?xxx?xxx?xxx?=. */
	117
	118	if ((lencheck && endptr - mimeword > 73) \|\| q2ptr - *q1ptr != 2) continue;
	119
	120	/* Get the encoding letter, and decode the data string. */
	121
	122	encoding = toupper((*q1ptr)[1]);
	123	**endptr = 0;
	124	if (encoding == 'B')
	125	dlen = auth_b64decode(*q2ptr+1, dptrptr);
	126	else if (encoding == 'Q')
	127	dlen = rfc2047_qpdecode(*q2ptr+1, dptrptr);
	128	*endptr = '?'; / restore */
	129
	130	/* If the decoding succeeded, we are done. Set the length of the decoded
	131	string, and pass back the initial pointer. Otherwise, the loop continues. */
	132
	133	if (dlen >= 0)
	134	{
	135	*dlenptr = (size_t)dlen;
	136	return mimeword;
	137	}
	138	}
	139
	140	/* Control should never actually get here */
	141	}
	142
	143
	144
	145	/*************************************************
	146	* Decode and convert an RFC 2047 string *
	147	*************************************************/
	148
	149	/* There are two functions defined here. The original one was rfc2047_decode()
	150	and it was documented in the local_scan() interface. I needed to add an extra
	151	argument for use by expand_string(), so I created rfc2047_decode2() for that
	152	purpose. The original function became a stub that just supplies NULL for the
	153	new argument (sizeptr).
	154
	155	An RFC 2047-encoded string may contain one or more "words", each of the
	156	form =?...?.?...?= with the first ... specifying the character code, the
	157	second being Q (for quoted printable) or B for Base64 encoding. The third ...
	158	is the actual data.
	159
	160	This function first decodes each "word" into bytes from the Q or B encoding.
	161	Then, if provided with the name of a charset encoding, and if iconv() is
	162	available, it attempts to translate the result to the named character set.
	163	If this fails, the binary string is returned with an error message.
	164
	165	If a binary zero is encountered in the decoded string, it is replaced by the
	166	contents of the zeroval argument. For use with Exim headers, the value must not
	167	be 0 because they are handled as zero-terminated strings. When zeroval==0,
	168	lenptr should not be NULL.
	169
	170	Arguments:
	171	string the subject string
	172	lencheck TRUE to enforce maximum MIME word length
	173	target the name of the target encoding for MIME words, or NULL for
	174	no charset translation
	175	zeroval the value to use for binary zero bytes
	176	lenptr if not NULL, the length of the result is returned via
	177	this variable
	178	sizeptr if not NULL, the length of a new store block in which the
	179	result is built is placed here; if no new store is obtained,
	180	the value is not changed
	181	error for error messages; NULL if no problem; this can be set
	182	when the yield is non-NULL if there was a charset
	183	translation problem
	184
	185	Returns: the decoded, converted string, or NULL on error; if there are
	186	no MIME words in the string, the original string is returned
	187	*/
	188
	189	uschar *
	190	rfc2047_decode2(uschar string, BOOL lencheck, uschar target, int zeroval,
	191	int lenptr, int sizeptr, uschar **error)
	192	{
	193	int ptr = 0;
	194	int size = Ustrlen(string);
	195	size_t dlen;
	196	uschar dptr, yield;
	197	uschar mimeword, q1, q2, endword;
	198
	199	*error = NULL;
	200	mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
	201
	202	if (mimeword == NULL)
	203	{
	204	if (lenptr != NULL) *lenptr = size;
	205	return string;
	206	}
	207
	208	/* Scan through the string, decoding MIME words and copying intermediate text,
	209	building the result as we go. The result may be longer than the input if it is
	210	translated into a multibyte code such as UTF-8. That's why we use the dynamic
	211	string building code. */
	212
	213	yield = store_get(++size);
	214
	215	while (mimeword != NULL)
	216	{
	217
	218	#if HAVE_ICONV
	219	iconv_t icd = (iconv_t)(-1);
	220	#endif
	221
	222	if (mimeword != string)
	223	yield = string_cat(yield, &size, &ptr, string, mimeword - string);
	224
	225	/* Do a charset translation if required. This is supported only on hosts
	226	that have the iconv() function. Translation errors set error, but carry on,
	227	using the untranslated data. If there is more than one error, the message
	228	passed back refers to the final one. We use a loop to cater for the case
	229	of long strings - the RFC puts limits on the length, but it's best to be
	230	robust. */
	231
	232	#if HAVE_ICONV
	233	*q1 = 0;
	234	if (target != NULL && strcmpic(target, mimeword+2) != 0)
	235	{
	236	icd = iconv_open(CS target, CS(mimeword+2));
	237
	238	if (icd == (iconv_t)(-1))
	239	{
	240	*error = string_sprintf("iconv_open(\"%s\", \"%s\") failed: %s%s",
	241	target, mimeword+2, strerror(errno),
	242	(errno == EINVAL)? " (maybe unsupported conversion)" : "");
	243	}
	244	}
	245	*q1 = '?';
	246	#endif
	247
	248	while (dlen > 0)
	249	{
	250	uschar tptr = NULL; / Stops compiler warning */
	251	int tlen = -1;
	252
	253	#if HAVE_ICONV
	254	uschar tbuffer[256];
	255	uschar *outptr = tbuffer;
	256	size_t outleft = sizeof(tbuffer);
	257
	258	/* If translation is required, go for it. */
	259
	260	if (icd != (iconv_t)(-1))
	261	{
	262	(void)iconv(icd, (ICONV_ARG2_TYPE)(&dptr), &dlen, CSS &outptr, &outleft);
	263
	264	/* If outptr has been adjusted, there is some output. Set up to add it to
	265	the output buffer. The function will have adjusted dptr and dlen. If
	266	iconv() stopped because of an error, we'll pick it up next time when
	267	there's no output.
	268
	269	If there is no output, we expect there to have been a translation
	270	error, because we know there was at least one input byte. We leave the
	271	value of tlen as -1, which causes the rest of the input to be copied
	272	verbatim. */
	273
	274	if (outptr > tbuffer)
	275	{
	276	tptr = tbuffer;
	277	tlen = outptr - tbuffer;
	278	}
	279	else
	280	{
	281	DEBUG(D_any) debug_printf("iconv error translating \"%.*s\" to %s: "
	282	"%s\n", endword + 2 - mimeword, mimeword, target, strerror(errno));
	283	}
	284	}
	285
	286	#endif
	287
	288	/* No charset translation is happening or there was a translation error;
	289	just set up the original as the string to be added, and mark it all used.
	290	*/
	291
	292	if (tlen == -1)
	293	{
	294	tptr = dptr;
	295	tlen = dlen;
	296	dlen = 0;
	297	}
	298
	299	/* Deal with zero values; convert them if requested. */
	300
	301	if (zeroval != 0)
	302	{
	303	int i;
	304	for (i = 0; i < tlen; i++)
	305	if (tptr[i] == 0) tptr[i] = zeroval;
	306	}
	307
	308	/* Add the new string onto the result */
	309
	310	yield = string_cat(yield, &size, &ptr, tptr, tlen);
	311	}
	312
	313	#if HAVE_ICONV
	314	if (icd != (iconv_t)(-1)) iconv_close(icd);
	315	#endif
	316
	317	/* Update string past the MIME word; skip any white space if the next thing
	318	is another MIME word. */
	319
	320	string = endword + 2;
	321	mimeword = decode_mimeword(string, lencheck, &q1, &q2, &endword, &dlen, &dptr);
	322	if (mimeword != NULL)
	323	{
	324	uschar *s = string;
	325	while (isspace(*s)) s++;
	326	if (s == mimeword) string = s;
	327	}
	328	}
	329
	330	/* Copy the remaining characters of the string, zero-terminate it, and return
	331	the length as well if requested. */
	332
	333	yield = string_cat(yield, &size, &ptr, string, Ustrlen(string));
	334	yield[ptr] = 0;
	335	if (lenptr != NULL) *lenptr = ptr;
	336	if (sizeptr != NULL) *sizeptr = size;
	337	return yield;
	338	}
	339
	340
	341	/* This is the stub that provides the original interface without the sizeptr
	342	argument. */
	343
	344	uschar *
	345	rfc2047_decode(uschar string, BOOL lencheck, uschar target, int zeroval,
	346	int lenptr, uschar *error)
	347	{
	348	return rfc2047_decode2(string, lencheck, target, zeroval, lenptr, NULL, error);
	349	}
	350
	351	/* End of rfc2047.c */