UTF8/locale: document constraints on current expansions.

[exim.git] / doc / doc-docbook / spec.xfpt
diff --git a/doc/doc-docbook/spec.xfpt b/doc/doc-docbook/spec.xfpt

index 9cddddde5e4f42974f4ef2ea8296f054286a2e8f..f97a430a6c74b7c2ccf747ed3fcb6a1e51960272 100644 (file)
--- a/doc/doc-docbook/spec.xfpt
+++ b/doc/doc-docbook/spec.xfpt
@@ -3966,8 +3966,17 @@ the messages are active, their status is not altered. This option can be used
  only by an admin user or by the user who originally caused the message to be
  placed on the queue.
  
+. .new
+. .vitem &%-MS%&
+. .oindex "&%-MS%&"
+. .cindex REQUIRETLS
+. This option is used to request REQUIRETLS processing on the message.
+. It is used internally by Exim in conjunction with -E when generating
+. a bounce message.
+. .wen
+
  .vitem &%-Mset%&&~<&'message&~id'&>
-.oindex "&%-Mset%&
+.oindex "&%-Mset%&"
  .cindex "testing" "string expansion"
  .cindex "expansion" "testing"
  This option is useful only in conjunction with &%-be%& (that is, when testing
@@ -9628,9 +9637,10 @@ some of the braces:
  .code
  ${length_<n>:<string>}
  .endd
-The result of this item is either the first <&'n'&> characters or the whole
+The result of this item is either the first <&'n'&> bytes or the whole
  of <&'string2'&>, whichever is the shorter. Do not confuse &%length%& with
  &%strlen%&, which gives the length of a string.
+All measurement is done in bytes and is not UTF-8 aware.
  
  
  .vitem "&*${listextract{*&<&'number'&>&*}&&&
@@ -10062,6 +10072,8 @@ ${sg{1=A 4=D 3=C}{\N(\d+)=\N}{K\$1=}}
  yields &"K1=A K4=D K3=C"&. Note the use of &`\N`& to protect the contents of
  the regular expression from string expansion.
  
+The regular expression is compiled in 8-bit mode, working against bytes
+rather than any Unicode-aware character handling.
  
  
  .vitem &*${sort{*&<&'string'&>&*}{*&<&'comparator'&>&*}{*&<&'extractor'&>&*}}*&
@@ -10118,11 +10130,11 @@ ${substr{3}{2}{$local_part}}
  If the starting offset is greater than the string length the result is the
  null string; if the length plus starting offset is greater than the string
  length, the result is the right-hand part of the string, starting from the
-given offset. The first character in the string has offset zero.
+given offset. The first byte (character) in the string has offset zero.
  
  The &%substr%& expansion item can take negative offset values to count
-from the right-hand end of its operand. The last character is offset -1, the
-second-last is offset -2, and so on. Thus, for example,
+from the right-hand end of its operand. The last byte (character) is offset -1,
+the second-last is offset -2, and so on. Thus, for example,
  .code
  ${substr{-5}{2}{1234567}}
  .endd
@@ -10139,7 +10151,7 @@ ${substr{-3}{2}{12}}
  yields &"1"&.
  
  When the second number is omitted from &%substr%&, the remainder of the string
-is taken if the offset is positive. If it is negative, all characters in the
+is taken if the offset is positive. If it is negative, all bytes (characters) in the
  string preceding the offset point are taken. For example, an offset of -1 and
  no length, as in these semantically identical examples:
  .code
@@ -10148,13 +10160,15 @@ ${substr{-1}{abcde}}
  .endd
  yields all but the last character of the string, that is, &"abcd"&.
  
+All measurement is done in bytes and is not UTF-8 aware.
+
  
  
  .vitem "&*${tr{*&<&'subject'&>&*}{*&<&'characters'&>&*}&&&
          {*&<&'replacements'&>&*}}*&"
  .cindex "expansion" "character translation"
  .cindex "&%tr%& expansion item"
-This item does single-character translation on its subject string. The second
+This item does single-character (in bytes) translation on its subject string. The second
  argument is a list of characters to be translated in the subject string. Each
  matching character is replaced by the corresponding character from the
  replacement list. For example
@@ -10165,6 +10179,9 @@ yields &`1b3de1`&. If there are duplicates in the second character string, the
  last occurrence is used. If the third string is shorter than the second, its
  last character is replicated. However, if it is empty, no translation takes
  place.
+
+All character handling is done in bytes and is not UTF-8 aware.
+
  .endlist
  
  
@@ -10184,6 +10201,8 @@ The string is interpreted as an RFC 2822 address, as it might appear in a
  header line, and the effective address is extracted from it. If the string does
  not parse successfully, the result is empty.
  
+The parsing correctly handles SMTPUTF8 Unicode in the string.
+
  
  .vitem &*${addresses:*&<&'string'&>&*}*&
  .cindex "expansion" "RFC 2822 address handling"
@@ -10227,7 +10246,7 @@ It does not see the comma because it's still encoded as "=2C".  The second
  example below is passed the contents of &`$header_from:`&, meaning it gets
  de-mimed. Exim sees the decoded "," so it treats it as &*two*& email addresses.
  The third example shows that the presence of a comma is skipped when it is
-quoted.
+quoted.  The fourth example shows SMTPUTF8 handling.
  .code
  # exim -be '${addresses:From: \
  =?iso-8859-2?Q?Last=2C_First?= <user@example.com>}'
@@ -10236,6 +10255,8 @@ user@example.com
  Last:user@example.com
  # exim -be '${addresses:From: "Last, First" <user@example.com>}'
  user@example.com
+# exim -be '${addresses:フィル <フィリップ@example.jp>}'
+フィリップ@example.jp
  .endd
  
  .vitem &*${base32:*&<&'digits'&>&*}*&
@@ -10425,7 +10446,7 @@ abbreviation &%h%& can be used when &%hash%& is used as an operator.
  .cindex "expansion" "hex to base64"
  .cindex "&%hex2b64%& expansion item"
  This operator converts a hex string into one that is base64 encoded. This can
-be useful for processing the output of the MD5 and SHA-1 hashing functions.
+be useful for processing the output of the various hashing functions.
  
  
  
@@ -10467,6 +10488,7 @@ This forces the letters in the string into lower-case, for example:
  .code
  ${lc:$local_part}
  .endd
+Case is defined per the system C locale.
  
  .vitem &*${length_*&<&'number'&>&*:*&<&'string'&>&*}*&
  .cindex "expansion" "string truncation"
@@ -10480,6 +10502,7 @@ ${length{<number>}{<string>}}
  See the description of the general &%length%& item above for details. Note that
  &%length%& is not the same as &%strlen%&. The abbreviation &%l%& can be used
  when &%length%& is used as an operator.
+All measurement is done in bytes and is not UTF-8 aware.
  
  
  .vitem &*${listcount:*&<&'string'&>&*}*&
@@ -10507,6 +10530,7 @@ matching list is returned.
  The string is interpreted as an RFC 2822 address and the local part is
  extracted from it. If the string does not parse successfully, the result is
  empty.
+The parsing correctly handles SMTPUTF8 Unicode in the string.
  
  
  .vitem &*${mask:*&<&'IP&~address'&>&*/*&<&'bit&~count'&>&*}*&
@@ -10589,6 +10613,10 @@ example, a plus sign would not cause quoting (but it would for &%quote%&).
  If you are creating a new email address from the contents of &$local_part$&
  (or any other unknown data), you should always use this operator.
  
+This quoting determination is not SMTPUTF8-aware, so non-ASCII data
+will likely be rendered in the quoted form.
+For example, &'${quote_local_part:フィル}'& will always become &'"フィル"'&.
+
  
  .vitem &*${quote_*&<&'lookup-type'&>&*:*&<&'string'&>&*}*&
  .cindex "quoting" "lookup-specific"
@@ -10752,6 +10780,7 @@ Now deprecated, a synonym for the &%base64%& expansion operator.
  .cindex "&%strlen%& expansion item"
  The item is replaced by the length of the expanded string, expressed as a
  decimal number. &*Note*&: Do not confuse &%strlen%& with &%length%&.
+All measurement is done in bytes and is not UTF-8 aware.
  
  
  .vitem &*${substr_*&<&'start'&>&*_*&<&'length'&>&*:*&<&'string'&>&*}*&
@@ -10766,6 +10795,7 @@ ${substr{<start>}{<length>}{<string>}}
  .endd
  See the description of the general &%substr%& item above for details. The
  abbreviation &%s%& can be used when &%substr%& is used as an operator.
+All measurement is done in bytes and is not UTF-8 aware.
  
  .vitem &*${time_eval:*&<&'string'&>&*}*&
  .cindex "&%time_eval%& expansion item"
@@ -10788,6 +10818,7 @@ number of larger units and output in Exim's normal time format, for example,
  .cindex "expansion" "case forcing"
  .cindex "&%uc%& expansion item"
  This forces the letters in the string into upper-case.
+Case is defined per the system C locale.
  
  .vitem &*${utf8clean:*&<&'string'&>&*}*&
  .cindex "correction of invalid utf-8 sequences in strings"
@@ -10796,6 +10827,20 @@ This forces the letters in the string into upper-case.
  .cindex "expansion" "utf-8 forcing"
  .cindex "&%utf8clean%& expansion item"
  This replaces any invalid utf-8 sequence in the string by the character &`?`&.
+.new
+In versions of Exim before 4.92, this did not correctly do so for a truncated
+final codepoint's encoding, and the character would be silently dropped.
+If you must handle detection of this scenario across both sets of Exim behavior,
+the complexity will depend upon the task.
+For instance, to detect if the first character is multibyte and a 1-byte
+extraction can be successfully used as a path component (as is common for
+dividing up delivery folders), you might use:
+.code
+condition = ${if inlist{${utf8clean:${length_1:$local_part}}}{:?}{yes}{no}}
+.endd
+(which will false-positive if the first character of the local part is a
+literal question mark).
+.wen
  
  .vitem "&*${utf8_domain_to_alabel:*&<&'string'&>&*}*&" &&&
         "&*${utf8_domain_from_alabel:*&<&'string'&>&*}*&" &&&
@@ -11018,7 +11063,8 @@ the header name must be terminated by a colon if white space does not follow.
  .cindex "&%eqi%& expansion condition"
  The two substrings are first expanded. The condition is true if the two
  resulting strings are identical. For &%eq%& the comparison includes the case of
-letters, whereas for &%eqi%& the comparison is case-independent.
+letters, whereas for &%eqi%& the comparison is case-independent, where
+case is defined per the system C locale.
  
  .vitem &*exists&~{*&<&'file&~name'&>&*}*&
  .cindex "expansion" "file existence test"
@@ -11081,6 +11127,7 @@ The two substrings are first expanded. The condition is true if the first
  string is lexically greater than or equal to the second string. For &%ge%& the
  comparison includes the case of letters, whereas for &%gei%& the comparison is
  case-independent.
+Case and collation order are defined per the system C locale.
  
  .vitem &*gt&~{*&<&'string1'&>&*}{*&<&'string2'&>&*}*& &&&
         &*gti&~{*&<&'string1'&>&*}{*&<&'string2'&>&*}*&
@@ -11092,6 +11139,7 @@ The two substrings are first expanded. The condition is true if the first
  string is lexically greater than the second string. For &%gt%& the comparison
  includes the case of letters, whereas for &%gti%& the comparison is
  case-independent.
+Case and collation order are defined per the system C locale.
  
  .vitem &*inlist&~{*&<&'string1'&>&*}{*&<&'string2'&>&*}*& &&&
         &*inlisti&~{*&<&'string1'&>&*}{*&<&'string2'&>&*}*&
@@ -11100,6 +11148,7 @@ case-independent.
  Both strings are expanded; the second string is treated as a list of simple
  strings; if the first string is a member of the second, then the condition
  is true.
+For the case-independent &%inlisti%& condition, case is defined per the system C locale.
  
  These are simpler to use versions of the more powerful &*forany*& condition.
  Examples, and the &*forany*& equivalents:
@@ -11166,6 +11215,7 @@ The two substrings are first expanded. The condition is true if the first
  string is lexically less than or equal to the second string. For &%le%& the
  comparison includes the case of letters, whereas for &%lei%& the comparison is
  case-independent.
+Case and collation order are defined per the system C locale.
  
  .vitem &*lt&~{*&<&'string1'&>&*}{*&<&'string2'&>&*}*& &&&
         &*lti&~{*&<&'string1'&>&*}{*&<&'string2'&>&*}*&
@@ -11177,6 +11227,7 @@ The two substrings are first expanded. The condition is true if the first
  string is lexically less than the second string. For &%lt%& the comparison
  includes the case of letters, whereas for &%lti%& the comparison is
  case-independent.
+Case and collation order are defined per the system C locale.
  
  
  .vitem &*match&~{*&<&'string1'&>&*}{*&<&'string2'&>&*}*&
@@ -11203,6 +11254,8 @@ metacharacter, but if there is no circumflex, the expression is not anchored,
  and it may match anywhere in the subject, not just at the start. If you want
  the pattern to match at the end of the subject, you must include the &`$`&
  metacharacter at an appropriate point.
+All character handling is done in bytes and is not UTF-8 aware,
+but we might change this in a future Exim release.
  
  .cindex "numerical variables (&$1$& &$2$& etc)" "in &%if%& expansion"
  At the start of an &%if%& expansion the values of the numeric variable
@@ -24707,6 +24760,16 @@ The &%tls_verify_certificates%& option must also be set.
  If both this option and &%tls_try_verify_hosts%& are unset
  operation is as if this option selected all hosts.
  
+.new
+.option utf8_downconvert smtp integer!! unset
+.cindex utf8 "address downconversion"
+.cindex i18n "utf8 address downconversion"
+If built with internationalization support,
+this option controls conversion of UTF-8 in message addresses
+to a-label form.
+For details see section &<<SECTi18nMTA>>&.
+.wen
+
  
  
  
@@ -26830,15 +26893,17 @@ without code changes in Exim.
  
  
  .option server_channelbinding gsasl boolean false
+Do not set this true without consulting a cryptographic engineer.
+
  Some authentication mechanisms are able to use external context at both ends
  of the session to bind the authentication to that context, and fail the
  authentication process if that context differs.  Specifically, some TLS
  ciphersuites can provide identifying information about the cryptographic
  context.
  
-This means that certificate identity and verification becomes a non-issue,
-as a man-in-the-middle attack will cause the correct client and server to
-see different identifiers and authentication will fail.
+This should have meant that certificate identity and verification becomes a
+non-issue, as a man-in-the-middle attack will cause the correct client and
+server to see different identifiers and authentication will fail.
  
  This is currently only supported when using the GnuTLS library.  This is
  only usable by mechanisms which support "channel binding"; at time of
@@ -26846,7 +26911,11 @@ writing, that's the SCRAM family.
  
  This defaults off to ensure smooth upgrade across Exim releases, in case
  this option causes some clients to start failing.  Some future release
-of Exim may switch the default to be true.
+of Exim might have switched the default to be true.
+
+However, Channel Binding in TLS has proven to be broken in current versions.
+Do not plan to rely upon this feature for security, ever, without consulting
+with a subject matter expert (a cryptographic engineer).
  
  
  .option server_hostname gsasl string&!! "see below"
@@ -28158,20 +28227,48 @@ Support for client-side operation of DANE can be included at compile time by def
  in &_Local/Makefile_&.
  If it has been included, the macro "_HAVE_DANE" will be defined.
  
-The TLSA record for the server may have "certificate usage" of DANE-TA(2) or DANE-EE(3).  The latter specifies
-the End Entity directly, i.e. the certificate involved is that of the server (and should be the sole one transmitted
-during the TLS handshake); this is appropriate for a single system, using a self-signed certificate.
+The TLSA record for the server may have "certificate usage" of DANE-TA(2) or DANE-EE(3).
+These are the "Trust Anchor" and "End Entity" variants.
+The latter specifies the End Entity directly, i.e. the certificate involved is that of the server
+(and if only DANE-EE is used then it should be the sole one transmitted during the TLS handshake);
+this is appropriate for a single system, using a self-signed certificate.
  DANE-TA usage is effectively declaring a specific CA to be used; this might be a private CA or a public,
-well-known one.  A private CA at simplest is just a self-signed certificate which is used to sign
-cerver certificates, but running one securely does require careful arrangement.  If a private CA is used
-then either all clients must be primed with it, or (probably simpler) the server TLS handshake must transmit
-the entire certificate chain from CA to server-certificate.  If a public CA is used then all clients must be primed with it
-(losing one advantage of DANE) - but the attack surface is reduced from all public CAs to that single CA.
+well-known one.
+A private CA at simplest is just a self-signed certificate (with certain
+attributes) which is used to sign server certificates, but running one securely
+does require careful arrangement.
+With DANE-TA, as implemented in Exim and commonly in other MTAs,
+the server TLS handshake must transmit the entire certificate chain from CA to server-certificate.
  DANE-TA is commonly used for several services and/or servers, each having a TLSA query-domain CNAME record,
  all of which point to a single TLSA record.
+DANE-TA and DANE-EE can both be used together.
  
-Another approach which should be seriously considered is to use DANE with a certificate
-from a public CA, because of another technology, "MTA-STS", described below.
+.new
+Our recommendation is to use DANE with a certificate from a public CA,
+because this enables a variety of strategies for remote clients to verify
+your certificate.
+You can then publish information both via DANE and another technology,
+"MTA-STS", described below.
+
+When you use DANE-TA to publish trust anchor information, you ask entities
+outside your administrative control to trust the Certificate Authority for
+connections to you.
+If using a private CA then you should expect others to still apply the
+technical criteria they'd use for a public CA to your certificates.
+In particular, you should probably try to follow current best practices for CA
+operation around hash algorithms and key sizes.
+Do not expect other organizations to lower their security expectations just
+because a particular profile might be reasonable for your own internal use.
+
+When this text was last updated, this in practice means to avoid use of SHA-1
+and MD5; if using RSA to use key sizes of at least 2048 bits (and no larger
+than 4096, for interoperability); to use keyUsage fields correctly; to use
+random serial numbers.
+The list of requirements is subject to change as best practices evolve.
+If you're not already using a private CA, or it doesn't meet these
+requirements, then we encourage you to avoid all these issues and use a public
+CA such as &url(https://letsencrypt.org/,Let's Encrypt) instead.
+.wen
  
  The TLSA record should have a Selector field of SPKI(1) and a Matching Type field of SHA2-512(2).
  
@@ -28189,6 +28286,16 @@ are workable for 4th-field hashes.
  
  For use with the DANE-TA model, server certificates must have a correct name (SubjectName or SubjectAltName).
  
+.new
+The Certificate issued by the CA published in the DANE-TA model should be
+issued using a strong hash algorithm.
+Exim, and importantly various other MTAs sending to you, will not
+re-enable hash algorithms which have been disabled by default in TLS
+libraries.
+This means no MD5 and no SHA-1.  SHA2-256 is the minimum for reliable
+interoperability (and probably the maximum too, in 2018).
+.wen
+
  The use of OCSP-stapling should be considered, allowing for fast revocation of certificates (which would otherwise
  be limited by the DNS TTL on the TLSA records).  However, this is likely to only be usable with DANE-TA.  NOTE: the
  default of requesting OCSP for all hosts is modified iff DANE is in use, to:
@@ -28271,8 +28378,8 @@ MTA-STS to let those clients who do use that protocol derive trust
  information.
  
  The MTA-STS design requires a certificate from a public Certificate Authority
-which is recognized by clients sending to you.  That selection is outside your
-control.
+which is recognized by clients sending to you.
+That selection of which CAs are trusted by others is outside your control.
  
  The most interoperable course of action is probably to use
  &url(https://letsencrypt.org/,Let's Encrypt), with automated certificate
@@ -38589,6 +38696,12 @@ two files contains the final component of its own name as its first line. This
  is insurance against disk crashes where the directory is lost but the files
  themselves are recoverable.
  
+.new
+The file formats may be changed, or new formats added, at any release.
+Spool files are not intended as an interface to other programs
+and should not be used as such.
+.wen
+
  Some people are tempted into editing -D files in order to modify messages. You
  need to be extremely careful if you do this; it is not recommended and you are
  on your own if you do it. Here are some of the pitfalls:
@@ -39242,8 +39355,10 @@ hash-method or key-size:
         set dkim_verify_reason = hash too weak or key too short
  .endd
  
-After all the DKIM ACL runs have completed, the value becomes a
+So long as a DKIM ACL is defined (it need do no more than accept),
+after all the DKIM ACL runs have completed, the value becomes a
  colon-separated list of the values after each run.
+This is maintained for the mime, prdr and data ACLs.
  
  .vitem &%$dkim_verify_reason%&
  A string giving a little bit more detail when &%$dkim_verify_status%& is either
@@ -39808,6 +39923,12 @@ If a value is appended it may be:
  If mua_wrapper is set, the utf8_downconvert control
  is initially set to -1.
  
+.new
+The smtp transport has an option &%utf8_downconvert%&.
+If set it must expand to one of the three values described above,
+and it overrides any previously set value.
+.wen
+
  
  There is no explicit support for VRFY and EXPN.
  Configurations supporting these should inspect