rewriting utf8 encoding function. preg_replace is safer than splitting into
[squirrelmail.git] / functions / encode / utf_8.php
1 <?php
2 /**
3 * utf-8 encoding functions
4 *
5 * takes a string of unicode entities and converts it to a utf-8 encoded string
6 * each unicode entity has the form &#nnn(nn); n={0..9} and can be displayed by utf-8 supporting
7 * browsers. Ascii will not be modified.
8 *
9 * Original code is taken from www.php.net manual comments
10 * Original author: ronen at greyzone dot com
11 *
12 * @version $Id$
13 * @copyright Copyright &copy; SquirrelMail Development Team, 2004
14 * @package squirrelmail
15 * @subpackage encode
16 */
17
/**
 * Converts string to utf-8
 *
 * Every decimal numeric entity (&#nnn;) found in the text is replaced with
 * the string returned by unicodetoutf8() for that number. Text that holds
 * no such entity is returned untouched. Hexadecimal entities (&#xhhh;) are
 * not matched by the patterns used here and are left as they are.
 *
 * @param string $string text with numeric unicode entities
 * @return string utf-8 encoded text
 */
function charset_encode_utf_8 ($string) {
    // don't run encoding function, if there are no encoded characters
    if (! preg_match("'&#[0-9]+;'", $string)) {
        return $string;
    }

    // The /e (eval) pattern modifier was deprecated in PHP 5.5 and removed
    // in PHP 7.0, where preg_replace() rejects it and returns NULL. A
    // callback does the same job without evaluating generated code.
    $encoded = preg_replace_callback(
        '/&#([0-9]+);/',
        function ($match) {
            // $match[1] holds the decimal digits captured between "&#" and ";"
            return unicodetoutf8($match[1]);
        },
        $string
    );

    // preg_replace_callback() yields NULL on a PCRE error; hand back the
    // unmodified input instead of losing the text.
    return ($encoded === null) ? $string : $encoded;
}
32
/**
 * Return utf8 symbol when unicode character number is provided
 *
 * This function is used internally by charset_encode_utf_8
 * function. It might be unavailable to other squirrelmail functions.
 * Don't use it or make sure, that functions/encode/utf_8.php is
 * included.
 *
 * Only values that utf-8 can legally carry are encoded: 0 to 0x10FFFF,
 * without the surrogate range 0xD800 to 0xDFFF. Any other value
 * (negative, too large, surrogate or not numeric) gives '?'.
 *
 * @param int $var decimal unicode value
 * @return string utf8 character (one to four bytes), or '?' when the
 *                value can't be represented in utf-8
 */
function unicodetoutf8($var) {
    // Range check is done on the raw value, so that huge numeric strings
    // are rejected before the integer cast can overflow.
    if (! is_numeric($var) || $var < 0 || $var > 0x10FFFF) {
        // there is no such symbol in utf-8
        return '?';
    }

    $code = (int) $var;

    // Surrogate halves are reserved for UTF-16 and are not valid in utf-8
    if ($code >= 0xD800 && $code <= 0xDFFF) {
        return '?';
    }

    if ($code < 0x80) {
        // Ascii, single byte
        return chr($code);
    }

    if ($code < 0x800) {
        // Two byte utf-8: 110xxxxx 10xxxxxx
        return chr(0xC0 | ($code >> 6))
             . chr(0x80 | ($code & 0x3F));
    }

    if ($code < 0x10000) {
        // Three byte utf-8: 1110xxxx 10xxxxxx 10xxxxxx
        return chr(0xE0 | ($code >> 12))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    }

    // Four byte utf-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr(0xF0 | ($code >> 18))
         . chr(0x80 | (($code >> 12) & 0x3F))
         . chr(0x80 | (($code >> 6) & 0x3F))
         . chr(0x80 | ($code & 0x3F));
}
119 ?>