Mercurial > hg > cmmr2012-drupal-site
comparison vendor/masterminds/html5/src/HTML5/Parser/UTF8Utils.php @ 5:12f9dff5fda9 tip
Update to Drupal core 8.7.1
author | Chris Cannam |
---|---|
date | Thu, 09 May 2019 15:34:47 +0100 |
parents | a9cd425dd02b |
children |
comparison
equal
deleted
inserted
replaced
4:a9cd425dd02b | 5:12f9dff5fda9 |
---|---|
1 <?php | 1 <?php |
2 | 2 |
3 namespace Masterminds\HTML5\Parser; | 3 namespace Masterminds\HTML5\Parser; |
4 | 4 |
5 /* | 5 /* |
6 * | 6 Portions based on code from html5lib files with the following copyright: |
7 * Portions based on code from html5lib files with the following copyright: | |
8 | 7 |
9 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/> | 8 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/> |
10 | 9 |
11 Permission is hereby granted, free of charge, to any person obtaining a | 10 Permission is hereby granted, free of charge, to any person obtaining a |
12 copy of this software and associated documentation files (the | 11 copy of this software and associated documentation files (the |
24 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. | 23 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
25 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY | 24 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
26 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | 25 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
27 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | 26 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
28 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | 27 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
29 | |
30 */ | 28 */ |
31 | 29 |
32 use Masterminds\HTML5\Exception; | 30 use Masterminds\HTML5\Exception; |
33 | 31 |
34 /** | |
35 * UTF-8 Utilities. | |
36 */ | |
37 class UTF8Utils | 32 class UTF8Utils |
38 { | 33 { |
39 /** | 34 /** |
40 * The Unicode replacement character.. | 35 * The Unicode replacement character. |
41 */ | 36 */ |
42 const FFFD = "\xEF\xBF\xBD"; | 37 const FFFD = "\xEF\xBF\xBD"; |
43 | 38 |
44 /** | 39 /** |
45 * Count the number of characters in a string. | 40 * Count the number of characters in a string. |
46 * | 41 * UTF-8 aware. This will try (in order) iconv, MB, libxml, and finally a custom counter. |
47 * UTF-8 aware. This will try (in order) iconv, | |
48 * MB, libxml, and finally a custom counter. | |
49 * | |
50 * @todo Move this to a general utility class. | |
51 * | 42 * |
52 * @param string $string | 43 * @param string $string |
53 * | 44 * |
54 * @return int | 45 * @return int |
55 */ | 46 */ |
56 public static function countChars($string) | 47 public static function countChars($string) |
57 { | 48 { |
58 // Get the length for the string we need. | 49 // Get the length for the string we need. |
59 if (function_exists('mb_strlen')) { | 50 if (function_exists('mb_strlen')) { |
60 return mb_strlen($string, 'utf-8'); | 51 return mb_strlen($string, 'utf-8'); |
61 } elseif (function_exists('iconv_strlen')) { | 52 } |
53 | |
54 if (function_exists('iconv_strlen')) { | |
62 return iconv_strlen($string, 'utf-8'); | 55 return iconv_strlen($string, 'utf-8'); |
63 } elseif (function_exists('utf8_decode')) { | 56 } |
57 | |
58 if (function_exists('utf8_decode')) { | |
64 // MPB: Will this work? Won't certain decodes lead to two chars | 59 // MPB: Will this work? Won't certain decodes lead to two chars |
65 // extrapolated out of 2-byte chars? | 60 // extrapolated out of 2-byte chars? |
66 return strlen(utf8_decode($string)); | 61 return strlen(utf8_decode($string)); |
67 } | 62 } |
63 | |
68 $count = count_chars($string); | 64 $count = count_chars($string); |
65 | |
69 // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range) | 66 // 0x80 = 0x7F - 0 + 1 (one added to get inclusive range) |
70 // 0x33 = 0xF4 - 0x2C + 1 (one added to get inclusive range) | 67 // 0x33 = 0xF4 - 0x2C + 1 (one added to get inclusive range) |
71 return array_sum(array_slice($count, 0, 0x80)) + array_sum(array_slice($count, 0xC2, 0x33)); | 68 return array_sum(array_slice($count, 0, 0x80)) + array_sum(array_slice($count, 0xC2, 0x33)); |
72 } | 69 } |
73 | 70 |
83 * @return string | 80 * @return string |
84 */ | 81 */ |
85 public static function convertToUTF8($data, $encoding = 'UTF-8') | 82 public static function convertToUTF8($data, $encoding = 'UTF-8') |
86 { | 83 { |
87 /* | 84 /* |
88 * From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted to Unicode characters for the tokeniser, as described by the rules for that encoding, except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes in the original byte stream that could not be converted to Unicode characters must be converted to U+FFFD REPLACEMENT CHARACTER code points. | 85 * From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted |
86 * to Unicode characters for the tokeniser, as described by the rules for that encoding, | |
87 * except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped | |
88 * by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes | |
89 * in the original byte stream that could not be converted to Unicode characters must be | |
90 * converted to U+FFFD REPLACEMENT CHARACTER code points. | |
89 */ | 91 */ |
90 | 92 |
91 // mb_convert_encoding is chosen over iconv because of a bug. The best | 93 // mb_convert_encoding is chosen over iconv because of a bug. The best |
92 // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643 | 94 // details for the bug are on http://us1.php.net/manual/en/function.iconv.php#108643 |
93 // which contains links to the actual but reports as well as work around | 95 // which contains links to the actual but reports as well as work around |
104 // a little excessive and it would be great if there was a better way. | 106 // a little excessive and it would be great if there was a better way. |
105 $save = mb_substitute_character(); | 107 $save = mb_substitute_character(); |
106 mb_substitute_character('none'); | 108 mb_substitute_character('none'); |
107 $data = mb_convert_encoding($data, 'UTF-8', $encoding); | 109 $data = mb_convert_encoding($data, 'UTF-8', $encoding); |
108 mb_substitute_character($save); | 110 mb_substitute_character($save); |
109 } // @todo Get iconv running in at least some environments if that is possible. | 111 } |
112 // @todo Get iconv running in at least some environments if that is possible. | |
110 elseif (function_exists('iconv') && 'auto' !== $encoding) { | 113 elseif (function_exists('iconv') && 'auto' !== $encoding) { |
111 // fprintf(STDOUT, "iconv found\n"); | 114 // fprintf(STDOUT, "iconv found\n"); |
112 // iconv has the following behaviors: | 115 // iconv has the following behaviors: |
113 // - Overlong representations are ignored. | 116 // - Overlong representations are ignored. |
114 // - Beyond Plane 16 is replaced with a lower char. | 117 // - Beyond Plane 16 is replaced with a lower char. |
139 { | 142 { |
140 // Vestigal error handling. | 143 // Vestigal error handling. |
141 $errors = array(); | 144 $errors = array(); |
142 | 145 |
143 /* | 146 /* |
144 * All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such characters is a parse error. | 147 * All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs. |
148 * Any occurrences of such characters is a parse error. | |
145 */ | 149 */ |
146 for ($i = 0, $count = substr_count($data, "\0"); $i < $count; ++$i) { | 150 for ($i = 0, $count = substr_count($data, "\0"); $i < $count; ++$i) { |
147 $errors[] = 'null-character'; | 151 $errors[] = 'null-character'; |
148 } | 152 } |
149 | 153 |
150 /* | 154 /* |
151 * Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F to U+009F, U+D800 to U+DFFF , U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors. (These are all control characters or permanently undefined Unicode characters.) | 155 * Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F |
156 * to U+009F, U+D800 to U+DFFF , U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, | |
157 * U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, | |
158 * U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, | |
159 * U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors. | |
160 * (These are all control characters or permanently undefined Unicode characters.) | |
152 */ | 161 */ |
153 // Check PCRE is loaded. | 162 // Check PCRE is loaded. |
154 $count = preg_match_all( | 163 $count = preg_match_all( |
155 '/(?: | 164 '/(?: |
156 [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F | 165 [\x01-\x08\x0B\x0E-\x1F\x7F] # U+0001 to U+0008, U+000B, U+000E to U+001F and U+007F |