Mercurial > hg > isophonics-drupal-site
comparison vendor/masterminds/html5/src/HTML5/Parser/UTF8Utils.php @ 17:129ea1e6d783
Update, including to Drupal core 8.6.10
author | Chris Cannam |
---|---|
date | Thu, 28 Feb 2019 13:21:36 +0000 |
parents | 4c8ae668cc8c |
children | af1871eacc83 |
comparison
equal
deleted
inserted
replaced
16:c2387f117808 | 17:129ea1e6d783 |
---|---|
1 <?php | 1 <?php |
2 | |
2 namespace Masterminds\HTML5\Parser; | 3 namespace Masterminds\HTML5\Parser; |
4 | |
3 /* | 5 /* |
4 * | 6 * |
5 * Portions based on code from html5lib files with the following copyright: | 7 * Portions based on code from html5lib files with the following copyright: |
6 | 8 |
7 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/> | 9 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/> |
24 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, | 26 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
25 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE | 27 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
26 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | 28 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
27 | 29 |
28 */ | 30 */ |
31 | |
32 use Masterminds\HTML5\Exception; | |
33 | |
29 /** | 34 /** |
30 * UTF-8 Utilities | 35 * UTF-8 Utilities. |
31 */ | 36 */ |
32 class UTF8Utils | 37 class UTF8Utils |
33 { | 38 { |
34 | |
35 /** | 39 /** |
36 * The Unicode replacement character. | 40 * The Unicode replacement character. |
37 */ | 41 */ |
38 const FFFD = "\xEF\xBF\xBD"; | 42 const FFFD = "\xEF\xBF\xBD"; |
39 | 43 |
42 * | 46 * |
43 * UTF-8 aware. This will try (in order) iconv, | 47 * UTF-8 aware. This will try (in order) iconv, |
44 * MB, libxml, and finally a custom counter. | 48 * MB, libxml, and finally a custom counter. |
45 * | 49 * |
46 * @todo Move this to a general utility class. | 50 * @todo Move this to a general utility class. |
51 * | |
52 * @param string $string | |
53 * | |
54 * @return int | |
47 */ | 55 */ |
48 public static function countChars($string) | 56 public static function countChars($string) |
49 { | 57 { |
50 // Get the length for the string we need. | 58 // Get the length for the string we need. |
51 if (function_exists('mb_strlen')) { | 59 if (function_exists('mb_strlen')) { |
67 * Convert data from the given encoding to UTF-8. | 75 * Convert data from the given encoding to UTF-8. |
68 * | 76 * |
69 * This has not yet been tested with character sets other than UTF-8. | 77 * This has not yet been tested with character sets other than UTF-8. |
70 * It should work with ISO-8859-1/-13 and standard Latin Win charsets. | 78 * It should work with ISO-8859-1/-13 and standard Latin Win charsets. |
71 * | 79 * |
72 * @param string $data | 80 * @param string $data The data to convert |
73 * The data to convert. | 81 * @param string $encoding A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php |
74 * @param string $encoding | 82 * |
75 * A valid encoding. Examples: http://www.php.net/manual/en/mbstring.supported-encodings.php | 83 * @return string |
76 */ | 84 */ |
77 public static function convertToUTF8($data, $encoding = 'UTF-8') | 85 public static function convertToUTF8($data, $encoding = 'UTF-8') |
78 { | 86 { |
79 /* | 87 /* |
80 * From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted to Unicode characters for the tokeniser, as described by the rules for that encoding, except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes in the original byte stream that could not be converted to Unicode characters must be converted to U+FFFD REPLACEMENT CHARACTER code points. | 88 * From the HTML5 spec: Given an encoding, the bytes in the input stream must be converted to Unicode characters for the tokeniser, as described by the rules for that encoding, except that the leading U+FEFF BYTE ORDER MARK character, if any, must not be stripped by the encoding layer (it is stripped by the rule below). Bytes or sequences of bytes in the original byte stream that could not be converted to Unicode characters must be converted to U+FFFD REPLACEMENT CHARACTER code points. |
97 $save = mb_substitute_character(); | 105 $save = mb_substitute_character(); |
98 mb_substitute_character('none'); | 106 mb_substitute_character('none'); |
99 $data = mb_convert_encoding($data, 'UTF-8', $encoding); | 107 $data = mb_convert_encoding($data, 'UTF-8', $encoding); |
100 mb_substitute_character($save); | 108 mb_substitute_character($save); |
101 } // @todo Get iconv running in at least some environments if that is possible. | 109 } // @todo Get iconv running in at least some environments if that is possible. |
102 elseif (function_exists('iconv') && $encoding != 'auto') { | 110 elseif (function_exists('iconv') && 'auto' !== $encoding) { |
103 // fprintf(STDOUT, "iconv found\n"); | 111 // fprintf(STDOUT, "iconv found\n"); |
104 // iconv has the following behaviors: | 112 // iconv has the following behaviors: |
105 // - Overlong representations are ignored. | 113 // - Overlong representations are ignored. |
106 // - Beyond Plane 16 is replaced with a lower char. | 114 // - Beyond Plane 16 is replaced with a lower char. |
107 // - Incomplete sequences generate a warning. | 115 // - Incomplete sequences generate a warning. |
108 $data = @iconv($encoding, 'UTF-8//IGNORE', $data); | 116 $data = @iconv($encoding, 'UTF-8//IGNORE', $data); |
109 } else { | 117 } else { |
110 // we can make a conforming native implementation | |
111 throw new Exception('Not implemented, please install mbstring or iconv'); | 118 throw new Exception('Not implemented, please install mbstring or iconv'); |
112 } | 119 } |
113 | 120 |
114 /* | 121 /* |
115 * One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present. | 122 * One leading U+FEFF BYTE ORDER MARK character must be ignored if any are present. |
116 */ | 123 */ |
117 if (substr($data, 0, 3) === "\xEF\xBB\xBF") { | 124 if ("\xEF\xBB\xBF" === substr($data, 0, 3)) { |
118 $data = substr($data, 3); | 125 $data = substr($data, 3); |
119 } | 126 } |
120 | 127 |
121 return $data; | 128 return $data; |
122 } | 129 } |
123 | 130 |
124 /** | 131 /** |
125 * Checks for Unicode code points that are not valid in a document. | 132 * Checks for Unicode code points that are not valid in a document. |
126 * | 133 * |
127 * @param string $data | 134 * @param string $data A string to analyze |
128 * A string to analyze. | 135 * |
129 * @return array An array of (string) error messages produced by the scanning. | 136 * @return array An array of (string) error messages produced by the scanning |
130 */ | 137 */ |
131 public static function checkForIllegalCodepoints($data) | 138 public static function checkForIllegalCodepoints($data) |
132 { | 139 { |
133 if (! function_exists('preg_match_all')) { | |
134 throw\Exception('The PCRE library is not loaded or is not available.'); | |
135 } | |
136 | |
137 // Vestigial error handling. | 140 // Vestigial error handling. |
138 $errors = array(); | 141 $errors = array(); |
139 | 142 |
140 /* | 143 /* |
141 * All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such characters is a parse error. | 144 * All U+0000 null characters in the input must be replaced by U+FFFD REPLACEMENT CHARACTERs. Any occurrences of such characters is a parse error. |
142 */ | 145 */ |
143 for ($i = 0, $count = substr_count($data, "\0"); $i < $count; $i ++) { | 146 for ($i = 0, $count = substr_count($data, "\0"); $i < $count; ++$i) { |
144 $errors[] = 'null-character'; | 147 $errors[] = 'null-character'; |
145 } | 148 } |
146 | 149 |
147 /* | 150 /* |
148 * Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F to U+009F, U+D800 to U+DFFF , U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors. (These are all control characters or permanently undefined Unicode characters.) | 151 * Any occurrences of any characters in the ranges U+0001 to U+0008, U+000B, U+000E to U+001F, U+007F to U+009F, U+D800 to U+DFFF , U+FDD0 to U+FDEF, and characters U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, U+FFFFE, U+FFFFF, U+10FFFE, and U+10FFFF are parse errors. (These are all control characters or permanently undefined Unicode characters.) |
160 | | 163 | |
161 \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF | 164 \xEF\xBF[\xBE\xBF] # U+FFFE and U+FFFF |
162 | | 165 | |
163 [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16}) | 166 [\xF0-\xF4][\x8F-\xBF]\xBF[\xBE\xBF] # U+nFFFE and U+nFFFF (1 <= n <= 10_{16}) |
164 )/x', $data, $matches); | 167 )/x', $data, $matches); |
165 for ($i = 0; $i < $count; $i ++) { | 168 for ($i = 0; $i < $count; ++$i) { |
166 $errors[] = 'invalid-codepoint'; | 169 $errors[] = 'invalid-codepoint'; |
167 } | 170 } |
168 | 171 |
169 return $errors; | 172 return $errors; |
170 } | 173 } |