Mercurial > hg > isophonics-drupal-site
comparison vendor/masterminds/html5/src/HTML5/Parser/Scanner.php @ 17:129ea1e6d783
Update, including to Drupal core 8.6.10
author | Chris Cannam |
---|---|
date | Thu, 28 Feb 2019 13:21:36 +0000 |
parents | 4c8ae668cc8c |
children |
comparison
equal
deleted
inserted
replaced
16:c2387f117808 | 17:129ea1e6d783 |
---|---|
1 <?php | 1 <?php |
2 | |
2 namespace Masterminds\HTML5\Parser; | 3 namespace Masterminds\HTML5\Parser; |
3 | 4 |
5 use Masterminds\HTML5\Exception; | |
6 | |
4 /** | 7 /** |
5 * The scanner. | 8 * The scanner scans over a given data input to react appropriately to characters. |
6 * | |
7 * This scans over an input stream. | |
8 */ | 9 */ |
9 class Scanner | 10 class Scanner |
10 { | 11 { |
11 | |
12 const CHARS_HEX = 'abcdefABCDEF01234567890'; | 12 const CHARS_HEX = 'abcdefABCDEF01234567890'; |
13 | |
14 const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890'; | 13 const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890'; |
15 | |
16 const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'; | 14 const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'; |
17 | 15 |
18 protected $is; | 16 /** |
19 | 17 * The string data we're parsing. |
20 // Flipping this to true will give minisculely more debugging info. | 18 */ |
21 public $debug = false; | 19 private $data; |
20 | |
21 /** | |
22 * The current integer byte position we are in $data. | |
23 */ | |
24 private $char; | |
25 | |
26 /** | |
27 * Length of $data; when $char === $EOF, we are at the end-of-file. | |
28 */ | |
29 private $EOF; | |
30 | |
31 /** | |
32 * Parse errors. | |
33 */ | |
34 public $errors = array(); | |
22 | 35 |
23 /** | 36 /** |
24 * Create a new Scanner. | 37 * Create a new Scanner. |
25 * | 38 * |
26 * @param \Masterminds\HTML5\Parser\InputStream $input | 39 * @param string $data Data to parse. |
27 * An InputStream to be scanned. | 40 * @param string $encoding The encoding to use for the data. |
28 */ | 41 * |
29 public function __construct($input) | 42 * @throws Exception If the given data cannot be encoded to UTF-8. |
30 { | 43 */ |
31 $this->is = $input; | 44 public function __construct($data, $encoding = 'UTF-8') |
45 { | |
46 if ($data instanceof InputStream) { | |
47 @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED); | |
48 $data = (string) $data; | |
49 } | |
50 | |
51 $data = UTF8Utils::convertToUTF8($data, $encoding); | |
52 | |
53 // There is good reason to question whether it makes sense to | |
54 // do this here, since most of these checks are done during | |
55 // parsing, and since this check doesn't actually *do* anything. | |
56 $this->errors = UTF8Utils::checkForIllegalCodepoints($data); | |
57 | |
58 $data = $this->replaceLinefeeds($data); | |
59 | |
60 $this->data = $data; | |
61 $this->char = 0; | |
62 $this->EOF = strlen($data); | |
63 } | |
64 | |
65 /** | |
66 * Check if upcoming chars match the given sequence. | |
67 * | |
68 * This will read the stream for the $sequence. If it's | |
69 * found, this will return true. If not, return false. | |
70 * Since this unconsumes any chars it reads, the caller | |
71 * will still need to read the next sequence, even if | |
72 * this returns true. | |
73 * | |
74 * Example: $this->scanner->sequenceMatches('</script>') will | |
75 * see if the input stream is at the start of a | |
76 * '</script>' string. | |
77 * | |
78 * @param string $sequence | |
79 * @param bool $caseSensitive | |
80 * | |
81 * @return bool | |
82 */ | |
83 public function sequenceMatches($sequence, $caseSensitive = true) | |
84 { | |
85 $portion = substr($this->data, $this->char, strlen($sequence)); | |
86 | |
87 return $caseSensitive ? $portion === $sequence : 0 === strcasecmp($portion, $sequence); | |
32 } | 88 } |
33 | 89 |
34 /** | 90 /** |
35 * Get the current position. | 91 * Get the current position. |
36 * | 92 * |
37 * @return int The current integer byte position. | 93 * @return int The current integer byte position. |
38 */ | 94 */ |
39 public function position() | 95 public function position() |
40 { | 96 { |
41 return $this->is->key(); | 97 return $this->char; |
42 } | 98 } |
43 | 99 |
44 /** | 100 /** |
45 * Take a peek at the next character in the data. | 101 * Take a peek at the next character in the data. |
46 * | 102 * |
47 * @return string The next character. | 103 * @return string The next character. |
48 */ | 104 */ |
49 public function peek() | 105 public function peek() |
50 { | 106 { |
51 return $this->is->peek(); | 107 if (($this->char + 1) <= $this->EOF) { |
108 return $this->data[$this->char + 1]; | |
109 } | |
110 | |
111 return false; | |
52 } | 112 } |
53 | 113 |
54 /** | 114 /** |
55 * Get the next character. | 115 * Get the next character. |
56 * | |
57 * Note: This advances the pointer. | 116 * Note: This advances the pointer. |
58 * | 117 * |
59 * @return string The next character. | 118 * @return string The next character. |
60 */ | 119 */ |
61 public function next() | 120 public function next() |
62 { | 121 { |
63 $this->is->next(); | 122 ++$this->char; |
64 if ($this->is->valid()) { | 123 |
65 if ($this->debug) | 124 if ($this->char < $this->EOF) { |
66 fprintf(STDOUT, "> %s\n", $this->is->current()); | 125 return $this->data[$this->char]; |
67 return $this->is->current(); | |
68 } | 126 } |
69 | 127 |
70 return false; | 128 return false; |
71 } | 129 } |
72 | 130 |
73 /** | 131 /** |
74 * Get the current character. | 132 * Get the current character. |
75 * | |
76 * Note, this does not advance the pointer. | 133 * Note, this does not advance the pointer. |
77 * | 134 * |
78 * @return string The current character. | 135 * @return string The current character. |
79 */ | 136 */ |
80 public function current() | 137 public function current() |
81 { | 138 { |
82 if ($this->is->valid()) { | 139 if ($this->char < $this->EOF) { |
83 return $this->is->current(); | 140 return $this->data[$this->char]; |
84 } | 141 } |
85 | 142 |
86 return false; | 143 return false; |
87 } | 144 } |
88 | 145 |
89 /** | 146 /** |
90 * Silently consume N chars. | 147 * Silently consume N chars. |
148 * | |
149 * @param int $count | |
91 */ | 150 */ |
92 public function consume($count = 1) | 151 public function consume($count = 1) |
93 { | 152 { |
94 for ($i = 0; $i < $count; ++ $i) { | 153 $this->char += $count; |
95 $this->next(); | |
96 } | |
97 } | 154 } |
98 | 155 |
99 /** | 156 /** |
100 * Unconsume some of the data. | 157 * Unconsume some of the data. |
101 * This moves the data pointer backwards. | 158 * This moves the data pointer backwards. |
102 * | 159 * |
103 * @param int $howMany | 160 * @param int $howMany The number of characters to move the pointer back. |
104 * The number of characters to move the pointer back. | |
105 */ | 161 */ |
106 public function unconsume($howMany = 1) | 162 public function unconsume($howMany = 1) |
107 { | 163 { |
108 $this->is->unconsume($howMany); | 164 if (($this->char - $howMany) >= 0) { |
165 $this->char -= $howMany; | |
166 } | |
109 } | 167 } |
110 | 168 |
111 /** | 169 /** |
112 * Get the next group of characters that contains hex characters. | 170 * Get the next group of characters that contains hex characters. |
113 * | |
114 * Note, along with getting the characters the pointer in the data will be | 171 * Note, along with getting the characters the pointer in the data will be |
115 * moved as well. | 172 * moved as well. |
116 * | 173 * |
117 * @return string The next group that is hex characters. | 174 * @return string The next group that is hex characters. |
118 */ | 175 */ |
119 public function getHex() | 176 public function getHex() |
120 { | 177 { |
121 return $this->is->charsWhile(static::CHARS_HEX); | 178 return $this->doCharsWhile(static::CHARS_HEX); |
122 } | 179 } |
123 | 180 |
124 /** | 181 /** |
125 * Get the next group of characters that are ASCII Alpha characters. | 182 * Get the next group of characters that are ASCII Alpha characters. |
126 * | |
127 * Note, along with getting the characters the pointer in the data will be | 183 * Note, along with getting the characters the pointer in the data will be |
128 * moved as well. | 184 * moved as well. |
129 * | 185 * |
130 * @return string The next group of ASCII alpha characters. | 186 * @return string The next group of ASCII alpha characters. |
131 */ | 187 */ |
132 public function getAsciiAlpha() | 188 public function getAsciiAlpha() |
133 { | 189 { |
134 return $this->is->charsWhile(static::CHARS_ALPHA); | 190 return $this->doCharsWhile(static::CHARS_ALPHA); |
135 } | 191 } |
136 | 192 |
137 /** | 193 /** |
138 * Get the next group of characters that are ASCII Alpha characters and numbers. | 194 * Get the next group of characters that are ASCII Alpha characters and numbers. |
139 * | |
140 * Note, along with getting the characters the pointer in the data will be | 195 * Note, along with getting the characters the pointer in the data will be |
141 * moved as well. | 196 * moved as well. |
142 * | 197 * |
143 * @return string The next group of ASCII alpha characters and numbers. | 198 * @return string The next group of ASCII alpha characters and numbers. |
144 */ | 199 */ |
145 public function getAsciiAlphaNum() | 200 public function getAsciiAlphaNum() |
146 { | 201 { |
147 return $this->is->charsWhile(static::CHARS_ALNUM); | 202 return $this->doCharsWhile(static::CHARS_ALNUM); |
148 } | 203 } |
149 | 204 |
150 /** | 205 /** |
151 * Get the next group of numbers. | 206 * Get the next group of numbers. |
152 * | |
153 * Note, along with getting the characters the pointer in the data will be | 207 * Note, along with getting the characters the pointer in the data will be |
154 * moved as well. | 208 * moved as well. |
155 * | 209 * |
156 * @return string The next group of numbers. | 210 * @return string The next group of numbers. |
157 */ | 211 */ |
158 public function getNumeric() | 212 public function getNumeric() |
159 { | 213 { |
160 return $this->is->charsWhile('0123456789'); | 214 return $this->doCharsWhile('0123456789'); |
161 } | 215 } |
162 | 216 |
163 /** | 217 /** |
164 * Consume whitespace. | 218 * Consume whitespace. |
165 * | |
166 * Whitespace in HTML5 is: formfeed, tab, newline, space. | 219 * Whitespace in HTML5 is: formfeed, tab, newline, space. |
220 * | |
221 * @return int The length of the matched whitespaces. | |
167 */ | 222 */ |
168 public function whitespace() | 223 public function whitespace() |
169 { | 224 { |
170 return $this->is->charsWhile("\n\t\f "); | 225 if ($this->char >= $this->EOF) { |
226 return false; | |
227 } | |
228 | |
229 $len = strspn($this->data, "\n\t\f ", $this->char); | |
230 | |
231 $this->char += $len; | |
232 | |
233 return $len; | |
171 } | 234 } |
172 | 235 |
173 /** | 236 /** |
174 * Returns the current line that is being consumed. | 237 * Returns the current line that is being consumed. |
175 * | 238 * |
176 * @return int The current line number. | 239 * @return int The current line number. |
177 */ | 240 */ |
178 public function currentLine() | 241 public function currentLine() |
179 { | 242 { |
180 return $this->is->currentLine(); | 243 if (empty($this->EOF) || 0 === $this->char) { |
244 return 1; | |
245 } | |
246 | |
247 // Count the newlines before the current byte position and add one, | |
248 // because line numbers are 1-based. | |
249 return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1; | |
181 } | 250 } |
182 | 251 |
183 /** | 252 /** |
184 * Read chars until something in the mask is encountered. | 253 * Read chars until something in the mask is encountered. |
254 * | |
255 * @param string $mask | |
256 * | |
257 * @return mixed | |
185 */ | 258 */ |
186 public function charsUntil($mask) | 259 public function charsUntil($mask) |
187 { | 260 { |
188 return $this->is->charsUntil($mask); | 261 return $this->doCharsUntil($mask); |
189 } | 262 } |
190 | 263 |
191 /** | 264 /** |
192 * Read chars as long as the mask matches. | 265 * Read chars as long as the mask matches. |
266 * | |
267 * @param string $mask | |
268 * | |
269 * @return int | |
193 */ | 270 */ |
194 public function charsWhile($mask) | 271 public function charsWhile($mask) |
195 { | 272 { |
196 return $this->is->charsWhile($mask); | 273 return $this->doCharsWhile($mask); |
197 } | 274 } |
198 | 275 |
199 /** | 276 /** |
200 * Returns the current column of the current line that the tokenizer is at. | 277 * Returns the current column of the current line that the tokenizer is at. |
201 * | 278 * |
203 * | 280 * |
204 * @return int The column number. | 281 * @return int The column number. |
205 */ | 282 */ |
206 public function columnOffset() | 283 public function columnOffset() |
207 { | 284 { |
208 return $this->is->columnOffset(); | 285 // Short circuit for the first char. |
286 if (0 === $this->char) { | |
287 return 0; | |
288 } | |
289 | |
290 // strrpos is weird, and the offset needs to be negative for what we | |
291 // want (i.e., the last \n before $this->char). This needs to not have | |
292 // one (to make it point to the next character, the one we want the | |
293 // position of) added to it because strrpos's behaviour includes the | |
294 // final offset byte. | |
295 $backwardFrom = $this->char - 1 - strlen($this->data); | |
296 $lastLine = strrpos($this->data, "\n", $backwardFrom); | |
297 | |
298 // However, for here we want the length up until the next byte to be | |
299 // processed, so add one to the current byte ($this->char). | |
300 if (false !== $lastLine) { | |
301 $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine); | |
302 } else { | |
303 // After a newline. | |
304 $findLengthOf = substr($this->data, 0, $this->char); | |
305 } | |
306 | |
307 return UTF8Utils::countChars($findLengthOf); | |
209 } | 308 } |
210 | 309 |
211 /** | 310 /** |
212 * Get all characters until EOF. | 311 * Get all characters until EOF. |
213 * | 312 * |
215 * | 314 * |
216 * @return int The number of characters remaining. | 315 * @return int The number of characters remaining. |
217 */ | 316 */ |
218 public function remainingChars() | 317 public function remainingChars() |
219 { | 318 { |
220 return $this->is->remainingChars(); | 319 if ($this->char < $this->EOF) { |
320 $data = substr($this->data, $this->char); | |
321 $this->char = $this->EOF; | |
322 | |
323 return $data; | |
324 } | |
325 | |
326 return ''; // false; | |
327 } | |
328 | |
329 /** | |
330 * Replace linefeed characters according to the spec. | |
331 * | |
332 * @param $data | |
333 * | |
334 * @return string | |
335 */ | |
336 private function replaceLinefeeds($data) | |
337 { | |
338 /* | |
339 * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially. | |
340 * Any CR characters that are followed by LF characters must be removed, and any CR characters not | |
341 * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are | |
342 * represented by LF characters, and there are never any CR characters in the input to the tokenization | |
343 * stage. | |
344 */ | |
345 $crlfTable = array( | |
346 "\0" => "\xEF\xBF\xBD", | |
347 "\r\n" => "\n", | |
348 "\r" => "\n", | |
349 ); | |
350 | |
351 return strtr($data, $crlfTable); | |
352 } | |
353 | |
354 /** | |
355 * Read to a particular match (or until $max bytes are consumed). | |
356 * | |
357 * This operates on byte sequences, not characters. | |
358 * | |
359 * Matches as far as possible until we reach a certain set of bytes | |
360 * and returns the matched substring. | |
361 * | |
362 * @param string $bytes Bytes to match. | |
363 * @param int $max Maximum number of bytes to scan. | |
364 * | |
365 * @return mixed Index or false if no match is found. You should use strong | |
366 * equality when checking the result, since index could be 0. | |
367 */ | |
368 private function doCharsUntil($bytes, $max = null) | |
369 { | |
370 if ($this->char >= $this->EOF) { | |
371 return false; | |
372 } | |
373 | |
374 if (0 === $max || $max) { | |
375 $len = strcspn($this->data, $bytes, $this->char, $max); | |
376 } else { | |
377 $len = strcspn($this->data, $bytes, $this->char); | |
378 } | |
379 | |
380 $string = (string) substr($this->data, $this->char, $len); | |
381 $this->char += $len; | |
382 | |
383 return $string; | |
384 } | |
385 | |
386 /** | |
387 * Returns the string so long as $bytes matches. | |
388 * | |
389 * Matches as far as possible with a certain set of bytes | |
390 * and returns the matched substring. | |
391 * | |
392 * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the | |
393 * current char, the pointer advances and the char is part of the | |
394 * substring. | |
395 * @param int $max The max number of chars to read. | |
396 * | |
397 * @return string | |
398 */ | |
399 private function doCharsWhile($bytes, $max = null) | |
400 { | |
401 if ($this->char >= $this->EOF) { | |
402 return false; | |
403 } | |
404 | |
405 if (0 === $max || $max) { | |
406 $len = strspn($this->data, $bytes, $this->char, $max); | |
407 } else { | |
408 $len = strspn($this->data, $bytes, $this->char); | |
409 } | |
410 | |
411 $string = (string) substr($this->data, $this->char, $len); | |
412 $this->char += $len; | |
413 | |
414 return $string; | |
221 } | 415 } |
222 } | 416 } |