comparison vendor/masterminds/html5/src/HTML5/Parser/Scanner.php @ 17:129ea1e6d783

Update, including to Drupal core 8.6.10
author Chris Cannam
date Thu, 28 Feb 2019 13:21:36 +0000
parents 4c8ae668cc8c
children
comparison
equal deleted inserted replaced
16:c2387f117808 17:129ea1e6d783
1 <?php 1 <?php
2
2 namespace Masterminds\HTML5\Parser; 3 namespace Masterminds\HTML5\Parser;
3 4
5 use Masterminds\HTML5\Exception;
6
4 /** 7 /**
5 * The scanner. 8 * The scanner scans over a given data input to react appropriately to characters.
6 *
7 * This scans over an input stream.
8 */ 9 */
9 class Scanner 10 class Scanner
10 { 11 {
11
12 const CHARS_HEX = 'abcdefABCDEF01234567890'; 12 const CHARS_HEX = 'abcdefABCDEF01234567890';
13
14 const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890'; 13 const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
15
16 const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'; 14 const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
17 15
18 protected $is; 16 /**
19 17 * The string data we're parsing.
20 // Flipping this to true will give minisculely more debugging info. 18 */
21 public $debug = false; 19 private $data;
20
21 /**
22 * The current integer byte position we are in $data.
23 */
24 private $char;
25
26 /**
27 * Length of $data; when $char === $EOF, we are at the end-of-file.
28 */
29 private $EOF;
30
31 /**
32 * Parse errors.
33 */
34 public $errors = array();
22 35
23 /** 36 /**
24 * Create a new Scanner. 37 * Create a new Scanner.
25 * 38 *
26 * @param \Masterminds\HTML5\Parser\InputStream $input 39 * @param string $data Data to parse.
27 * An InputStream to be scanned. 40 * @param string $encoding The encoding to use for the data.
28 */ 41 *
29 public function __construct($input) 42 * @throws Exception If the given data cannot be encoded to UTF-8.
30 { 43 */
31 $this->is = $input; 44 public function __construct($data, $encoding = 'UTF-8')
45 {
46 if ($data instanceof InputStream) {
47 @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED);
48 $data = (string) $data;
49 }
50
51 $data = UTF8Utils::convertToUTF8($data, $encoding);
52
53 // There is good reason to question whether it makes sense to
54 // do this here, since most of these checks are done during
55 // parsing, and since this check doesn't actually *do* anything.
56 $this->errors = UTF8Utils::checkForIllegalCodepoints($data);
57
58 $data = $this->replaceLinefeeds($data);
59
60 $this->data = $data;
61 $this->char = 0;
62 $this->EOF = strlen($data);
63 }
64
65 /**
66 * Check if upcoming chars match the given sequence.
67 *
68 * This will read the stream for the $sequence. If it's
69 * found, this will return true. If not, return false.
70 * Since this unconsumes any chars it reads, the caller
71 * will still need to read the next sequence, even if
72 * this returns true.
73 *
74 * Example: $this->scanner->sequenceMatches('</script>') will
75 * see if the input stream is at the start of a
76 * '</script>' string.
77 *
78 * @param string $sequence
79 * @param bool $caseSensitive
80 *
81 * @return bool
82 */
83 public function sequenceMatches($sequence, $caseSensitive = true)
84 {
85 $portion = substr($this->data, $this->char, strlen($sequence));
86
87 return $caseSensitive ? $portion === $sequence : 0 === strcasecmp($portion, $sequence);
32 } 88 }
33 89
34 /** 90 /**
35 * Get the current position. 91 * Get the current position.
36 * 92 *
37 * @return int The current integer byte position. 93 * @return int The current integer byte position.
38 */ 94 */
39 public function position() 95 public function position()
40 { 96 {
41 return $this->is->key(); 97 return $this->char;
42 } 98 }
43 99
44 /** 100 /**
45 * Take a peek at the next character in the data. 101 * Take a peek at the next character in the data.
46 * 102 *
47 * @return string The next character. 103 * @return string The next character.
48 */ 104 */
49 public function peek() 105 public function peek()
50 { 106 {
51 return $this->is->peek(); 107 if (($this->char + 1) <= $this->EOF) {
108 return $this->data[$this->char + 1];
109 }
110
111 return false;
52 } 112 }
53 113
54 /** 114 /**
55 * Get the next character. 115 * Get the next character.
56 *
57 * Note: This advances the pointer. 116 * Note: This advances the pointer.
58 * 117 *
59 * @return string The next character. 118 * @return string The next character.
60 */ 119 */
61 public function next() 120 public function next()
62 { 121 {
63 $this->is->next(); 122 ++$this->char;
64 if ($this->is->valid()) { 123
65 if ($this->debug) 124 if ($this->char < $this->EOF) {
66 fprintf(STDOUT, "> %s\n", $this->is->current()); 125 return $this->data[$this->char];
67 return $this->is->current();
68 } 126 }
69 127
70 return false; 128 return false;
71 } 129 }
72 130
73 /** 131 /**
74 * Get the current character. 132 * Get the current character.
75 *
76 * Note, this does not advance the pointer. 133 * Note, this does not advance the pointer.
77 * 134 *
78 * @return string The current character. 135 * @return string The current character.
79 */ 136 */
80 public function current() 137 public function current()
81 { 138 {
82 if ($this->is->valid()) { 139 if ($this->char < $this->EOF) {
83 return $this->is->current(); 140 return $this->data[$this->char];
84 } 141 }
85 142
86 return false; 143 return false;
87 } 144 }
88 145
89 /** 146 /**
90 * Silently consume N chars. 147 * Silently consume N chars.
148 *
149 * @param int $count
91 */ 150 */
92 public function consume($count = 1) 151 public function consume($count = 1)
93 { 152 {
94 for ($i = 0; $i < $count; ++ $i) { 153 $this->char += $count;
95 $this->next();
96 }
97 } 154 }
98 155
99 /** 156 /**
100 * Unconsume some of the data. 157 * Unconsume some of the data.
101 * This moves the data pointer backwards. 158 * This moves the data pointer backwards.
102 * 159 *
103 * @param int $howMany 160 * @param int $howMany The number of characters to move the pointer back.
104 * The number of characters to move the pointer back.
105 */ 161 */
106 public function unconsume($howMany = 1) 162 public function unconsume($howMany = 1)
107 { 163 {
108 $this->is->unconsume($howMany); 164 if (($this->char - $howMany) >= 0) {
165 $this->char -= $howMany;
166 }
109 } 167 }
110 168
111 /** 169 /**
112 * Get the next group of hex characters. 170 * Get the next group of hex characters.
113 *
114 * Note, along with getting the characters the pointer in the data will be 171 * Note, along with getting the characters the pointer in the data will be
115 * moved as well. 172 * moved as well.
116 * 173 *
117 * @return string The next group that is hex characters. 174 * @return string The next group that is hex characters.
118 */ 175 */
119 public function getHex() 176 public function getHex()
120 { 177 {
121 return $this->is->charsWhile(static::CHARS_HEX); 178 return $this->doCharsWhile(static::CHARS_HEX);
122 } 179 }
123 180
124 /** 181 /**
125 * Get the next group of characters that are ASCII Alpha characters. 182 * Get the next group of characters that are ASCII Alpha characters.
126 *
127 * Note, along with getting the characters the pointer in the data will be 183 * Note, along with getting the characters the pointer in the data will be
128 * moved as well. 184 * moved as well.
129 * 185 *
130 * @return string The next group of ASCII alpha characters. 186 * @return string The next group of ASCII alpha characters.
131 */ 187 */
132 public function getAsciiAlpha() 188 public function getAsciiAlpha()
133 { 189 {
134 return $this->is->charsWhile(static::CHARS_ALPHA); 190 return $this->doCharsWhile(static::CHARS_ALPHA);
135 } 191 }
136 192
137 /** 193 /**
138 * Get the next group of characters that are ASCII Alpha characters and numbers. 194 * Get the next group of characters that are ASCII Alpha characters and numbers.
139 *
140 * Note, along with getting the characters the pointer in the data will be 195 * Note, along with getting the characters the pointer in the data will be
141 * moved as well. 196 * moved as well.
142 * 197 *
143 * @return string The next group of ASCII alpha characters and numbers. 198 * @return string The next group of ASCII alpha characters and numbers.
144 */ 199 */
145 public function getAsciiAlphaNum() 200 public function getAsciiAlphaNum()
146 { 201 {
147 return $this->is->charsWhile(static::CHARS_ALNUM); 202 return $this->doCharsWhile(static::CHARS_ALNUM);
148 } 203 }
149 204
150 /** 205 /**
151 * Get the next group of numbers. 206 * Get the next group of numbers.
152 *
153 * Note, along with getting the characters the pointer in the data will be 207 * Note, along with getting the characters the pointer in the data will be
154 * moved as well. 208 * moved as well.
155 * 209 *
156 * @return string The next group of numbers. 210 * @return string The next group of numbers.
157 */ 211 */
158 public function getNumeric() 212 public function getNumeric()
159 { 213 {
160 return $this->is->charsWhile('0123456789'); 214 return $this->doCharsWhile('0123456789');
161 } 215 }
162 216
163 /** 217 /**
164 * Consume whitespace. 218 * Consume whitespace.
165 *
166 * Whitespace in HTML5 is: formfeed, tab, newline, space. 219 * Whitespace in HTML5 is: formfeed, tab, newline, space.
220 *
221 * @return int|false The number of whitespace bytes consumed, or false if already at EOF.
167 */ 222 */
168 public function whitespace() 223 public function whitespace()
169 { 224 {
170 return $this->is->charsWhile("\n\t\f "); 225 if ($this->char >= $this->EOF) {
226 return false;
227 }
228
229 $len = strspn($this->data, "\n\t\f ", $this->char);
230
231 $this->char += $len;
232
233 return $len;
171 } 234 }
172 235
173 /** 236 /**
174 * Returns the current line that is being consumed. 237 * Returns the current line that is being consumed.
175 * 238 *
176 * @return int The current line number. 239 * @return int The current line number.
177 */ 240 */
178 public function currentLine() 241 public function currentLine()
179 { 242 {
180 return $this->is->currentLine(); 243 if (empty($this->EOF) || 0 === $this->char) {
244 return 1;
245 }
246
247 // Count the newlines that precede the current position (clamped to
248 // EOF) and add one, because line numbers are 1-based.
249 return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
181 } 250 }
182 251
183 /** 252 /**
184 * Read chars until something in the mask is encountered. 253 * Read chars until something in the mask is encountered.
254 *
255 * @param string $mask
256 *
257 * @return mixed
185 */ 258 */
186 public function charsUntil($mask) 259 public function charsUntil($mask)
187 { 260 {
188 return $this->is->charsUntil($mask); 261 return $this->doCharsUntil($mask);
189 } 262 }
190 263
191 /** 264 /**
192 * Read chars as long as the mask matches. 265 * Read chars as long as the mask matches.
266 *
267 * @param string $mask
268 *
269 * @return string|false The matched characters, or false if already at EOF.
193 */ 270 */
194 public function charsWhile($mask) 271 public function charsWhile($mask)
195 { 272 {
196 return $this->is->charsWhile($mask); 273 return $this->doCharsWhile($mask);
197 } 274 }
198 275
199 /** 276 /**
200 * Returns the current column of the current line that the tokenizer is at. 277 * Returns the current column of the current line that the tokenizer is at.
201 * 278 *
203 * 280 *
204 * @return int The column number. 281 * @return int The column number.
205 */ 282 */
206 public function columnOffset() 283 public function columnOffset()
207 { 284 {
208 return $this->is->columnOffset(); 285 // Short circuit for the first char.
286 if (0 === $this->char) {
287 return 0;
288 }
289
290 // strrpos is weird, and the offset needs to be negative for what we
291 // want (i.e., the last \n before $this->char). This needs to not have
292 // one (to make it point to the next character, the one we want the
293 // position of) added to it because strrpos's behaviour includes the
294 // final offset byte.
295 $backwardFrom = $this->char - 1 - strlen($this->data);
296 $lastLine = strrpos($this->data, "\n", $backwardFrom);
297
298 // However, for here we want the length up until the next byte to be
299 // processed, so add one to the current byte ($this->char).
300 if (false !== $lastLine) {
301 $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine);
302 } else {
303 // No preceding newline: we are still on the first line.
304 $findLengthOf = substr($this->data, 0, $this->char);
305 }
306
307 return UTF8Utils::countChars($findLengthOf);
209 } 308 }
210 309
211 /** 310 /**
212 * Get all characters until EOF. 311 * Get all characters until EOF.
213 * 312 *
215 * 314 *
216 * @return int The number of characters remaining. 315 * @return int The number of characters remaining.
217 */ 316 */
218 public function remainingChars() 317 public function remainingChars()
219 { 318 {
220 return $this->is->remainingChars(); 319 if ($this->char < $this->EOF) {
320 $data = substr($this->data, $this->char);
321 $this->char = $this->EOF;
322
323 return $data;
324 }
325
326 return ''; // false;
327 }
328
329 /**
330 * Replace linefeed characters according to the spec.
331 *
332 * @param $data
333 *
334 * @return string
335 */
336 private function replaceLinefeeds($data)
337 {
338 /*
339 * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
340 * Any CR characters that are followed by LF characters must be removed, and any CR characters not
341 * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
342 * represented by LF characters, and there are never any CR characters in the input to the tokenization
343 * stage.
344 */
345 $crlfTable = array(
346 "\0" => "\xEF\xBF\xBD",
347 "\r\n" => "\n",
348 "\r" => "\n",
349 );
350
351 return strtr($data, $crlfTable);
352 }
353
354 /**
355 * Read to a particular match (or until $max bytes are consumed).
356 *
357 * This operates on byte sequences, not characters.
358 *
359 * Matches as far as possible until we reach a certain set of bytes
360 * and returns the matched substring.
361 *
362 * @param string $bytes Bytes to match.
363 * @param int $max Maximum number of bytes to scan.
364 *
365 * @return string|false The consumed bytes (possibly an empty string), or false if already
366 * at EOF. Use strict comparison, since '' and '0' are also falsy.
367 */
368 private function doCharsUntil($bytes, $max = null)
369 {
370 if ($this->char >= $this->EOF) {
371 return false;
372 }
373
374 if (0 === $max || $max) {
375 $len = strcspn($this->data, $bytes, $this->char, $max);
376 } else {
377 $len = strcspn($this->data, $bytes, $this->char);
378 }
379
380 $string = (string) substr($this->data, $this->char, $len);
381 $this->char += $len;
382
383 return $string;
384 }
385
386 /**
387 * Returns the string so long as $bytes matches.
388 *
389 * Matches as far as possible with a certain set of bytes
390 * and returns the matched substring.
391 *
392 * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the
393 * current char, the pointer advances and the char is part of the
394 * substring.
395 * @param int $max The max number of chars to read.
396 *
397 * @return string|false The matched bytes, or false if already at EOF.
398 */
399 private function doCharsWhile($bytes, $max = null)
400 {
401 if ($this->char >= $this->EOF) {
402 return false;
403 }
404
405 if (0 === $max || $max) {
406 $len = strspn($this->data, $bytes, $this->char, $max);
407 } else {
408 $len = strspn($this->data, $bytes, $this->char);
409 }
410
411 $string = (string) substr($this->data, $this->char, $len);
412 $this->char += $len;
413
414 return $string;
221 } 415 }
222 } 416 }