Chris@0
|
1 <?php
|
Chris@17
|
2
|
Chris@0
|
3 namespace Masterminds\HTML5\Parser;
|
Chris@0
|
4
|
Chris@17
|
5 use Masterminds\HTML5\Exception;
|
Chris@17
|
6
|
/**
 * The scanner scans over a given data input to react appropriately to characters.
 *
 * The whole input is held as a single string together with an integer byte
 * cursor; the methods below read at, move, or report on that cursor.
 */
class Scanner
{
    // Byte masks handed to strspn() by the get*() helpers below. The repeated
    // trailing "0" is redundant for a mask but harmless.
    const CHARS_HEX = 'abcdefABCDEF01234567890';
    const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
    const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are in $data.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char >= $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors.
     */
    public $errors = array();

    /**
     * Create a new Scanner.
     *
     * The input is converted to UTF-8, checked for illegal codepoints (the
     * result is stored in $errors) and has its line endings normalized before
     * the cursor is placed on the first byte.
     *
     * @param string $data     Data to parse.
     * @param string $encoding The encoding to use for the data.
     *
     * @throws Exception If the given data cannot be encoded to UTF-8
     *                   (presumably raised by UTF8Utils::convertToUTF8()).
     */
    public function __construct($data, $encoding = 'UTF-8')
    {
        if ($data instanceof InputStream) {
            @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED);
            $data = (string) $data;
        }

        $data = UTF8Utils::convertToUTF8($data, $encoding);

        // There is good reason to question whether it makes sense to
        // do this here, since most of these checks are done during
        // parsing, and since this check doesn't actually *do* anything.
        $this->errors = UTF8Utils::checkForIllegalCodepoints($data);

        $data = $this->replaceLinefeeds($data);

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Check if upcoming chars match the given sequence.
     *
     * This compares the bytes at the current position with $sequence. If they
     * match, this returns true. If not, it returns false.
     * The pointer is never moved, so the caller will still need to consume the
     * sequence itself, even if this returns true.
     *
     * Example: $this->scanner->sequenceMatches('</script>') will
     * see if the input stream is at the start of a
     * '</script>' string.
     *
     * @param string $sequence
     * @param bool   $caseSensitive
     *
     * @return bool
     */
    public function sequenceMatches($sequence, $caseSensitive = true)
    {
        $portion = substr($this->data, $this->char, strlen($sequence));

        return $caseSensitive ? $portion === $sequence : 0 === strcasecmp($portion, $sequence);
    }

    /**
     * Get the current position.
     *
     * @return int The current integer byte position.
     */
    public function position()
    {
        return $this->char;
    }

    /**
     * Take a peek at the next character in the data.
     * Note, this does not advance the pointer.
     *
     * @return string|false The byte after the current one, or false when
     *                      there is none.
     */
    public function peek()
    {
        // Strictly less than: index $EOF is one past the last byte, so
        // allowing equality here would read an undefined string offset
        // when the pointer sits on the final byte.
        if (($this->char + 1) < $this->EOF) {
            return $this->data[$this->char + 1];
        }

        return false;
    }

    /**
     * Get the next character.
     * Note: This advances the pointer.
     *
     * @return string|false The next byte, or false once the pointer has
     *                      moved to or past the end of the data.
     */
    public function next()
    {
        ++$this->char;

        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Get the current character.
     * Note, this does not advance the pointer.
     *
     * @return string|false The current byte, or false at end-of-file.
     */
    public function current()
    {
        if ($this->char < $this->EOF) {
            return $this->data[$this->char];
        }

        return false;
    }

    /**
     * Silently consume N chars.
     *
     * The pointer is moved without any bounds check, so it may end up past
     * the end of the data.
     *
     * @param int $count
     */
    public function consume($count = 1)
    {
        $this->char += $count;
    }

    /**
     * Unconsume some of the data.
     * This moves the data pointer backwards.
     *
     * If moving back would put the pointer before the start of the data, the
     * request is ignored and the pointer stays where it is.
     *
     * @param int $howMany The number of characters to move the pointer back.
     */
    public function unconsume($howMany = 1)
    {
        if (($this->char - $howMany) >= 0) {
            $this->char -= $howMany;
        }
    }

    /**
     * Get the next group that contains hex characters.
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group that is hex characters, or false
     *                      at end-of-file.
     */
    public function getHex()
    {
        return $this->doCharsWhile(static::CHARS_HEX);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters.
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of ASCII alpha characters, or
     *                      false at end-of-file.
     */
    public function getAsciiAlpha()
    {
        return $this->doCharsWhile(static::CHARS_ALPHA);
    }

    /**
     * Get the next group of characters that are ASCII Alpha characters and numbers.
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of ASCII alpha characters and
     *                      numbers, or false at end-of-file.
     */
    public function getAsciiAlphaNum()
    {
        return $this->doCharsWhile(static::CHARS_ALNUM);
    }

    /**
     * Get the next group of numbers.
     * Note, along with getting the characters the pointer in the data will be
     * moved as well.
     *
     * @return string|false The next group of numbers, or false at end-of-file.
     */
    public function getNumeric()
    {
        return $this->doCharsWhile('0123456789');
    }

    /**
     * Consume whitespace.
     * Whitespace in HTML5 is: formfeed, tab, newline, space.
     *
     * @return int|false The length of the matched whitespace (possibly 0),
     *                   or false when already at end-of-file.
     */
    public function whitespace()
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        $len = strspn($this->data, "\n\t\f ", $this->char);

        $this->char += $len;

        return $len;
    }

    /**
     * Returns the current line that is being consumed.
     *
     * @return int The current line number (1-based).
     */
    public function currentLine()
    {
        if (empty($this->EOF) || 0 === $this->char) {
            return 1;
        }

        // Count the newlines in front of the pointer. The length is clamped
        // to $EOF because the pointer may have been moved past the end.
        return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
    }

    /**
     * Read chars until something in the mask is encountered.
     *
     * @param string $mask
     *
     * @return string|false The bytes read (possibly empty), or false at
     *                      end-of-file.
     */
    public function charsUntil($mask)
    {
        return $this->doCharsUntil($mask);
    }

    /**
     * Read chars as long as the mask matches.
     *
     * @param string $mask
     *
     * @return string|false The bytes read (possibly empty), or false at
     *                      end-of-file.
     */
    public function charsWhile($mask)
    {
        return $this->doCharsWhile($mask);
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     *
     * The value is the UTF8Utils::countChars() count of the data lying
     * between the most recent newline before the pointer (or the start of
     * the data when there is none) and the pointer itself. At position 0
     * the column is 0.
     *
     * @return int The column number.
     */
    public function columnOffset()
    {
        // Short circuit for the first char.
        if (0 === $this->char) {
            return 0;
        }

        // strrpos() with a negative offset searches backwards, starting that
        // many bytes from the end of the string. This offset makes byte
        // ($this->char - 1) the last candidate, so the search yields the
        // most recent "\n" strictly before the current byte.
        $backwardFrom = $this->char - 1 - strlen($this->data);
        $lastLine = strrpos($this->data, "\n", $backwardFrom);

        if (false !== $lastLine) {
            // Everything after that newline up to, but not including, the
            // current byte.
            $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine);
        } else {
            // No newline before the pointer: we are still on the first line.
            $findLengthOf = substr($this->data, 0, $this->char);
        }

        // NOTE(review): countChars() presumably counts UTF-8 characters
        // rather than bytes - confirm against UTF8Utils.
        return UTF8Utils::countChars($findLengthOf);
    }

    /**
     * Get all characters until EOF.
     *
     * This consumes characters until the EOF.
     *
     * @return string The remaining data, or an empty string when the pointer
     *                is already at end-of-file.
     */
    public function remainingChars()
    {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;

            return $data;
        }

        return ''; // false;
    }

    /**
     * Replace linefeed characters according to the spec.
     *
     * NUL bytes are additionally replaced by the UTF-8 encoding of U+FFFD
     * (the replacement character).
     *
     * @param string $data
     *
     * @return string
     */
    private function replaceLinefeeds($data)
    {
        /*
         * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
         * Any CR characters that are followed by LF characters must be removed, and any CR characters not
         * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
         * represented by LF characters, and there are never any CR characters in the input to the tokenization
         * stage.
         */
        // strtr() tries the longest keys first, so "\r\n" collapses to a
        // single "\n" before the lone "\r" rule is considered.
        $crlfTable = array(
            "\0" => "\xEF\xBF\xBD",
            "\r\n" => "\n",
            "\r" => "\n",
        );

        return strtr($data, $crlfTable);
    }

    /**
     * Read to a particular match (or until $max bytes are consumed).
     *
     * This operates on byte sequences, not characters.
     *
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string   $bytes Bytes to match.
     * @param int|null $max   Maximum number of bytes to scan; null for no limit.
     *
     * @return string|false The bytes read (possibly an empty string), or
     *                      false when already at end-of-file. Use strict
     *                      comparison on the result, since '' and '0' are
     *                      valid reads.
     */
    private function doCharsUntil($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // Only pass a length when one was really given (0 included).
        if (0 === $max || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Returns the string so long as $bytes matches.
     *
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string   $bytes A mask of bytes to match. If ANY byte in this mask matches the
     *                        current char, the pointer advances and the char is part of the
     *                        substring.
     * @param int|null $max   The max number of chars to read; null for no limit.
     *
     * @return string|false The bytes read (possibly an empty string), or
     *                      false when already at end-of-file.
     */
    private function doCharsWhile($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // Only pass a length when one was really given (0 included).
        if (0 === $max || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }
}