Chris@0
|
1 <?php
|
Chris@0
|
2 /**
|
Chris@0
|
3 * Loads a string to be parsed.
|
Chris@0
|
4 */
|
Chris@17
|
5
|
Chris@0
|
6 namespace Masterminds\HTML5\Parser;
|
Chris@0
|
7
|
Chris@0
|
8 /*
|
Chris@0
|
9 *
|
Chris@0
|
10 * Based on code from html5lib:
|
Chris@0
|
11
|
Chris@0
|
12 Copyright 2009 Geoffrey Sneddon <http://gsnedders.com/>
|
Chris@0
|
13
|
Chris@0
|
14 Permission is hereby granted, free of charge, to any person obtaining a
|
Chris@0
|
15 copy of this software and associated documentation files (the
|
Chris@0
|
16 "Software"), to deal in the Software without restriction, including
|
Chris@0
|
17 without limitation the rights to use, copy, modify, merge, publish,
|
Chris@0
|
18 distribute, sublicense, and/or sell copies of the Software, and to
|
Chris@0
|
19 permit persons to whom the Software is furnished to do so, subject to
|
Chris@0
|
20 the following conditions:
|
Chris@0
|
21
|
Chris@0
|
22 The above copyright notice and this permission notice shall be included
|
Chris@0
|
23 in all copies or substantial portions of the Software.
|
Chris@0
|
24
|
Chris@0
|
25 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
Chris@0
|
26 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
Chris@0
|
27 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
Chris@0
|
28 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
Chris@0
|
29 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
Chris@0
|
30 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
Chris@0
|
31 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
Chris@0
|
32
|
Chris@0
|
33 */
|
Chris@0
|
34
|
Chris@0
|
35 // Some conventions:
|
Chris@0
|
36 // - /* */ indicates verbatim text from the HTML 5 specification
|
Chris@0
|
37 // MPB: Not sure which version of the spec. Moving from HTML5lib to
|
Chris@0
|
38 // HTML5-PHP, I have been using this version:
|
Chris@0
|
39 // http://www.w3.org/TR/2012/CR-html5-20121217/Overview.html#contents
|
Chris@0
|
40 //
|
Chris@0
|
41 // - // indicates regular comments
|
Chris@0
|
42
|
Chris@17
|
43 /**
|
Chris@17
|
44 * @deprecated since 2.4, to remove in 3.0. Use a string in the scanner instead.
|
Chris@17
|
45 */
|
Chris@0
|
class StringInputStream implements InputStream
{
    /**
     * The string data we're parsing.
     */
    private $data;

    /**
     * The current integer byte position we are in $data.
     */
    private $char;

    /**
     * Length of $data in bytes; when $char === $EOF, we are at the end-of-file.
     */
    private $EOF;

    /**
     * Parse errors.
     */
    public $errors = array();

    /**
     * Create a new InputStream wrapper.
     *
     * The data is converted to UTF-8, checked for illegal codepoints (the
     * result is stored in $errors), and has its line endings normalized
     * before the cursor is placed at byte 0.
     *
     * @param string $data     Data to parse.
     * @param string $encoding The encoding to use for the data.
     * @param string $debug    A fprintf format to use to echo the data on stdout.
     */
    public function __construct($data, $encoding = 'UTF-8', $debug = '')
    {
        $data = UTF8Utils::convertToUTF8($data, $encoding);
        if ($debug) {
            fprintf(STDOUT, $debug, $data, strlen($data));
        }

        // There is good reason to question whether it makes sense to
        // do this here, since most of these checks are done during
        // parsing, and since this check doesn't actually *do* anything.
        $this->errors = UTF8Utils::checkForIllegalCodepoints($data);

        $data = $this->replaceLinefeeds($data);

        $this->data = $data;
        $this->char = 0;
        $this->EOF = strlen($data);
    }

    /**
     * Returns the full (already normalized) data held by this stream.
     *
     * @return string
     */
    public function __toString()
    {
        return $this->data;
    }

    /**
     * Replace linefeed characters according to the spec.
     *
     * Besides CR/CRLF normalization, NUL bytes are replaced with the UTF-8
     * encoding of U+FFFD REPLACEMENT CHARACTER.
     *
     * @param string $data The data to normalize.
     *
     * @return string The normalized data.
     */
    protected function replaceLinefeeds($data)
    {
        /*
         * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
         * Any CR characters that are followed by LF characters must be removed, and any CR characters not
         * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
         * represented by LF characters, and there are never any CR characters in the input to the tokenization
         * stage.
         */
        // strtr() with an array tries the longest keys first, so "\r\n" is
        // collapsed as a pair before a lone "\r" is considered.
        $crlfTable = array(
            "\0" => "\xEF\xBF\xBD",
            "\r\n" => "\n",
            "\r" => "\n",
        );

        return strtr($data, $crlfTable);
    }

    /**
     * Returns the current line that the tokenizer is at.
     *
     * @return int The 1-based line number.
     */
    public function currentLine()
    {
        if (empty($this->EOF) || 0 === $this->char) {
            return 1;
        }

        // Count the newlines in the bytes consumed so far. The cursor is
        // clamped to EOF because next() can advance it past the end.
        return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
    }

    /**
     * @deprecated
     */
    public function getCurrentLine()
    {
        return $this->currentLine();
    }

    /**
     * Returns the current column of the current line that the tokenizer is at.
     * Newlines are column 0. The first char after a newline is column 1.
     *
     * @return int The column number.
     */
    public function columnOffset()
    {
        // next() does not stop at EOF, so clamp the cursor the same way
        // currentLine() does. Without this, a cursor past the end turns the
        // strrpos() offset below non-negative and the wrong span is measured.
        $position = min($this->char, $this->EOF);

        // Short circuit for the first char (and for empty input).
        if (0 === $position) {
            return 0;
        }

        // strrpos needs a negative offset to search backwards from a point
        // relative to the end of the string. The byte at that offset is
        // included in the search, so this looks for the last "\n" among the
        // bytes already consumed (indexes 0 .. $position - 1).
        $backwardFrom = $position - 1 - $this->EOF;
        $lastLine = strrpos($this->data, "\n", $backwardFrom);

        if (false !== $lastLine) {
            // Everything consumed after the most recent newline.
            $findLengthOf = substr($this->data, $lastLine + 1, $position - 1 - $lastLine);
        } else {
            // No newline consumed yet: still on the first line.
            $findLengthOf = substr($this->data, 0, $position);
        }

        return UTF8Utils::countChars($findLengthOf);
    }

    /**
     * @deprecated
     */
    public function getColumnOffset()
    {
        return $this->columnOffset();
    }

    /**
     * Get the current character.
     *
     * No bounds checking is performed here; check valid() before calling.
     *
     * @return string The byte at the current position.
     */
    public function current()
    {
        return $this->data[$this->char];
    }

    /**
     * Advance the pointer.
     * This is part of the Iterator interface.
     */
    public function next()
    {
        ++$this->char;
    }

    /**
     * Rewind to the start of the string.
     */
    public function rewind()
    {
        $this->char = 0;
    }

    /**
     * Is the current pointer location valid.
     *
     * @return bool Whether the current pointer location is valid.
     */
    public function valid()
    {
        return $this->char < $this->EOF;
    }

    /**
     * Get all characters until EOF.
     *
     * This reads to the end of the file, and sets the read marker at the
     * end of the file.
     *
     * Note this performs bounds checking.
     *
     * @return string Returns the remaining text. If called when the InputStream is
     *                already exhausted, it returns an empty string.
     */
    public function remainingChars()
    {
        if ($this->char < $this->EOF) {
            $data = substr($this->data, $this->char);
            $this->char = $this->EOF;

            return $data;
        }

        return '';
    }

    /**
     * Read to a particular match (or until $max bytes are consumed).
     *
     * This operates on byte sequences, not characters.
     *
     * Matches as far as possible until we reach a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string $bytes Bytes to match.
     * @param int    $max   Maximum number of bytes to scan.
     *
     * @return string|false The consumed bytes (possibly an empty string), or
     *                      false if the stream is already at EOF. Use strict
     *                      comparison when checking the result.
     */
    public function charsUntil($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // Only pass a length when a limit was actually given; an explicit 0
        // is honoured as a limit.
        if (0 === $max || $max) {
            $len = strcspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strcspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Returns the string so long as $bytes matches.
     *
     * Matches as far as possible with a certain set of bytes
     * and returns the matched substring. The pointer is advanced past the
     * returned bytes.
     *
     * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the
     *                      current char, the pointer advances and the char is part of the
     *                      substring.
     * @param int    $max   The max number of chars to read.
     *
     * @return string|false The consumed bytes (possibly an empty string), or
     *                      false if the stream is already at EOF.
     */
    public function charsWhile($bytes, $max = null)
    {
        if ($this->char >= $this->EOF) {
            return false;
        }

        // Only pass a length when a limit was actually given; an explicit 0
        // is honoured as a limit.
        if (0 === $max || $max) {
            $len = strspn($this->data, $bytes, $this->char, $max);
        } else {
            $len = strspn($this->data, $bytes, $this->char);
        }

        $string = (string) substr($this->data, $this->char, $len);
        $this->char += $len;

        return $string;
    }

    /**
     * Unconsume characters.
     *
     * If moving back by $howMany would place the pointer before the start of
     * the data, the pointer is left where it is.
     *
     * @param int $howMany The number of characters to unconsume.
     */
    public function unconsume($howMany = 1)
    {
        if (($this->char - $howMany) >= 0) {
            $this->char -= $howMany;
        }
    }

    /**
     * Look ahead without moving cursor.
     *
     * @return string|false The byte following the current one, or false when
     *                      there is no such byte.
     */
    public function peek()
    {
        // Strictly less than: index $EOF itself is one past the last byte.
        if (($this->char + 1) < $this->EOF) {
            return $this->data[$this->char + 1];
        }

        return false;
    }

    /**
     * Returns the current byte position of the pointer.
     *
     * @return int
     */
    public function key()
    {
        return $this->char;
    }
}
|