Chris@0
|
1 <?php
|
Chris@17
|
2
|
Chris@0
|
3 namespace Masterminds\HTML5\Parser;
|
Chris@0
|
4
|
Chris@0
|
5 use Masterminds\HTML5\Elements;
|
Chris@17
|
6 use Masterminds\HTML5\InstructionProcessor;
|
Chris@0
|
7
|
Chris@0
|
8 /**
|
Chris@0
|
9 * Create an HTML5 DOM tree from events.
|
Chris@0
|
10 *
|
Chris@0
|
11 * This attempts to create a DOM from events emitted by a parser. This
|
Chris@0
|
12 * attempts (but does not guarantee) to up-convert older HTML documents
|
Chris@0
|
13 * to HTML5. It does this by applying HTML5's rules, but it will not
|
Chris@0
|
14 * change the architecture of the document itself.
|
Chris@0
|
15 *
|
Chris@0
|
16 * Many of the error correction and quirks features suggested in the specification
|
Chris@0
|
17 * are implemented herein; however, not all of them are. Since we do not
|
Chris@0
|
18 * assume a graphical user agent, no presentation-specific logic is conducted
|
Chris@0
|
19 * during tree building.
|
Chris@0
|
20 *
|
Chris@0
|
21 * FIXME: The present tree builder does not exactly follow the state machine rules
|
Chris@0
|
22 * for insert modes as outlined in the HTML5 spec. The processor needs to be
|
Chris@0
|
23 * re-written to accommodate this. See, for example, the Go language HTML5
|
Chris@0
|
24 * parser.
|
Chris@0
|
25 */
|
Chris@0
|
26 class DOMTreeBuilder implements EventHandler
|
Chris@0
|
27 {
|
Chris@0
|
28 /**
|
Chris@17
|
29 * Defined in http://www.w3.org/TR/html51/infrastructure.html#html-namespace-0.
|
Chris@0
|
30 */
|
Chris@0
|
31 const NAMESPACE_HTML = 'http://www.w3.org/1999/xhtml';
|
Chris@0
|
32
|
Chris@0
|
33 const NAMESPACE_MATHML = 'http://www.w3.org/1998/Math/MathML';
|
Chris@0
|
34
|
Chris@0
|
35 const NAMESPACE_SVG = 'http://www.w3.org/2000/svg';
|
Chris@0
|
36
|
Chris@0
|
37 const NAMESPACE_XLINK = 'http://www.w3.org/1999/xlink';
|
Chris@0
|
38
|
Chris@0
|
39 const NAMESPACE_XML = 'http://www.w3.org/XML/1998/namespace';
|
Chris@0
|
40
|
Chris@0
|
41 const NAMESPACE_XMLNS = 'http://www.w3.org/2000/xmlns/';
|
Chris@0
|
42
|
Chris@0
|
43 const OPT_DISABLE_HTML_NS = 'disable_html_ns';
|
Chris@0
|
44
|
Chris@0
|
45 const OPT_TARGET_DOC = 'target_document';
|
Chris@0
|
46
|
Chris@0
|
47 const OPT_IMPLICIT_NS = 'implicit_namespaces';
|
Chris@0
|
48
|
Chris@0
|
49 /**
|
Chris@17
|
50 * Holds the HTML5 element names that cause a namespace switch.
|
Chris@0
|
51 *
|
Chris@0
|
52 * @var array
|
Chris@0
|
53 */
|
Chris@0
|
54 protected $nsRoots = array(
|
Chris@0
|
55 'html' => self::NAMESPACE_HTML,
|
Chris@0
|
56 'svg' => self::NAMESPACE_SVG,
|
Chris@17
|
57 'math' => self::NAMESPACE_MATHML,
|
Chris@0
|
58 );
|
Chris@0
|
59
|
Chris@0
|
60 /**
|
Chris@0
|
61 * Holds the always available namespaces (which do not require the XMLNS declaration).
|
Chris@0
|
62 *
|
Chris@0
|
63 * @var array
|
Chris@0
|
64 */
|
Chris@0
|
65 protected $implicitNamespaces = array(
|
Chris@0
|
66 'xml' => self::NAMESPACE_XML,
|
Chris@0
|
67 'xmlns' => self::NAMESPACE_XMLNS,
|
Chris@17
|
68 'xlink' => self::NAMESPACE_XLINK,
|
Chris@0
|
69 );
|
Chris@0
|
70
|
Chris@0
|
71 /**
|
Chris@0
|
72 * Holds a stack of currently active namespaces.
|
Chris@0
|
73 *
|
Chris@0
|
74 * @var array
|
Chris@0
|
75 */
|
Chris@0
|
76 protected $nsStack = array();
|
Chris@0
|
77
|
Chris@0
|
78 /**
|
Chris@0
|
79 * Holds the number of namespaces declared by a node.
|
Chris@0
|
80 *
|
Chris@0
|
81 * @var array
|
Chris@0
|
82 */
|
Chris@0
|
83 protected $pushes = array();
|
Chris@0
|
84
|
Chris@0
|
85 /**
|
Chris@0
|
86 * Defined in 8.2.5.
|
Chris@0
|
87 */
|
Chris@0
|
88 const IM_INITIAL = 0;
|
Chris@0
|
89
|
Chris@0
|
90 const IM_BEFORE_HTML = 1;
|
Chris@0
|
91
|
Chris@0
|
92 const IM_BEFORE_HEAD = 2;
|
Chris@0
|
93
|
Chris@0
|
94 const IM_IN_HEAD = 3;
|
Chris@0
|
95
|
Chris@0
|
96 const IM_IN_HEAD_NOSCRIPT = 4;
|
Chris@0
|
97
|
Chris@0
|
98 const IM_AFTER_HEAD = 5;
|
Chris@0
|
99
|
Chris@0
|
100 const IM_IN_BODY = 6;
|
Chris@0
|
101
|
Chris@0
|
102 const IM_TEXT = 7;
|
Chris@0
|
103
|
Chris@0
|
104 const IM_IN_TABLE = 8;
|
Chris@0
|
105
|
Chris@0
|
106 const IM_IN_TABLE_TEXT = 9;
|
Chris@0
|
107
|
Chris@0
|
108 const IM_IN_CAPTION = 10;
|
Chris@0
|
109
|
Chris@0
|
110 const IM_IN_COLUMN_GROUP = 11;
|
Chris@0
|
111
|
Chris@0
|
112 const IM_IN_TABLE_BODY = 12;
|
Chris@0
|
113
|
Chris@0
|
114 const IM_IN_ROW = 13;
|
Chris@0
|
115
|
Chris@0
|
116 const IM_IN_CELL = 14;
|
Chris@0
|
117
|
Chris@0
|
118 const IM_IN_SELECT = 15;
|
Chris@0
|
119
|
Chris@0
|
120 const IM_IN_SELECT_IN_TABLE = 16;
|
Chris@0
|
121
|
Chris@0
|
122 const IM_AFTER_BODY = 17;
|
Chris@0
|
123
|
Chris@0
|
124 const IM_IN_FRAMESET = 18;
|
Chris@0
|
125
|
Chris@0
|
126 const IM_AFTER_FRAMESET = 19;
|
Chris@0
|
127
|
Chris@0
|
128 const IM_AFTER_AFTER_BODY = 20;
|
Chris@0
|
129
|
Chris@0
|
130 const IM_AFTER_AFTER_FRAMESET = 21;
|
Chris@0
|
131
|
Chris@0
|
132 const IM_IN_SVG = 22;
|
Chris@0
|
133
|
Chris@0
|
134 const IM_IN_MATHML = 23;
|
Chris@0
|
135
|
Chris@0
|
136 protected $options = array();
|
Chris@0
|
137
|
Chris@0
|
138 protected $stack = array();
|
Chris@0
|
139
|
Chris@0
|
140 protected $current; // Pointer in the tag hierarchy.
|
Chris@17
|
141 protected $rules;
|
Chris@0
|
142 protected $doc;
|
Chris@0
|
143
|
Chris@0
|
144 protected $frag;
|
Chris@0
|
145
|
Chris@0
|
146 protected $processor;
|
Chris@0
|
147
|
Chris@0
|
148 protected $insertMode = 0;
|
Chris@0
|
149
|
Chris@0
|
150 /**
|
Chris@17
|
151 * Track if we are in an element that allows only inline child nodes.
|
Chris@17
|
152 *
|
Chris@0
|
153 * @var string|null
|
Chris@0
|
154 */
|
Chris@0
|
155 protected $onlyInline;
|
Chris@0
|
156
|
Chris@0
|
157 /**
|
Chris@0
|
158 * Quirks mode is enabled by default.
|
Chris@17
|
159 * Any document that is missing the DT will be considered to be in quirks mode.
|
Chris@0
|
160 */
|
Chris@0
|
161 protected $quirks = true;
|
Chris@0
|
162
|
Chris@0
|
163 protected $errors = array();
|
Chris@0
|
164
|
Chris@0
|
/**
 * Set up the tree builder and the document (or fragment) it writes into.
 *
 * @param bool  $isFragment When true, a document fragment is created, becomes the
 *                          insertion point, and the insert mode starts at IM_IN_BODY.
 * @param array $options    Builder options. OPT_TARGET_DOC supplies an existing document
 *                          to build into; OPT_IMPLICIT_NS (or the 'implicitNamespaces'
 *                          key) supplies extra prefix => namespace URI pairs.
 */
public function __construct($isFragment = false, array $options = array())
{
    $this->options = $options;

    if (isset($options[self::OPT_TARGET_DOC])) {
        $this->doc = $options[self::OPT_TARGET_DOC];
    } else {
        $impl = new \DOMImplementation();
        // XXX:
        // Create the doctype. For now, we are always creating HTML5
        // documents, and attempting to up-convert any older DTDs to HTML5.
        $dt = $impl->createDocumentType('html');
        // The qualified name must be a string: passing null to this non-nullable
        // parameter is deprecated as of PHP 8.1. The empty string requests the same
        // thing (no root element is created here; startTag() adds <html> later).
        $this->doc = $impl->createDocument(null, '', $dt);
    }

    $this->errors = array();

    // Insertion starts at the document itself, not at a document element.
    $this->current = $this->doc;

    // Create a rules engine for tags.
    $this->rules = new TreeBuildingRules();

    // The constant-named option wins over the legacy 'implicitNamespaces' key.
    $implicitNS = array();
    if (isset($this->options[self::OPT_IMPLICIT_NS])) {
        $implicitNS = $this->options[self::OPT_IMPLICIT_NS];
    } elseif (isset($this->options['implicitNamespaces'])) {
        $implicitNS = $this->options['implicitNamespaces'];
    }

    // Fill $nsStack with the default HTML5 namespaces, plus the implicit namespaces
    // taken from $options. With the array union operator the left-hand entries win,
    // so caller-supplied prefixes override the built-in ones.
    array_unshift($this->nsStack, $implicitNS + array('' => self::NAMESPACE_HTML) + $this->implicitNamespaces);

    if ($isFragment) {
        $this->insertMode = static::IM_IN_BODY;
        $this->frag = $this->doc->createDocumentFragment();
        $this->current = $this->frag;
    }
}
|
Chris@0
|
204
|
Chris@0
|
/**
 * Retrieve the document this builder writes into.
 *
 * @return \DOMDocument Either the document created in the constructor or the value
 *                      taken as-is from the OPT_TARGET_DOC option (presumably a
 *                      \DOMDocument as well).
 */
public function document()
{
    return $this->doc;
}
|
Chris@0
|
212
|
Chris@0
|
/**
 * Retrieve the document fragment that collects the parsed nodes.
 *
 * The fragment is only created when the builder was constructed with
 * $isFragment set; it may hold zero or more nodes at its root.
 *
 * @see http://www.w3.org/TR/2012/CR-html5-20121217/syntax.html#concept-frag-parse-context
 *
 * @return \DOMDocumentFragment|null The fragment, or null when the builder is not
 *                                   in fragment mode.
 */
public function fragment()
{
    return $this->frag;
}
|
Chris@0
|
227
|
Chris@0
|
/**
 * Register the handler for processing instructions.
 *
 * Once set, processingInstruction() hands every instruction to this object
 * instead of inserting a PI node into the DOM tree itself.
 *
 * @param InstructionProcessor $proc The processor to delegate to.
 */
public function setInstructionProcessor(InstructionProcessor $proc)
{
    $this->processor = $proc;
}
|
Chris@0
|
240
|
Chris@0
|
/**
 * Handle a DOCTYPE declaration.
 *
 * The inbound doctype is not preserved (documents are always built as HTML5);
 * it is only used to record quirks mode and to leave the initial insert mode.
 *
 * @param string      $name   The doctype name; only used in the error message.
 * @param int         $idType Unused here.
 * @param string|null $id     Unused here.
 * @param bool        $quirks Whether the doctype puts the document in quirks mode.
 */
public function doctype($name, $idType = 0, $id = null, $quirks = false)
{
    // A DOCTYPE after the initial insert mode is ignored entirely. Bail out before
    // touching any state so that it cannot change the quirks flag either.
    if ($this->insertMode > static::IM_INITIAL) {
        $this->parseError('Illegal placement of DOCTYPE tag. Ignoring: ' . $name);

        return;
    }

    $this->quirks = $quirks;
    $this->insertMode = static::IM_BEFORE_HTML;
}
|
Chris@0
|
255
|
Chris@0
|
/**
 * Process the start tag.
 *
 * Creates the element, resolves its namespace, copies its attributes and, unless
 * the element is a void tag, makes it the current insertion point.
 *
 * @todo - XMLNS namespace handling (we need to parse, even if it's not valid)
 *       - XLink, MathML and SVG namespace handling
 *       - Omission rules: 8.1.2.4 Optional tags
 *
 * @param string $name        The tag name as emitted by the parser.
 * @param array  $attributes  Attribute name => value pairs.
 * @param bool   $selfClosing Whether the tag was written as self-closing.
 *
 * @return int The value of Elements::element() for the (possibly rewritten) tag name.
 */
public function startTag($name, $attributes = array(), $selfClosing = false)
{
    $lname = $this->normalizeTagName($name);

    // Make sure we have an html element.
    if (!$this->doc->documentElement && 'html' !== $name && !$this->frag) {
        $this->startTag('html');
    }

    // Set quirks mode if we're at IM_INITIAL with no doctype.
    if ($this->insertMode === static::IM_INITIAL) {
        $this->quirks = true;
        $this->parseError('No DOCTYPE specified.');
    }

    // SPECIAL TAG HANDLING:
    // Outside of SVG/MathML a start tag named "image" is treated as "img".
    // Both names are rewritten so that the element created below matches the
    // name used for the rule and void-tag lookups.
    if ('image' === $name && !($this->insertMode === static::IM_IN_SVG || $this->insertMode === static::IM_IN_MATHML)) {
        $name = 'img';
        $lname = $this->normalizeTagName($name);
    }

    // Autoclose p tags where appropriate.
    if ($this->insertMode >= static::IM_IN_BODY && Elements::isA($name, Elements::AUTOCLOSE_P)) {
        $this->autoclose('p');
    }

    // Set insert mode:
    switch ($name) {
        case 'html':
            $this->insertMode = static::IM_BEFORE_HEAD;
            break;
        case 'head':
            if ($this->insertMode > static::IM_BEFORE_HEAD) {
                $this->parseError('Unexpected head tag outside of head context.');
            } else {
                $this->insertMode = static::IM_IN_HEAD;
            }
            break;
        case 'body':
            $this->insertMode = static::IM_IN_BODY;
            break;
        case 'svg':
            $this->insertMode = static::IM_IN_SVG;
            break;
        case 'math':
            $this->insertMode = static::IM_IN_MATHML;
            break;
        case 'noscript':
            if ($this->insertMode === static::IM_IN_HEAD) {
                $this->insertMode = static::IM_IN_HEAD_NOSCRIPT;
            }
            break;
    }

    // Special case handling for SVG.
    if ($this->insertMode === static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    // Number of entries this tag pushes onto $nsStack; they are popped again in
    // endTag() (or at the end of this method for void tags).
    $pushes = 0;
    // When we find a tag that appears in $nsRoots, we have to switch the default namespace.
    if (isset($this->nsRoots[$lname]) && $this->nsStack[0][''] !== $this->nsRoots[$lname]) {
        array_unshift($this->nsStack, array(
            '' => $this->nsRoots[$lname],
        ) + $this->nsStack[0]);
        ++$pushes;
    }

    // Stays false unless an explicit xmlns="..." attribute is seen; otherwise it
    // holds that attribute's value.
    $needsWorkaround = false;
    if (isset($this->options['xmlNamespaces']) && $this->options['xmlNamespaces']) {
        // When xmlNamespaces is enabled and we find an 'xmlns' or 'xmlns:*' attribute,
        // a new entry is added to $nsStack.
        foreach ($attributes as $aName => $aVal) {
            if ('xmlns' === $aName) {
                $needsWorkaround = $aVal;
                array_unshift($this->nsStack, array(
                    '' => $aVal,
                ) + $this->nsStack[0]);
                ++$pushes;
            } elseif ('xmlns' === (($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : '')) {
                array_unshift($this->nsStack, array(
                    substr($aName, $pos + 1) => $aVal,
                ) + $this->nsStack[0]);
                ++$pushes;
            }
        }
    }

    // A block-level tag ends the element that only allows inline children.
    if ($this->onlyInline && Elements::isA($lname, Elements::BLOCK_TAG)) {
        $this->autoclose($this->onlyInline);
        $this->onlyInline = null;
    }

    try {
        $prefix = ($pos = strpos($lname, ':')) ? substr($lname, 0, $pos) : '';

        if (false !== $needsWorkaround) {
            // The element is built by parsing a tiny XML document that carries the
            // xmlns declaration. The namespace values come from the parsed markup,
            // so they are escaped before being placed inside the attribute quotes.
            $xml = "<$lname xmlns=\"" . htmlspecialchars((string) $needsWorkaround, ENT_QUOTES, 'UTF-8') . '" ';
            if (strlen($prefix) && isset($this->nsStack[0][$prefix])) {
                $xml .= "xmlns:$prefix=\"" . htmlspecialchars($this->nsStack[0][$prefix], ENT_QUOTES, 'UTF-8') . '"';
            }
            $xml .= '/>';

            $frag = new \DOMDocument('1.0', 'UTF-8');
            if (!$frag->loadXML($xml) || !$frag->documentElement) {
                // loadXML() reports malformed input through its return value rather
                // than an exception (e.g. for a tag name that is not a legal XML
                // name). Route that to the <invalid> fallback below instead of
                // handing a missing node to importNode().
                throw new \DOMException("Could not create <$lname> with its xmlns declaration.");
            }

            $ele = $this->doc->importNode($frag->documentElement, true);
        } else {
            if (!isset($this->nsStack[0][$prefix]) || ('' === $prefix && isset($this->options[self::OPT_DISABLE_HTML_NS]) && $this->options[self::OPT_DISABLE_HTML_NS])) {
                $ele = $this->doc->createElement($lname);
            } else {
                $ele = $this->doc->createElementNS($this->nsStack[0][$prefix], $lname);
            }
        }
    } catch (\DOMException $e) {
        $this->parseError("Illegal tag name: <$lname>. Replaced with <invalid>.");
        $ele = $this->doc->createElement('invalid');
    }

    if (Elements::isA($lname, Elements::BLOCK_ONLY_INLINE)) {
        $this->onlyInline = $lname;
    }

    // When we add some namespaces, we have to track them. Later, when endTag() is invoked, we have to remove them.
    // When we are on a void tag, we do not need to care about namespace nesting.
    if ($pushes > 0 && !Elements::isA($name, Elements::VOID_TAG)) {
        // PHP tends to free the memory used by DOM,
        // to avoid spl_object_hash collisions we have to avoid garbage collection of $ele by storing it in $pushes
        // see https://bugs.php.net/bug.php?id=67459
        $this->pushes[spl_object_hash($ele)] = array($pushes, $ele);

        // SEE https://github.com/facebook/hhvm/issues/2962
        if (defined('HHVM_VERSION')) {
            $ele->setAttribute('html5-php-fake-id-attribute', spl_object_hash($ele));
        }
    }

    foreach ($attributes as $aName => $aVal) {
        // xmlns attributes can't be set
        if ('xmlns' === $aName) {
            continue;
        }

        if ($this->insertMode === static::IM_IN_SVG) {
            $aName = Elements::normalizeSvgAttribute($aName);
        } elseif ($this->insertMode === static::IM_IN_MATHML) {
            $aName = Elements::normalizeMathMlAttribute($aName);
        }

        try {
            $prefix = ($pos = strpos($aName, ':')) ? substr($aName, 0, $pos) : false;

            if ('xmlns' === $prefix) {
                $ele->setAttributeNS(self::NAMESPACE_XMLNS, $aName, $aVal);
            } elseif (false !== $prefix && isset($this->nsStack[0][$prefix])) {
                $ele->setAttributeNS($this->nsStack[0][$prefix], $aName, $aVal);
            } else {
                $ele->setAttribute($aName, $aVal);
            }
        } catch (\DOMException $e) {
            $this->parseError("Illegal attribute name for tag $name. Ignoring: $aName");
            continue;
        }

        // This is necessary on a non-DTD schema, like HTML5.
        if ('id' === $aName) {
            $ele->setIdAttribute('id', true);
        }
    }

    if ($this->frag !== $this->current && $this->rules->hasRules($name)) {
        // Some elements have special processing rules. Handle those separately.
        $this->current = $this->rules->evaluate($ele, $this->current);
    } else {
        // Otherwise, it's a standard element.
        $this->current->appendChild($ele);

        if (!Elements::isA($name, Elements::VOID_TAG)) {
            $this->current = $ele;
        }

        // Self-closing tags should only be respected on foreign elements
        // (and are implied on void elements)
        // See: https://www.w3.org/TR/html5/syntax.html#start-tags
        if (Elements::isHtml5Element($name)) {
            $selfClosing = false;
        }
    }

    // This is sort of a last-ditch attempt to correct for cases where no head/body
    // elements are provided.
    if ($this->insertMode <= static::IM_BEFORE_HEAD && 'head' !== $name && 'html' !== $name) {
        $this->insertMode = static::IM_IN_BODY;
    }

    // When we are on a void tag, we do not need to care about namespace nesting,
    // but we have to remove the namespaces pushed to $nsStack.
    if ($pushes > 0 && Elements::isA($name, Elements::VOID_TAG)) {
        // remove the namespaces defined by the current node
        for ($i = 0; $i < $pushes; ++$i) {
            array_shift($this->nsStack);
        }
    }

    if ($selfClosing) {
        $this->endTag($name);
    }

    // Return the element mask, which the tokenizer can then use to set
    // various processing rules.
    return Elements::element($name);
}
|
Chris@0
|
476
|
Chris@0
|
/**
 * Process an end tag.
 *
 * Closes the nearest open element with a matching name, pops the namespaces that
 * the current element pushed, and updates the insert mode.
 *
 * @param string $name The tag name as emitted by the parser.
 */
public function endTag($name)
{
    $lname = $this->normalizeTagName($name);

    // Ignore closing tags for unary elements.
    if (Elements::isA($name, Elements::VOID_TAG)) {
        return;
    }

    if ($this->insertMode <= static::IM_BEFORE_HTML) {
        // 8.2.5.4.2
        if (in_array($name, array(
            'html',
            'br',
            'head',
            'title',
        ), true)) {
            // Open the implied <html> first; that moves the insert mode past
            // IM_BEFORE_HTML, so the nested call takes the regular path below.
            $this->startTag('html');
            $this->endTag($name);
            $this->insertMode = static::IM_BEFORE_HEAD;

            return;
        }

        // Ignore the tag.
        $this->parseError('Illegal closing tag at global scope.');

        return;
    }

    // Special case handling for SVG.
    if ($this->insertMode === static::IM_IN_SVG) {
        $lname = Elements::normalizeSvgElement($lname);
    }

    // Key under which startTag() recorded the namespace pushes of the current element.
    // See https://github.com/facebook/hhvm/issues/2962
    if (defined('HHVM_VERSION') && ($cid = $this->current->getAttribute('html5-php-fake-id-attribute'))) {
        $this->current->removeAttribute('html5-php-fake-id-attribute');
    } else {
        $cid = spl_object_hash($this->current);
    }

    // XXX: HTML has no parent. What do we do, though,
    // if this element appears in the wrong place?
    if ('html' === $lname) {
        return;
    }

    // remove the namespaces defined by the current node
    if (isset($this->pushes[$cid])) {
        for ($i = 0; $i < $this->pushes[$cid][0]; ++$i) {
            array_shift($this->nsStack);
        }
        unset($this->pushes[$cid]);
    }

    if (!$this->autoclose($lname)) {
        $this->parseError('Could not find closing tag for ' . $lname);
    }

    switch ($lname) {
        case 'head':
            $this->insertMode = static::IM_AFTER_HEAD;
            break;
        case 'body':
            $this->insertMode = static::IM_AFTER_BODY;
            break;
        case 'svg':
        // startTag() enters IM_IN_MATHML on <math>, so </math> is the tag that has
        // to leave it again. 'mathml' is kept for backward compatibility.
        case 'math':
        case 'mathml':
            $this->insertMode = static::IM_IN_BODY;
            break;
    }
}
|
Chris@0
|
550
|
Chris@0
|
/**
 * Append a comment node at the current insertion point.
 *
 * @param string $cdata The comment text.
 */
public function comment($cdata)
{
    // TODO: Need to handle case where comment appears outside of the HTML tag.
    $this->current->appendChild($this->doc->createComment($cdata));
}
|
Chris@0
|
557
|
Chris@0
|
/**
 * Append character data at the current insertion point.
 *
 * Text that arrives before the head is never inserted; if it contains anything
 * other than whitespace, a parse error is recorded as well.
 *
 * @param string $data The character data.
 */
public function text($data)
{
    // XXX: Hmmm.... should we really be this strict?
    if ($this->insertMode < static::IM_IN_HEAD) {
        // Per '8.2.5.4.3 The "before head" insertion mode' the characters
        // " \t\n\r\f" should be ignored but no mention of a parse error. This is
        // practical as most documents contain these characters. Other text is not
        // expected here so recording a parse error is necessary.
        $dataTmp = trim($data, " \t\n\r\f");
        // Compare against the empty string explicitly: empty() also treats the
        // text "0" as empty, which would drop it without reporting anything.
        if ('' !== $dataTmp) {
            $this->parseError('Unexpected text. Ignoring: ' . $dataTmp);
        }

        return;
    }

    $node = $this->doc->createTextNode($data);
    $this->current->appendChild($node);
}
|
Chris@0
|
578
|
Chris@0
|
/**
 * Signal the end of the input.
 *
 * Nothing is done here at present; elements that are still open are left as they are.
 */
public function eof()
{
    // Open question: does anything need to happen when $current is not the root?
}
|
Chris@0
|
583
|
Chris@0
|
/**
 * Record a parse error.
 *
 * The error is not thrown; it is formatted and appended to the error list.
 *
 * @param string $msg  Description of the problem.
 * @param int    $line Line the problem was found on (0 when not supplied).
 * @param int    $col  Column the problem was found at (0 when not supplied).
 */
public function parseError($msg, $line = 0, $col = 0)
{
    $entry = sprintf('Line %d, Col %d: %s', $line, $col, $msg);
    $this->errors[] = $entry;
}
|
Chris@0
|
588
|
Chris@0
|
/**
 * Retrieve the parse errors recorded so far.
 *
 * @return array The collected error messages, in the order they were recorded.
 */
public function getErrors()
{
    return $this->errors;
}
|
Chris@0
|
593
|
Chris@0
|
/**
 * Append a CDATA section at the current insertion point.
 *
 * @param string $data The content of the CDATA section.
 */
public function cdata($data)
{
    $this->current->appendChild($this->doc->createCDATASection($data));
}
|
Chris@0
|
599
|
Chris@0
|
/**
 * Handle a processing instruction.
 *
 * If an instruction processor has been registered it receives the instruction;
 * otherwise a plain PI node is appended at the current insertion point.
 *
 * @param string      $name The PI target.
 * @param string|null $data The PI content, if any.
 */
public function processingInstruction($name, $data = null)
{
    // XXX: Ignore initial XML declaration, per the spec.
    if ($this->insertMode === static::IM_INITIAL && 'xml' === strtolower($name)) {
        return;
    }

    // Important: The processor may modify the current DOM tree however it sees fit.
    if ($this->processor instanceof InstructionProcessor) {
        $res = $this->processor->process($this->current, $name, $data);
        // A non-empty result becomes the new insertion point.
        if (!empty($res)) {
            $this->current = $res;
        }

        return;
    }

    // Otherwise, this is just a dumb PI element. The DOM method takes a string for
    // the content, so the null default is mapped to the empty string (passing null
    // there is deprecated as of PHP 8.1).
    $node = $this->doc->createProcessingInstruction($name, null === $data ? '' : $data);

    $this->current->appendChild($node);
}
|
Chris@0
|
622
|
Chris@0
|
623 // ==========================================================================
|
Chris@0
|
624 // UTILITIES
|
Chris@0
|
625 // ==========================================================================
|
Chris@0
|
626
|
Chris@0
|
/**
 * Apply normalization rules to a tag name.
 * See sections 2.9 and 8.1.2.
 *
 * At present the name is handed back untouched. In particular a namespace
 * prefix ("prefix:name") is deliberately kept, because section 2.9 suggests
 * that it should not be stripped.
 *
 * @param string $tagName The tag name to normalize.
 *
 * @return string The normalized tag name.
 */
protected function normalizeTagName($tagName)
{
    return $tagName;
}
|
Chris@0
|
642
|
Chris@0
|
/**
 * Placeholder for quirks-mode tree resolution.
 *
 * @param string $name Unused.
 *
 * @throws \Exception Always, because the method is not implemented.
 */
protected function quirksTreeResolver($name)
{
    throw new \Exception('Not implemented.');
}
|
Chris@0
|
647
|
Chris@0
|
/**
 * Close the nearest open element with the given tag name.
 *
 * The search starts at the current node and walks up through its ancestors.
 * On a match the insertion point moves to the parent of the matched element.
 *
 * @param string $tagName The tag name to look for.
 *
 * @return bool True when a matching element was found and closed, false otherwise.
 */
protected function autoclose($tagName)
{
    $node = $this->current;

    while (true) {
        // Anything that is not an element ends the search.
        if (XML_ELEMENT_NODE !== $node->nodeType) {
            return false;
        }

        if ($tagName === $node->tagName) {
            $this->current = $node->parentNode;

            return true;
        }

        $node = $node->parentNode;
        if (!$node) {
            return false;
        }
    }
}
|
Chris@0
|
671
|
Chris@0
|
/**
 * Tell whether an element with the given tag name is currently open.
 *
 * The current node itself counts: the walk starts there and continues upwards
 * for as long as it keeps meeting element nodes.
 *
 * @param string $tagName The tag name to look for.
 *
 * @return bool True if the current node or one of its element ancestors matches.
 */
protected function isAncestor($tagName)
{
    for ($node = $this->current; XML_ELEMENT_NODE === $node->nodeType; $node = $node->parentNode) {
        if ($tagName === $node->tagName) {
            return true;
        }
    }

    return false;
}
|
Chris@0
|
694
|
Chris@0
|
/**
 * Tell whether the current node (the element new children would be appended to)
 * has the given tag name.
 *
 * @param string $tagName The tag name to compare against.
 *
 * @return bool True when the tag name of the current node is identical to $tagName.
 */
protected function isParent($tagName)
{
    $currentTag = $this->current->tagName;

    return $tagName === $currentTag;
}
|
Chris@0
|
706 }
|