Chris@0: scanner = $scanner; Chris@0: $this->events = $eventHandler; Chris@0: $this->mode = $mode; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Begin parsing. Chris@0: * Chris@0: * This will begin scanning the document, tokenizing as it goes. Chris@0: * Tokens are emitted into the event handler. Chris@0: * Chris@0: * Tokenizing will continue until the document is completely Chris@0: * read. Errors are emitted into the event handler, but Chris@0: * the parser will attempt to continue parsing until the Chris@0: * entire input stream is read. Chris@0: */ Chris@0: public function parse() Chris@0: { Chris@0: do { Chris@0: $this->consumeData(); Chris@0: // FIXME: Add infinite loop protection. Chris@0: } while ($this->carryOn); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Set the text mode for the character data reader. Chris@0: * Chris@0: * HTML5 defines three different modes for reading text: Chris@0: * - Normal: Read until a tag is encountered. Chris@0: * - RCDATA: Read until a tag is encountered, but skip a few otherwise- Chris@0: * special characters. Chris@0: * - Raw: Read until a special closing tag is encountered (viz. pre, script) Chris@0: * Chris@0: * This allows those modes to be set. Chris@0: * Chris@0: * Normally, setting is done by the event handler via a special return code on Chris@0: * startTag(), but it can also be set manually using this function. Chris@0: * Chris@17: * @param int $textmode One of Elements::TEXT_*. Chris@17: * @param string $untilTag The tag that should stop RAW or RCDATA mode. Normal mode does not Chris@17: * use this indicator. Chris@0: */ Chris@0: public function setTextMode($textmode, $untilTag = null) Chris@0: { Chris@0: $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA); Chris@0: $this->untilTag = $untilTag; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Consume a character and make a move. Chris@17: * HTML5 8.2.4.1. 
*/
    protected function consumeData()
    {
        $tok = $this->scanner->current();

        // Character reference: decode it into the text buffer, then re-read
        // the scanner to see what follows the reference.
        if ('&' === $tok) {
            $ref = $this->decodeCharacterReference();
            $this->buffer($ref);

            $tok = $this->scanner->current();
        }

        // Parse tag
        if ('<' === $tok) {
            // Any buffered text data can go out now.
            $this->flushBuffer();

            $tok = $this->scanner->next();

            if ('!' === $tok) {
                $this->markupDeclaration();
            } elseif ('/' === $tok) {
                $this->endTag();
            } elseif ('?' === $tok) {
                $this->processingInstruction();
            } elseif (is_string($tok) && ctype_alpha($tok)) {
                // The is_string() guard matters when the input ends right
                // after '<': this method treats a token of false as end of
                // input, and handing a non-string to ctype_alpha() is
                // deprecated since PHP 8.1.
                $this->tagName();
            } else {
                $this->parseError('Illegal tag opening');
                // TODO is this necessary ?
                $this->characterData();
            }

            $tok = $this->scanner->current();
        }

        if (false === $tok) {
            // Handle end of document
            $this->eof();
        } else {
            // Parse character
            switch ($this->textMode) {
                case Elements::TEXT_RAW:
                    $this->rawText($tok);
                    break;

                case Elements::TEXT_RCDATA:
                    $this->rcdata($tok);
                    break;

                default:
                    // '<' and '&' are not text here; they are picked up by
                    // the checks at the top of the next consumeData() call.
                    if ('<' === $tok || '&' === $tok) {
                        break;
                    }

                    // NULL character
                    if ("\00" === $tok) {
                        $this->parseError('Received null character.');

                        $this->text .= $tok;
                        $this->scanner->consume();

                        break;
                    }

                    // Take the whole run of plain characters in one go.
                    $this->text .= $this->scanner->charsUntil("<&\0");
            }
        }

        return $this->carryOn;
    }

    /**
     * Parse anything that looks like character data.
*
     * Different rules apply based on the current text mode.
     *
     * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
     *
     * @return bool False at end of input, or when the current character is
     *              '<' or '&' in normal mode; otherwise the result of the
     *              mode-specific reader.
     */
    protected function characterData()
    {
        $tok = $this->scanner->current();
        if (false === $tok) {
            return false;
        }

        switch ($this->textMode) {
            case Elements::TEXT_RAW:
                return $this->rawText($tok);
            case Elements::TEXT_RCDATA:
                return $this->rcdata($tok);
            default:
                // Markup and character references are not character data.
                if ('<' === $tok || '&' === $tok) {
                    return false;
                }

                return $this->text($tok);
        }
    }

    /**
     * This buffers the current token as character data.
     *
     * A parse error is reported first when the token is a NULL character;
     * the token is buffered and the scanner advanced either way.
     *
     * @param string $tok The current token.
     *
     * @return bool False if $tok is false, true once the token is buffered.
     */
    protected function text($tok)
    {
        // This should never happen...
        if (false === $tok) {
            return false;
        }

        // NULL character
        if ("\00" === $tok) {
            $this->parseError('Received null character.');
        }

        $this->buffer($tok);
        $this->scanner->consume();

        return true;
    }

    /**
     * Read text in RAW mode.
     *
     * Without an until-tag this buffers a single token, exactly like text().
     * With one, everything up to the matching closing tag is emitted as one
     * text event, the text mode is reset, and the closing tag is handed to
     * endTag().
     *
     * @param string $tok The current token.
     *
     * @return bool The result of text() or of endTag().
     */
    protected function rawText($tok)
    {
        if (is_null($this->untilTag)) {
            return $this->text($tok);
        }

        // The closing tag that terminates the raw text.
        // NOTE(review): the "'</' . $this->" part of this expression was
        // missing from the file (removed together with other markup) and has
        // been restored; confirm against the upstream source.
        $sequence = '</' . $this->untilTag . '>';
        $txt = $this->readUntilSequence($sequence);
        $this->events->text($txt);
        $this->setTextMode(0);

        return $this->endTag();
    }

    /**
     * Read text in RCDATA mode.
     *
     * @param string $tok The current token.
Chris@17: * Chris@17: * @return bool Chris@0: */ Chris@17: protected function rcdata($tok) Chris@0: { Chris@0: if (is_null($this->untilTag)) { Chris@17: return $this->text($tok); Chris@0: } Chris@17: Chris@0: $sequence = 'untilTag; Chris@0: $txt = ''; Chris@0: Chris@0: $caseSensitive = !Elements::isHtml5Element($this->untilTag); Chris@17: while (false !== $tok && !('<' == $tok && ($this->scanner->sequenceMatches($sequence, $caseSensitive)))) { Chris@17: if ('&' == $tok) { Chris@0: $txt .= $this->decodeCharacterReference(); Chris@0: $tok = $this->scanner->current(); Chris@0: } else { Chris@0: $txt .= $tok; Chris@0: $tok = $this->scanner->next(); Chris@0: } Chris@0: } Chris@0: $len = strlen($sequence); Chris@0: $this->scanner->consume($len); Chris@17: $len += $this->scanner->whitespace(); Chris@17: if ('>' !== $this->scanner->current()) { Chris@17: $this->parseError('Unclosed RCDATA end tag'); Chris@0: } Chris@17: Chris@0: $this->scanner->unconsume($len); Chris@0: $this->events->text($txt); Chris@0: $this->setTextMode(0); Chris@17: Chris@0: return $this->endTag(); Chris@0: } Chris@0: Chris@0: /** Chris@0: * If the document is read, emit an EOF event. Chris@0: */ Chris@0: protected function eof() Chris@0: { Chris@17: // fprintf(STDOUT, "EOF"); Chris@0: $this->flushBuffer(); Chris@17: $this->events->eof(); Chris@17: $this->carryOn = false; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Look for markup. Chris@0: */ Chris@0: protected function markupDeclaration() Chris@0: { Chris@0: $tok = $this->scanner->next(); Chris@0: Chris@0: // Comment: Chris@17: if ('-' == $tok && '-' == $this->scanner->peek()) { Chris@17: $this->scanner->consume(2); Chris@17: Chris@0: return $this->comment(); Chris@17: } elseif ('D' == $tok || 'd' == $tok) { // Doctype Chris@0: return $this->doctype(); Chris@17: } elseif ('[' == $tok) { // CDATA section Chris@0: return $this->cdataSection(); Chris@0: } Chris@0: Chris@0: // FINISH Chris@17: $this->parseError('Expected . 
Emit an empty comment because 8.2.4.46 says to. Chris@17: if ('>' == $tok) { Chris@0: // Parse error. Emit the comment token. Chris@0: $this->parseError("Expected comment data, got '>'"); Chris@0: $this->events->comment(''); Chris@17: $this->scanner->consume(); Chris@17: Chris@0: return true; Chris@0: } Chris@0: Chris@0: // Replace NULL with the replacement char. Chris@17: if ("\0" == $tok) { Chris@0: $tok = UTF8Utils::FFFD; Chris@0: } Chris@17: while (!$this->isCommentEnd()) { Chris@0: $comment .= $tok; Chris@0: $tok = $this->scanner->next(); Chris@0: } Chris@0: Chris@0: $this->events->comment($comment); Chris@17: $this->scanner->consume(); Chris@17: Chris@0: return true; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Check if the scanner has reached the end of a comment. Chris@17: * Chris@17: * @return bool Chris@0: */ Chris@0: protected function isCommentEnd() Chris@0: { Chris@0: $tok = $this->scanner->current(); Chris@0: Chris@0: // EOF Chris@17: if (false === $tok) { Chris@0: // Hit the end. Chris@17: $this->parseError('Unexpected EOF in a comment.'); Chris@17: Chris@0: return true; Chris@0: } Chris@0: Chris@0: // If it doesn't start with -, not the end. Chris@17: if ('-' != $tok) { Chris@0: return false; Chris@0: } Chris@0: Chris@0: // Advance one, and test for '->' Chris@17: if ('-' == $this->scanner->next() && '>' == $this->scanner->peek()) { Chris@17: $this->scanner->consume(); // Consume the last '>' Chris@0: return true; Chris@0: } Chris@0: // Unread '-'; Chris@0: $this->scanner->unconsume(1); Chris@17: Chris@0: return false; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Parse a DOCTYPE. Chris@0: * Chris@0: * Parse a DOCTYPE declaration. This method has strong bearing on whether or Chris@0: * not Quirksmode is enabled on the event handler. Chris@0: * Chris@0: * @todo This method is a little long. Should probably refactor. Chris@17: * Chris@17: * @return bool Chris@0: */ Chris@0: protected function doctype() Chris@0: { Chris@0: // Check that string is DOCTYPE. 
Chris@17: if ($this->scanner->sequenceMatches('DOCTYPE', false)) { Chris@17: $this->scanner->consume(7); Chris@17: } else { Chris@17: $chars = $this->scanner->charsWhile('DOCTYPEdoctype'); Chris@0: $this->parseError('Expected DOCTYPE, got %s', $chars); Chris@17: Chris@0: return $this->bogusComment('scanner->whitespace(); Chris@0: $tok = $this->scanner->current(); Chris@0: Chris@0: // EOF: die. Chris@17: if (false === $tok) { Chris@0: $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true); Chris@17: $this->eof(); Chris@17: Chris@17: return true; Chris@0: } Chris@0: Chris@0: // NULL char: convert. Chris@17: if ("\0" === $tok) { Chris@17: $this->parseError('Unexpected null character in DOCTYPE.'); Chris@0: } Chris@0: Chris@0: $stop = " \n\f>"; Chris@0: $doctypeName = $this->scanner->charsUntil($stop); Chris@0: // Lowercase ASCII, replace \0 with FFFD Chris@0: $doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD)); Chris@0: Chris@0: $tok = $this->scanner->current(); Chris@0: Chris@0: // If false, emit a parse error, DOCTYPE, and return. Chris@17: if (false === $tok) { Chris@0: $this->parseError('Unexpected EOF in DOCTYPE declaration.'); Chris@0: $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true); Chris@17: Chris@0: return true; Chris@0: } Chris@0: Chris@0: // Short DOCTYPE, like Chris@17: if ('>' == $tok) { Chris@0: // DOCTYPE without a name. Chris@17: if (0 == strlen($doctypeName)) { Chris@17: $this->parseError('Expected a DOCTYPE name. 
Got nothing.'); Chris@0: $this->events->doctype($doctypeName, 0, null, true); Chris@17: $this->scanner->consume(); Chris@17: Chris@0: return true; Chris@0: } Chris@0: $this->events->doctype($doctypeName); Chris@17: $this->scanner->consume(); Chris@17: Chris@0: return true; Chris@0: } Chris@0: $this->scanner->whitespace(); Chris@0: Chris@0: $pub = strtoupper($this->scanner->getAsciiAlpha()); Chris@17: $white = $this->scanner->whitespace(); Chris@0: Chris@0: // Get ID, and flag it as pub or system. Chris@17: if (('PUBLIC' == $pub || 'SYSTEM' == $pub) && $white > 0) { Chris@0: // Get the sys ID. Chris@17: $type = 'PUBLIC' == $pub ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM; Chris@0: $id = $this->quotedString("\0>"); Chris@17: if (false === $id) { Chris@0: $this->events->doctype($doctypeName, $type, $pub, false); Chris@17: Chris@17: return true; Chris@0: } Chris@0: Chris@0: // Premature EOF. Chris@17: if (false === $this->scanner->current()) { Chris@17: $this->parseError('Unexpected EOF in DOCTYPE'); Chris@0: $this->events->doctype($doctypeName, $type, $id, true); Chris@17: Chris@0: return true; Chris@0: } Chris@0: Chris@0: // Well-formed complete DOCTYPE. Chris@0: $this->scanner->whitespace(); Chris@17: if ('>' == $this->scanner->current()) { Chris@0: $this->events->doctype($doctypeName, $type, $id, false); Chris@17: $this->scanner->consume(); Chris@17: Chris@0: return true; Chris@0: } Chris@0: Chris@0: // If we get here, we have scanner->charsUntil('>'); Chris@17: $this->parseError('Malformed DOCTYPE.'); Chris@0: $this->events->doctype($doctypeName, $type, $id, true); Chris@17: $this->scanner->consume(); Chris@17: Chris@0: return true; Chris@0: } Chris@0: Chris@0: // Else it's a bogus DOCTYPE. Chris@0: // Consume to > and trash. Chris@0: $this->scanner->charsUntil('>'); Chris@0: Chris@17: $this->parseError('Expected PUBLIC or SYSTEM. 
Got %s.', $pub); Chris@0: $this->events->doctype($doctypeName, 0, null, true); Chris@17: $this->scanner->consume(); Chris@17: Chris@0: return true; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Utility for reading a quoted string. Chris@0: * Chris@17: * @param string $stopchars Characters (in addition to a close-quote) that should stop the string. Chris@17: * E.g. sometimes '>' is higher precedence than '"' or "'". Chris@17: * Chris@17: * @return mixed String if one is found (quotations omitted). Chris@0: */ Chris@0: protected function quotedString($stopchars) Chris@0: { Chris@0: $tok = $this->scanner->current(); Chris@17: if ('"' == $tok || "'" == $tok) { Chris@17: $this->scanner->consume(); Chris@0: $ret = $this->scanner->charsUntil($tok . $stopchars); Chris@0: if ($this->scanner->current() == $tok) { Chris@17: $this->scanner->consume(); Chris@0: } else { Chris@0: // Parse error because no close quote. Chris@17: $this->parseError('Expected %s, got %s', $tok, $this->scanner->current()); Chris@0: } Chris@17: Chris@0: return $ret; Chris@0: } Chris@17: Chris@0: return false; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Handle a CDATA section. 
Chris@17: * Chris@17: * @return bool Chris@0: */ Chris@0: protected function cdataSection() Chris@0: { Chris@0: $cdata = ''; Chris@17: $this->scanner->consume(); Chris@0: Chris@0: $chars = $this->scanner->charsWhile('CDAT'); Chris@17: if ('CDATA' != $chars || '[' != $this->scanner->current()) { Chris@0: $this->parseError('Expected [CDATA[, got %s', $chars); Chris@17: Chris@0: return $this->bogusComment('scanner->next(); Chris@0: do { Chris@17: if (false === $tok) { Chris@0: $this->parseError('Unexpected EOF inside CDATA.'); Chris@0: $this->bogusComment('scanner->next(); Chris@17: } while (!$this->scanner->sequenceMatches(']]>')); Chris@0: Chris@0: // Consume ]]> Chris@0: $this->scanner->consume(3); Chris@0: Chris@0: $this->events->cdata($cdata); Chris@17: Chris@0: return true; Chris@0: } Chris@0: Chris@0: // ================================================================ Chris@0: // Non-HTML5 Chris@0: // ================================================================ Chris@17: Chris@0: /** Chris@0: * Handle a processing instruction. Chris@0: * Chris@0: * XML processing instructions are supposed to be ignored in HTML5, Chris@0: * treated as "bogus comments". However, since we're not a user Chris@0: * agent, we allow them. We consume until ?> and then issue a Chris@0: * EventListener::processingInstruction() event. Chris@17: * Chris@17: * @return bool Chris@0: */ Chris@0: protected function processingInstruction() Chris@0: { Chris@17: if ('?' != $this->scanner->current()) { Chris@0: return false; Chris@0: } Chris@0: Chris@0: $tok = $this->scanner->next(); Chris@0: $procName = $this->scanner->getAsciiAlpha(); Chris@17: $white = $this->scanner->whitespace(); Chris@0: Chris@0: // If not a PI, send to bogusComment. Chris@17: if (0 == strlen($procName) || 0 == $white || false == $this->scanner->current()) { Chris@0: $this->parseError("Expected processing instruction name, got $tok"); Chris@0: $this->bogusComment('. Chris@17: while (!('?' 
== $this->scanner->current() && '>' == $this->scanner->peek())) { Chris@0: $data .= $this->scanner->current(); Chris@0: Chris@0: $tok = $this->scanner->next(); Chris@17: if (false === $tok) { Chris@17: $this->parseError('Unexpected EOF in processing instruction.'); Chris@0: $this->events->processingInstruction($procName, $data); Chris@17: Chris@0: return true; Chris@0: } Chris@0: } Chris@0: Chris@17: $this->scanner->consume(2); // Consume the closing tag Chris@0: $this->events->processingInstruction($procName, $data); Chris@17: Chris@0: return true; Chris@0: } Chris@0: Chris@0: // ================================================================ Chris@0: // UTILITY FUNCTIONS Chris@0: // ================================================================ Chris@0: Chris@0: /** Chris@0: * Read from the input stream until we get to the desired sequene Chris@0: * or hit the end of the input stream. Chris@17: * Chris@17: * @param string $sequence Chris@17: * Chris@17: * @return string Chris@0: */ Chris@0: protected function readUntilSequence($sequence) Chris@0: { Chris@0: $buffer = ''; Chris@0: Chris@0: // Optimization for reading larger blocks faster. Chris@0: $first = substr($sequence, 0, 1); Chris@17: while (false !== $this->scanner->current()) { Chris@0: $buffer .= $this->scanner->charsUntil($first); Chris@0: Chris@0: // Stop as soon as we hit the stopping condition. Chris@17: if ($this->scanner->sequenceMatches($sequence, false)) { Chris@0: return $buffer; Chris@0: } Chris@0: $buffer .= $this->scanner->current(); Chris@17: $this->scanner->consume(); Chris@0: } Chris@0: Chris@0: // If we get here, we hit the EOF. Chris@17: $this->parseError('Unexpected EOF during text read.'); Chris@17: Chris@0: return $buffer; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Check if upcomming chars match the given sequence. Chris@0: * Chris@0: * This will read the stream for the $sequence. If it's Chris@0: * found, this will return true. If not, return false. 
* Since this unconsumes any chars it reads, the caller
     * will still need to read the next sequence, even if
     * this returns true.
     *
     * Example: $this->scanner->sequenceMatches('</script>') will
     * see if the input stream is at the start of a
     * '</script>' string.
     *
     * This wrapper only raises a deprecation notice and forwards to the
     * scanner.
     *
     * @param string $sequence
     * @param bool   $caseSensitive
     *
     * @return bool
     */
    protected function sequenceMatches($sequence, $caseSensitive = true)
    {
        $notice = __METHOD__ . ' method is deprecated since version 2.4 and will be removed in 3.0. Use Scanner::sequenceMatches() instead.';
        @trigger_error($notice, E_USER_DEPRECATED);

        $matched = $this->scanner->sequenceMatches($sequence, $caseSensitive);

        return $matched;
    }

    /**
     * Emit the contents of the temporary text buffer as one TEXT event.
     *
     * Nothing is emitted while the buffer is empty. After a successful
     * EventHandler::text() call the buffer is reset, so consecutive PCDATA is
     * grouped into a single event instead of many small ones.
     */
    protected function flushBuffer()
    {
        if ('' !== $this->text) {
            $this->events->text($this->text);
            $this->text = '';
        }
    }

    /**
     * Append a string to the temporary text buffer.
     *
     * @see flushBuffer()
     *
     * @param string $str
     */
    protected function buffer($str)
    {
        $this->text = $this->text . $str;
    }

    /**
     * Emit a parse error.
     *
     * A parse error always returns false because it never consumes any
     * characters.
*
     * Any arguments passed after $msg are substituted into it with
     * vsprintf(). The message is reported to the event handler together with
     * the scanner's current line and column.
     *
     * @param string $msg The message, or a printf-style format when extra
     *                    arguments are supplied.
     *
     * @return bool Always false.
     */
    protected function parseError($msg)
    {
        $args = func_get_args();

        // Extra arguments turn $msg into a format string.
        if (count($args) > 1) {
            array_shift($args);
            $msg = vsprintf($msg, $args);
        }

        $line = $this->scanner->currentLine();
        $col = $this->scanner->columnOffset();
        $this->events->parseError($msg, $line, $col);

        return false;
    }

    /**
     * Decode a character reference and return the string.
     *
     * The scanner must be positioned on the '&'. When the following
     * characters do not form a usable reference, a bare '&' is returned and
     * the scanner is rewound as each branch below describes.
     *
     * If $inAttribute is set to true, a bare & will be returned as-is.
     *
     * @param bool $inAttribute Set to true if the text is inside of an attribute value.
     *                          false otherwise.
     *
     * @return string
     */
    protected function decodeCharacterReference($inAttribute = false)
    {
        // Next char after &.
        $tok = $this->scanner->next();
        $start = $this->scanner->position();

        if (false === $tok) {
            return '&';
        }

        // These indicate not an entity. We return just
        // the &.
        if ("\t" === $tok || "\n" === $tok || "\f" === $tok || ' ' === $tok || '&' === $tok || '<' === $tok) {
            return '&';
        }

        // Numeric entity
        if ('#' === $tok) {
            $tok = $this->scanner->next();

            // Hexadecimal encoding.
            // X[0-9a-fA-F]+;
            // x[0-9a-fA-F]+;
            if ('x' === $tok || 'X' === $tok) {
                $tok = $this->scanner->next(); // Consume x

                // Convert from hex code to char.
                $hex = $this->scanner->getHex();
                // Deliberately not empty(): empty('0') is true, which made
                // '&#x0;' fail here although '&#x00;' was accepted.
                // Assumes getHex() returns the consumed digits as a string.
                if (!is_string($hex) || '' === $hex) {
                    $this->parseError('Expected &#xHEX;, got &#x%s', $tok);
                    // We unconsume because we don't know what parser rules might
                    // be in effect for the remaining chars. For example. '&#>'
                    // might result in a specific parsing rule inside of tag
                    // contexts, while not inside of pcdata context.
                    $this->scanner->unconsume(2);

                    return '&';
                }
                $entity = CharacterReference::lookupHex($hex);
            } else {
                // Decimal encoding.
                // [0-9]+;

                // Convert from decimal to char.
                $numeric = $this->scanner->getNumeric();
                // NOTE(review): this only catches a false return. If
                // getNumeric() yields '' when no digits follow, this branch
                // never runs - confirm against Scanner before changing it.
                if (false === $numeric) {
                    $this->parseError('Expected &#DIGITS;, got &#%s', $tok);
                    $this->scanner->unconsume(2);

                    return '&';
                }
                $entity = CharacterReference::lookupDecimal($numeric);
            }
        } elseif ('=' === $tok && $inAttribute) {
            return '&';
        } else { // String entity.
            // Attempt to consume a string up to a ';'.
            // [a-zA-Z0-9]+;
            $cname = $this->scanner->getAsciiAlphaNum();
            $entity = CharacterReference::lookupName($cname);

            // When no entity is found provide the name of the unmatched string
            // and continue on as the & is not part of an entity. The & will
            // be converted to &amp; elsewhere.
            if (null === $entity) {
                if (!$inAttribute || '' === $cname) {
                    $this->parseError("No match in entity table for '%s'", $cname);
                }
                // Rewind to just after the '&'.
                $this->scanner->unconsume($this->scanner->position() - $start);

                return '&';
            }
        }

        // The scanner has advanced the cursor for us.
        $tok = $this->scanner->current();

        // We have an entity. We're done here.
        if (';' === $tok) {
            $this->scanner->consume();

            return $entity;
        }

        // If in an attribute, then failing to match ; means unconsume the
        // entire string. Otherwise, failure to match is an error.
        if ($inAttribute) {
            $this->scanner->unconsume($this->scanner->position() - $start);

            return '&';
        }

        $this->parseError('Expected &ENTITY;, got &ENTITY%s (no trailing ;) ', $tok);

        return '&' . $entity;
    }
}