// NOTE(review): everything before this point (opening tag, class header,
// property declarations and the constructor signature) is missing from this
// copy of the file; only the tail of the constructor body survives. The first
// assignment had lost its "$this->" prefix and is restored here to match the
// sibling assignments. Confirm against the upstream file before relying on it.
        $this->scanner = $scanner;
        $this->events = $eventHandler;
        $this->mode = $mode;
    }

    /**
     * Begin parsing.
     *
     * This will begin scanning the document, tokenizing as it goes.
     * Tokens are emitted into the event handler.
     *
     * Tokenizing will continue until the document is completely
     * read. Errors are emitted into the event handler, but
     * the parser will attempt to continue parsing until the
     * entire input stream is read.
     *
     * The loop runs consumeData() at least once and stops as soon as
     * $this->carryOn becomes false.
     */
    public function parse()
    {
        do {
            $this->consumeData();
            // FIXME: Add infinite loop protection.
        } while ($this->carryOn);
    }

    /**
     * Set the text mode for the character data reader.
     *
     * HTML5 defines three different modes for reading text:
     * - Normal: Read until a tag is encountered.
     * - RCDATA: Read until a tag is encountered, but skip a few otherwise-
     *   special characters.
     * - Raw: Read until a special closing tag is encountered (viz. pre, script)
     *
     * This allows those modes to be set.
     *
     * Normally, setting is done by the event handler via a special return code on
     * startTag(), but it can also be set manually using this function.
     *
     * Only the Elements::TEXT_RAW and Elements::TEXT_RCDATA bits of $textmode are
     * kept; any other bits are masked away.
     *
     * @param int $textmode
     *   One of Elements::TEXT_*
     * @param string $untilTag
     *   The tag that should stop RAW or RCDATA mode. Normal mode does not
     *   use this indicator.
     */
    public function setTextMode($textmode, $untilTag = null)
    {
        $this->textMode = $textmode & (Elements::TEXT_RAW | Elements::TEXT_RCDATA);
        $this->untilTag = $untilTag;
    }

    /**
     * Consume a character and make a move.
* HTML5 8.2.4.1
     *
     * Runs each of the four readers once, in a fixed order, and reports
     * whether tokenizing should continue.
     *
     * @return bool The current value of $this->carryOn.
     */
    protected function consumeData()
    {
        // Character Ref
        /*
         * $this->characterReference() || $this->tagOpen() || $this->eof() || $this->characterData();
         */
        // Unlike the short-circuit form sketched above, every reader is invoked
        // on each pass regardless of what the previous one returned.
        $this->characterReference();
        $this->tagOpen();
        $this->eof();
        $this->characterData();

        return $this->carryOn;
    }

    /**
     * Parse anything that looks like character data.
     *
     * Different rules apply based on the current text mode.
     *
     * @see Elements::TEXT_RAW Elements::TEXT_RCDATA.
     *
     * @return mixed False at EOF or when the current character is '<' or '&'
     *   in normal mode; otherwise whatever the mode-specific reader returns.
     */
    protected function characterData()
    {
        $tok = $this->scanner->current();
        if ($tok === false) {
            return false;
        }
        switch ($this->textMode) {
            case Elements::TEXT_RAW:
                return $this->rawText();
            case Elements::TEXT_RCDATA:
                return $this->rcdata();
            default:
                // '<' and '&' belong to the tag and character-reference readers.
                if (strspn($tok, "<&")) {
                    return false;
                }
                return $this->text();
        }
    }

    /**
     * This buffers the current token as character data.
     *
     * A null character raises a parse error but is still buffered.
     *
     * @return bool False at EOF, true once the character was buffered and
     *   the scanner advanced.
     */
    protected function text()
    {
        $tok = $this->scanner->current();

        // This should never happen...
        if ($tok === false) {
            return false;
        }
        // Null
        if ($tok === "\00") {
            $this->parseError("Received null character.");
        }
        // fprintf(STDOUT, "Writing '%s'", $tok);
        $this->buffer($tok);
        $this->scanner->next();
        return true;
    }

    /**
     * Read text in RAW mode.
     *
     * Without a terminating tag this degrades to text(). Otherwise everything
     * up to the closing tag is emitted as a single text event, the text mode
     * is reset, and the closing tag is handed to endTag().
     */
    protected function rawText()
    {
        if (is_null($this->untilTag)) {
            return $this->text();
        }
        // The full closing tag, e.g. "</script>", ends raw text.
        $sequence = '</' . $this->untilTag . '>';
        $txt = $this->readUntilSequence($sequence);
        $this->events->text($txt);
        $this->setTextMode(0);
        return $this->endTag();
    }

    /**
     * Read text in RCDATA mode.
     *
     * Like raw text, except that character references are decoded while
     * reading. The closing tag is matched case-sensitively unless
     * Elements::isHtml5Element() recognises the tag name.
     */
    protected function rcdata()
    {
        if (is_null($this->untilTag)) {
            return $this->text();
        }
        // Only "</name" is matched here; optional whitespace and the closing
        // '>' are checked separately below.
        $sequence = '</' . $this->untilTag;
        $txt = '';
        $tok = $this->scanner->current();

        $caseSensitive = !Elements::isHtml5Element($this->untilTag);
        while ($tok !== false && ! ($tok == '<' && ($this->sequenceMatches($sequence, $caseSensitive)))) {
            if ($tok == '&') {
                $txt .= $this->decodeCharacterReference();
                $tok = $this->scanner->current();
            } else {
                $txt .= $tok;
                $tok = $this->scanner->next();
            }
        }
        // Look ahead past "</name" and any whitespace to verify the '>',
        // then rewind so endTag() sees the whole closing tag.
        $len = strlen($sequence);
        $this->scanner->consume($len);
        $len += strlen($this->scanner->whitespace());
        if ($this->scanner->current() !== '>') {
            $this->parseError("Unclosed RCDATA end tag");
        }
        $this->scanner->unconsume($len);
        $this->events->text($txt);
        $this->setTextMode(0);
        return $this->endTag();
    }

    /**
     * If the document is read, emit an EOF event.
     *
     * At EOF this flushes buffered text, emits the eof event and clears
     * $this->carryOn so that parse() stops.
     *
     * @return bool True when EOF was reached, false otherwise.
     */
    protected function eof()
    {
        if ($this->scanner->current() === false) {
            // fprintf(STDOUT, "EOF");
            $this->flushBuffer();
            $this->events->eof();
            $this->carryOn = false;
            return true;
        }
        return false;
    }

    /**
     * Handle character references (aka entities).
     *
     * This version is specific to PCDATA, as it buffers data into the
     * text buffer. For a generic version, see decodeCharacterReference().
Chris@0: * Chris@0: * HTML5 8.2.4.2 Chris@0: */ Chris@0: protected function characterReference() Chris@0: { Chris@0: $ref = $this->decodeCharacterReference(); Chris@0: if ($ref !== false) { Chris@0: $this->buffer($ref); Chris@0: return true; Chris@0: } Chris@0: return false; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Emit a tagStart event on encountering a tag. Chris@0: * Chris@0: * 8.2.4.8 Chris@0: */ Chris@0: protected function tagOpen() Chris@0: { Chris@0: if ($this->scanner->current() != '<') { Chris@0: return false; Chris@0: } Chris@0: Chris@0: // Any buffered text data can go out now. Chris@0: $this->flushBuffer(); Chris@0: Chris@0: $this->scanner->next(); Chris@0: Chris@0: return $this->markupDeclaration() || $this->endTag() || $this->processingInstruction() || $this->tagName() || Chris@0: /* This always returns false. */ Chris@0: $this->parseError("Illegal tag opening") || $this->characterData(); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Look for markup. Chris@0: */ Chris@0: protected function markupDeclaration() Chris@0: { Chris@0: if ($this->scanner->current() != '!') { Chris@0: return false; Chris@0: } Chris@0: Chris@0: $tok = $this->scanner->next(); Chris@0: Chris@0: // Comment: Chris@0: if ($tok == '-' && $this->scanner->peek() == '-') { Chris@0: $this->scanner->next(); // Consume the other '-' Chris@0: $this->scanner->next(); // Next char. Chris@0: return $this->comment(); Chris@0: } Chris@0: Chris@0: elseif ($tok == 'D' || $tok == 'd') { // Doctype Chris@0: return $this->doctype(); Chris@0: } Chris@0: Chris@0: elseif ($tok == '[') { // CDATA section Chris@0: return $this->cdataSection(); Chris@0: } Chris@0: Chris@0: // FINISH Chris@0: $this->parseError("Expected . Emit an empty comment because 8.2.4.46 says to. Chris@0: if ($tok == '>') { Chris@0: // Parse error. Emit the comment token. 
Chris@0: $this->parseError("Expected comment data, got '>'"); Chris@0: $this->events->comment(''); Chris@0: $this->scanner->next(); Chris@0: return true; Chris@0: } Chris@0: Chris@0: // Replace NULL with the replacement char. Chris@0: if ($tok == "\0") { Chris@0: $tok = UTF8Utils::FFFD; Chris@0: } Chris@0: while (! $this->isCommentEnd()) { Chris@0: $comment .= $tok; Chris@0: $tok = $this->scanner->next(); Chris@0: } Chris@0: Chris@0: $this->events->comment($comment); Chris@0: $this->scanner->next(); Chris@0: return true; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Check if the scanner has reached the end of a comment. Chris@0: */ Chris@0: protected function isCommentEnd() Chris@0: { Chris@0: $tok = $this->scanner->current(); Chris@0: Chris@0: // EOF Chris@0: if ($tok === false) { Chris@0: // Hit the end. Chris@0: $this->parseError("Unexpected EOF in a comment."); Chris@0: return true; Chris@0: } Chris@0: Chris@0: // If it doesn't start with -, not the end. Chris@0: if ($tok != '-') { Chris@0: return false; Chris@0: } Chris@0: Chris@0: // Advance one, and test for '->' Chris@0: if ($this->scanner->next() == '-' && $this->scanner->peek() == '>') { Chris@0: $this->scanner->next(); // Consume the last '>' Chris@0: return true; Chris@0: } Chris@0: // Unread '-'; Chris@0: $this->scanner->unconsume(1); Chris@0: return false; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Parse a DOCTYPE. Chris@0: * Chris@0: * Parse a DOCTYPE declaration. This method has strong bearing on whether or Chris@0: * not Quirksmode is enabled on the event handler. Chris@0: * Chris@0: * @todo This method is a little long. Should probably refactor. Chris@0: */ Chris@0: protected function doctype() Chris@0: { Chris@0: if (strcasecmp($this->scanner->current(), 'D')) { Chris@0: return false; Chris@0: } Chris@0: // Check that string is DOCTYPE. 
Chris@0: $chars = $this->scanner->charsWhile("DOCTYPEdoctype"); Chris@0: if (strcasecmp($chars, 'DOCTYPE')) { Chris@0: $this->parseError('Expected DOCTYPE, got %s', $chars); Chris@0: return $this->bogusComment('scanner->whitespace(); Chris@0: $tok = $this->scanner->current(); Chris@0: Chris@0: // EOF: die. Chris@0: if ($tok === false) { Chris@0: $this->events->doctype('html5', EventHandler::DOCTYPE_NONE, '', true); Chris@0: return $this->eof(); Chris@0: } Chris@0: Chris@0: $doctypeName = ''; Chris@0: Chris@0: // NULL char: convert. Chris@0: if ($tok === "\0") { Chris@0: $this->parseError("Unexpected null character in DOCTYPE."); Chris@0: $doctypeName .= UTF8::FFFD; Chris@0: $tok = $this->scanner->next(); Chris@0: } Chris@0: Chris@0: $stop = " \n\f>"; Chris@0: $doctypeName = $this->scanner->charsUntil($stop); Chris@0: // Lowercase ASCII, replace \0 with FFFD Chris@0: $doctypeName = strtolower(strtr($doctypeName, "\0", UTF8Utils::FFFD)); Chris@0: Chris@0: $tok = $this->scanner->current(); Chris@0: Chris@0: // If false, emit a parse error, DOCTYPE, and return. Chris@0: if ($tok === false) { Chris@0: $this->parseError('Unexpected EOF in DOCTYPE declaration.'); Chris@0: $this->events->doctype($doctypeName, EventHandler::DOCTYPE_NONE, null, true); Chris@0: return true; Chris@0: } Chris@0: Chris@0: // Short DOCTYPE, like Chris@0: if ($tok == '>') { Chris@0: // DOCTYPE without a name. Chris@0: if (strlen($doctypeName) == 0) { Chris@0: $this->parseError("Expected a DOCTYPE name. Got nothing."); Chris@0: $this->events->doctype($doctypeName, 0, null, true); Chris@0: $this->scanner->next(); Chris@0: return true; Chris@0: } Chris@0: $this->events->doctype($doctypeName); Chris@0: $this->scanner->next(); Chris@0: return true; Chris@0: } Chris@0: $this->scanner->whitespace(); Chris@0: Chris@0: $pub = strtoupper($this->scanner->getAsciiAlpha()); Chris@0: $white = strlen($this->scanner->whitespace()); Chris@0: Chris@0: // Get ID, and flag it as pub or system. 
Chris@0: if (($pub == 'PUBLIC' || $pub == 'SYSTEM') && $white > 0) { Chris@0: // Get the sys ID. Chris@0: $type = $pub == 'PUBLIC' ? EventHandler::DOCTYPE_PUBLIC : EventHandler::DOCTYPE_SYSTEM; Chris@0: $id = $this->quotedString("\0>"); Chris@0: if ($id === false) { Chris@0: $this->events->doctype($doctypeName, $type, $pub, false); Chris@0: return false; Chris@0: } Chris@0: Chris@0: // Premature EOF. Chris@0: if ($this->scanner->current() === false) { Chris@0: $this->parseError("Unexpected EOF in DOCTYPE"); Chris@0: $this->events->doctype($doctypeName, $type, $id, true); Chris@0: return true; Chris@0: } Chris@0: Chris@0: // Well-formed complete DOCTYPE. Chris@0: $this->scanner->whitespace(); Chris@0: if ($this->scanner->current() == '>') { Chris@0: $this->events->doctype($doctypeName, $type, $id, false); Chris@0: $this->scanner->next(); Chris@0: return true; Chris@0: } Chris@0: Chris@0: // If we get here, we have scanner->charsUntil(">"); Chris@0: $this->parseError("Malformed DOCTYPE."); Chris@0: $this->events->doctype($doctypeName, $type, $id, true); Chris@0: $this->scanner->next(); Chris@0: return true; Chris@0: } Chris@0: Chris@0: // Else it's a bogus DOCTYPE. Chris@0: // Consume to > and trash. Chris@0: $this->scanner->charsUntil('>'); Chris@0: Chris@0: $this->parseError("Expected PUBLIC or SYSTEM. Got %s.", $pub); Chris@0: $this->events->doctype($doctypeName, 0, null, true); Chris@0: $this->scanner->next(); Chris@0: return true; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Utility for reading a quoted string. Chris@0: * Chris@0: * @param string $stopchars Chris@0: * Characters (in addition to a close-quote) that should stop the string. Chris@0: * E.g. sometimes '>' is higher precedence than '"' or "'". 
Chris@0: * @return mixed String if one is found (quotations omitted) Chris@0: */ Chris@0: protected function quotedString($stopchars) Chris@0: { Chris@0: $tok = $this->scanner->current(); Chris@0: if ($tok == '"' || $tok == "'") { Chris@0: $this->scanner->next(); Chris@0: $ret = $this->scanner->charsUntil($tok . $stopchars); Chris@0: if ($this->scanner->current() == $tok) { Chris@0: $this->scanner->next(); Chris@0: } else { Chris@0: // Parse error because no close quote. Chris@0: $this->parseError("Expected %s, got %s", $tok, $this->scanner->current()); Chris@0: } Chris@0: return $ret; Chris@0: } Chris@0: return false; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Handle a CDATA section. Chris@0: */ Chris@0: protected function cdataSection() Chris@0: { Chris@0: if ($this->scanner->current() != '[') { Chris@0: return false; Chris@0: } Chris@0: $cdata = ''; Chris@0: $this->scanner->next(); Chris@0: Chris@0: $chars = $this->scanner->charsWhile('CDAT'); Chris@0: if ($chars != 'CDATA' || $this->scanner->current() != '[') { Chris@0: $this->parseError('Expected [CDATA[, got %s', $chars); Chris@0: return $this->bogusComment('scanner->next(); Chris@0: do { Chris@0: if ($tok === false) { Chris@0: $this->parseError('Unexpected EOF inside CDATA.'); Chris@0: $this->bogusComment('scanner->next(); Chris@0: } while (! $this->sequenceMatches(']]>')); Chris@0: Chris@0: // Consume ]]> Chris@0: $this->scanner->consume(3); Chris@0: Chris@0: $this->events->cdata($cdata); Chris@0: return true; Chris@0: } Chris@0: Chris@0: // ================================================================ Chris@0: // Non-HTML5 Chris@0: // ================================================================ Chris@0: /** Chris@0: * Handle a processing instruction. Chris@0: * Chris@0: * XML processing instructions are supposed to be ignored in HTML5, Chris@0: * treated as "bogus comments". However, since we're not a user Chris@0: * agent, we allow them. 
We consume until ?> and then issue a Chris@0: * EventListener::processingInstruction() event. Chris@0: */ Chris@0: protected function processingInstruction() Chris@0: { Chris@0: if ($this->scanner->current() != '?') { Chris@0: return false; Chris@0: } Chris@0: Chris@0: $tok = $this->scanner->next(); Chris@0: $procName = $this->scanner->getAsciiAlpha(); Chris@0: $white = strlen($this->scanner->whitespace()); Chris@0: Chris@0: // If not a PI, send to bogusComment. Chris@0: if (strlen($procName) == 0 || $white == 0 || $this->scanner->current() == false) { Chris@0: $this->parseError("Expected processing instruction name, got $tok"); Chris@0: $this->bogusComment('. Chris@0: while (! ($this->scanner->current() == '?' && $this->scanner->peek() == '>')) { Chris@0: $data .= $this->scanner->current(); Chris@0: Chris@0: $tok = $this->scanner->next(); Chris@0: if ($tok === false) { Chris@0: $this->parseError("Unexpected EOF in processing instruction."); Chris@0: $this->events->processingInstruction($procName, $data); Chris@0: return true; Chris@0: } Chris@0: } Chris@0: Chris@0: $this->scanner->next(); // > Chris@0: $this->scanner->next(); // Next token. Chris@0: $this->events->processingInstruction($procName, $data); Chris@0: return true; Chris@0: } Chris@0: Chris@0: // ================================================================ Chris@0: // UTILITY FUNCTIONS Chris@0: // ================================================================ Chris@0: Chris@0: /** Chris@0: * Read from the input stream until we get to the desired sequene Chris@0: * or hit the end of the input stream. Chris@0: */ Chris@0: protected function readUntilSequence($sequence) Chris@0: { Chris@0: $buffer = ''; Chris@0: Chris@0: // Optimization for reading larger blocks faster. Chris@0: $first = substr($sequence, 0, 1); Chris@0: while ($this->scanner->current() !== false) { Chris@0: $buffer .= $this->scanner->charsUntil($first); Chris@0: Chris@0: // Stop as soon as we hit the stopping condition. 
Chris@0: if ($this->sequenceMatches($sequence, false)) { Chris@0: return $buffer; Chris@0: } Chris@0: $buffer .= $this->scanner->current(); Chris@0: $this->scanner->next(); Chris@0: } Chris@0: Chris@0: // If we get here, we hit the EOF. Chris@0: $this->parseError("Unexpected EOF during text read."); Chris@0: return $buffer; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Check if upcomming chars match the given sequence. Chris@0: * Chris@0: * This will read the stream for the $sequence. If it's Chris@0: * found, this will return true. If not, return false. Chris@0: * Since this unconsumes any chars it reads, the caller Chris@0: * will still need to read the next sequence, even if Chris@0: * this returns true. Chris@0: * Chris@0: * Example: $this->sequenceMatches('') will Chris@0: * see if the input stream is at the start of a Chris@0: * '' string. Chris@0: */ Chris@0: protected function sequenceMatches($sequence, $caseSensitive = true) Chris@0: { Chris@0: $len = strlen($sequence); Chris@0: $buffer = ''; Chris@0: for ($i = 0; $i < $len; ++ $i) { Chris@0: $tok = $this->scanner->current(); Chris@0: $buffer .= $tok; Chris@0: Chris@0: // EOF. Rewind and let the caller handle it. Chris@0: if ($tok === false) { Chris@0: $this->scanner->unconsume($i); Chris@0: return false; Chris@0: } Chris@0: $this->scanner->next(); Chris@0: } Chris@0: Chris@0: $this->scanner->unconsume($len); Chris@0: return $caseSensitive ? $buffer == $sequence : strcasecmp($buffer, $sequence) === 0; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Send a TEXT event with the contents of the text buffer. Chris@0: * Chris@0: * This emits an EventHandler::text() event with the current contents of the Chris@0: * temporary text buffer. (The buffer is used to group as much PCDATA Chris@0: * as we can instead of emitting lots and lots of TEXT events.) 
*/
    protected function flushBuffer()
    {
        // Nothing buffered: do not emit an empty text event.
        if ($this->text === '') {
            return;
        }
        $this->events->text($this->text);
        $this->text = '';
    }

    /**
     * Add text to the temporary buffer.
     *
     * @see flushBuffer()
     *
     * @param string $str
     *   Text to append to the buffer.
     */
    protected function buffer($str)
    {
        $this->text .= $str;
    }

    /**
     * Emit a parse error.
     *
     * A parse error always returns false because it never consumes any
     * characters.
     *
     * Any arguments after $msg are substituted into it with vsprintf(). The
     * scanner's current line and column are reported with the message.
     *
     * @param string $msg
     *   The message, optionally a printf-style format string.
     *
     * @return bool Always false.
     */
    protected function parseError($msg)
    {
        $args = func_get_args();

        if (count($args) > 1) {
            // Drop $msg itself; the remainder are the format arguments.
            array_shift($args);
            $msg = vsprintf($msg, $args);
        }

        $line = $this->scanner->currentLine();
        $col = $this->scanner->columnOffset();
        $this->events->parseError($msg, $line, $col);
        return false;
    }

    /**
     * Decode a character reference and return the string.
     *
     * Returns false only when the scanner is not positioned on an '&'. When
     * an '&' is present but no entity can be decoded, a bare '&' is returned.
     * If $inAttribute is set to true, an unterminated or unknown reference is
     * unconsumed and the bare '&' is returned as-is.
     *
     * @param bool $inAttribute
     *   Set to true if the text is inside of an attribute value.
     *   false otherwise.
     *
     * @return string|false The decoded text, '&', or false.
     */
    protected function decodeCharacterReference($inAttribute = false)
    {
        // If it fails this, it's definitely not an entity.
        if ($this->scanner->current() != '&') {
            return false;
        }

        // Next char after &.
        $tok = $this->scanner->next();
        $entity = '';
        $start = $this->scanner->position();

        if ($tok == false) {
            return '&';
        }

        // These indicate not an entity. We return just
        // the &.
        if (strspn($tok, static::WHITE . "&<") == 1) {
            // $this->scanner->next();
            return '&';
        }

        // Numeric entity
        if ($tok == '#') {
            $tok = $this->scanner->next();

            // Hexidecimal encoding.
            // &#X[0-9a-fA-F]+;
            // &#x[0-9a-fA-F]+;
            if ($tok == 'x' || $tok == 'X') {
                $tok = $this->scanner->next(); // Consume x

                // Convert from hex code to char.
                $hex = $this->scanner->getHex();
                // Explicit checks rather than empty(): empty("0") is true, which
                // would wrongly reject the single hex digit 0 after it had
                // already been consumed from the stream.
                if ($hex === false || $hex === null || $hex === '') {
                    $this->parseError("Expected &#xHEX;, got &#x%s", $tok);
                    // We unconsume because we don't know what parser rules might
                    // be in effect for the remaining chars. For example. '&#>'
                    // might result in a specific parsing rule inside of tag
                    // contexts, while not inside of pcdata context.
                    $this->scanner->unconsume(2);
                    return '&';
                }
                $entity = CharacterReference::lookupHex($hex);
            } // Decimal encoding.
            // &#[0-9]+;
            else {
                // Convert from decimal to char.
                $numeric = $this->scanner->getNumeric();
                if ($numeric === false) {
                    $this->parseError("Expected &#DIGITS;, got &#%s", $tok);
                    $this->scanner->unconsume(2);
                    return '&';
                }
                $entity = CharacterReference::lookupDecimal($numeric);
            }
        } elseif ($tok === '=' && $inAttribute) {
            return '&';
        } else { // String entity.

            // Attempt to consume a string up to a ';'.
            // &[a-zA-Z0-9]+;
            $cname = $this->scanner->getAsciiAlphaNum();
            $entity = CharacterReference::lookupName($cname);

            // When no entity is found provide the name of the unmatched string
            // and continue on as the & is not part of an entity. The & will
            // be converted to &amp; elsewhere.
            if ($entity == null) {
                if (!$inAttribute || strlen($cname) === 0) {
                    $this->parseError("No match in entity table for '%s'", $cname);
                }
                // Rewind to just after the '&'.
                $this->scanner->unconsume($this->scanner->position() - $start);
                return '&';
            }
        }

        // The scanner has advanced the cursor for us.
        $tok = $this->scanner->current();

        // We have an entity. We're done here.
        if ($tok == ';') {
            $this->scanner->next();
            return $entity;
        }

        // If in an attribute, then failing to match ; means unconsume the
        // entire string. Otherwise, failure to match is an error.
        if ($inAttribute) {
            $this->scanner->unconsume($this->scanner->position() - $start);
            return '&';
        }

        $this->parseError("Expected &ENTITY;, got &ENTITY%s (no trailing ;) ", $tok);
        return '&' . $entity;
    }
}