isophonics-drupal-site: vendor/masterminds/html5/src/HTML5/Parser/Scanner.php annotate

annotate vendor/masterminds/html5/src/HTML5/Parser/Scanner.php @ 19:fa3358dc1485 tip

Add ndrum files

author	Chris Cannam
date	Wed, 28 Aug 2019 13:14:47 +0100
parents	129ea1e6d783
children

rev	line source
Chris@0	1 <?php
Chris@17	2
Chris@0	3 namespace Masterminds\HTML5\Parser;
Chris@0	4
Chris@17	5 use Masterminds\HTML5\Exception;
Chris@17	6
Chris@0	7 /**
Chris@17	8 * The scanner scans over a given data input to react appropriately to characters.
Chris@0	9 */
Chris@0	10 class Scanner
Chris@0	11 {
Chris@0	12 const CHARS_HEX = 'abcdefABCDEF01234567890';
Chris@0	13 const CHARS_ALNUM = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890';
Chris@0	14 const CHARS_ALPHA = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
Chris@0	15
Chris@17	16 /**
Chris@17	17 * The string data we're parsing.
Chris@17	18 */
Chris@17	19 private $data;
Chris@0	20
Chris@17	21 /**
Chris@17	22 * The current integer byte position we are in $data.
Chris@17	23 */
Chris@17	24 private $char;
Chris@17	25
Chris@17	26 /**
Chris@17	27 * Length of $data; when $char === $data, we are at the end-of-file.
Chris@17	28 */
Chris@17	29 private $EOF;
Chris@17	30
Chris@17	31 /**
Chris@17	32 * Parse errors.
Chris@17	33 */
Chris@17	34 public $errors = array();
Chris@0	35
Chris@0	36 /**
Chris@0	37 * Create a new Scanner.
Chris@0	38 *
Chris@17	39 * @param string $data Data to parse.
Chris@17	40 * @param string $encoding The encoding to use for the data.
Chris@17	41 *
Chris@17	42 * @throws Exception If the given data cannot be encoded to UTF-8.
Chris@0	43 */
Chris@17	44 public function __construct($data, $encoding = 'UTF-8')
Chris@0	45 {
Chris@17	46 if ($data instanceof InputStream) {
Chris@17	47 @trigger_error('InputStream objects are deprecated since version 2.4 and will be removed in 3.0. Use strings instead.', E_USER_DEPRECATED);
Chris@17	48 $data = (string) $data;
Chris@17	49 }
Chris@17	50
Chris@17	51 $data = UTF8Utils::convertToUTF8($data, $encoding);
Chris@17	52
Chris@17	53 // There is good reason to question whether it makes sense to
Chris@17	54 // do this here, since most of these checks are done during
Chris@17	55 // parsing, and since this check doesn't actually do anything.
Chris@17	56 $this->errors = UTF8Utils::checkForIllegalCodepoints($data);
Chris@17	57
Chris@17	58 $data = $this->replaceLinefeeds($data);
Chris@17	59
Chris@17	60 $this->data = $data;
Chris@17	61 $this->char = 0;
Chris@17	62 $this->EOF = strlen($data);
Chris@17	63 }
Chris@17	64
Chris@17	65 /**
Chris@17	66 * Check if upcomming chars match the given sequence.
Chris@17	67 *
Chris@17	68 * This will read the stream for the $sequence. If it's
Chris@17	69 * found, this will return true. If not, return false.
Chris@17	70 * Since this unconsumes any chars it reads, the caller
Chris@17	71 * will still need to read the next sequence, even if
Chris@17	72 * this returns true.
Chris@17	73 *
Chris@17	74 * Example: $this->scanner->sequenceMatches('</script>') will
Chris@17	75 * see if the input stream is at the start of a
Chris@17	76 * '</script>' string.
Chris@17	77 *
Chris@17	78 * @param string $sequence
Chris@17	79 * @param bool $caseSensitive
Chris@17	80 *
Chris@17	81 * @return bool
Chris@17	82 */
Chris@17	83 public function sequenceMatches($sequence, $caseSensitive = true)
Chris@17	84 {
Chris@17	85 $portion = substr($this->data, $this->char, strlen($sequence));
Chris@17	86
Chris@17	87 return $caseSensitive ? $portion === $sequence : 0 === strcasecmp($portion, $sequence);
Chris@0	88 }
Chris@0	89
Chris@0	90 /**
Chris@0	91 * Get the current position.
Chris@0	92 *
Chris@0	93 * @return int The current intiger byte position.
Chris@0	94 */
Chris@0	95 public function position()
Chris@0	96 {
Chris@17	97 return $this->char;
Chris@0	98 }
Chris@0	99
Chris@0	100 /**
Chris@0	101 * Take a peek at the next character in the data.
Chris@0	102 *
Chris@0	103 * @return string The next character.
Chris@0	104 */
Chris@0	105 public function peek()
Chris@0	106 {
Chris@17	107 if (($this->char + 1) <= $this->EOF) {
Chris@17	108 return $this->data[$this->char + 1];
Chris@17	109 }
Chris@17	110
Chris@17	111 return false;
Chris@0	112 }
Chris@0	113
Chris@0	114 /**
Chris@0	115 * Get the next character.
Chris@0	116 * Note: This advances the pointer.
Chris@0	117 *
Chris@0	118 * @return string The next character.
Chris@0	119 */
Chris@0	120 public function next()
Chris@0	121 {
Chris@17	122 ++$this->char;
Chris@17	123
Chris@17	124 if ($this->char < $this->EOF) {
Chris@17	125 return $this->data[$this->char];
Chris@0	126 }
Chris@0	127
Chris@0	128 return false;
Chris@0	129 }
Chris@0	130
Chris@0	131 /**
Chris@0	132 * Get the current character.
Chris@0	133 * Note, this does not advance the pointer.
Chris@0	134 *
Chris@0	135 * @return string The current character.
Chris@0	136 */
Chris@0	137 public function current()
Chris@0	138 {
Chris@17	139 if ($this->char < $this->EOF) {
Chris@17	140 return $this->data[$this->char];
Chris@0	141 }
Chris@0	142
Chris@0	143 return false;
Chris@0	144 }
Chris@0	145
Chris@0	146 /**
Chris@0	147 * Silently consume N chars.
Chris@17	148 *
Chris@17	149 * @param int $count
Chris@0	150 */
Chris@0	151 public function consume($count = 1)
Chris@0	152 {
Chris@17	153 $this->char += $count;
Chris@0	154 }
Chris@0	155
Chris@0	156 /**
Chris@0	157 * Unconsume some of the data.
Chris@0	158 * This moves the data pointer backwards.
Chris@0	159 *
Chris@17	160 * @param int $howMany The number of characters to move the pointer back.
Chris@0	161 */
Chris@0	162 public function unconsume($howMany = 1)
Chris@0	163 {
Chris@17	164 if (($this->char - $howMany) >= 0) {
Chris@17	165 $this->char -= $howMany;
Chris@17	166 }
Chris@0	167 }
Chris@0	168
Chris@0	169 /**
Chris@0	170 * Get the next group of that contains hex characters.
Chris@0	171 * Note, along with getting the characters the pointer in the data will be
Chris@0	172 * moved as well.
Chris@0	173 *
Chris@0	174 * @return string The next group that is hex characters.
Chris@0	175 */
Chris@0	176 public function getHex()
Chris@0	177 {
Chris@17	178 return $this->doCharsWhile(static::CHARS_HEX);
Chris@0	179 }
Chris@0	180
Chris@0	181 /**
Chris@0	182 * Get the next group of characters that are ASCII Alpha characters.
Chris@0	183 * Note, along with getting the characters the pointer in the data will be
Chris@0	184 * moved as well.
Chris@0	185 *
Chris@0	186 * @return string The next group of ASCII alpha characters.
Chris@0	187 */
Chris@0	188 public function getAsciiAlpha()
Chris@0	189 {
Chris@17	190 return $this->doCharsWhile(static::CHARS_ALPHA);
Chris@0	191 }
Chris@0	192
Chris@0	193 /**
Chris@0	194 * Get the next group of characters that are ASCII Alpha characters and numbers.
Chris@0	195 * Note, along with getting the characters the pointer in the data will be
Chris@0	196 * moved as well.
Chris@0	197 *
Chris@0	198 * @return string The next group of ASCII alpha characters and numbers.
Chris@0	199 */
Chris@0	200 public function getAsciiAlphaNum()
Chris@0	201 {
Chris@17	202 return $this->doCharsWhile(static::CHARS_ALNUM);
Chris@0	203 }
Chris@0	204
Chris@0	205 /**
Chris@0	206 * Get the next group of numbers.
Chris@0	207 * Note, along with getting the characters the pointer in the data will be
Chris@0	208 * moved as well.
Chris@0	209 *
Chris@0	210 * @return string The next group of numbers.
Chris@0	211 */
Chris@0	212 public function getNumeric()
Chris@0	213 {
Chris@17	214 return $this->doCharsWhile('0123456789');
Chris@0	215 }
Chris@0	216
Chris@0	217 /**
Chris@0	218 * Consume whitespace.
Chris@17	219 * Whitespace in HTML5 is: formfeed, tab, newline, space.
Chris@0	220 *
Chris@17	221 * @return int The length of the matched whitespaces.
Chris@0	222 */
Chris@0	223 public function whitespace()
Chris@0	224 {
Chris@17	225 if ($this->char >= $this->EOF) {
Chris@17	226 return false;
Chris@17	227 }
Chris@17	228
Chris@17	229 $len = strspn($this->data, "\n\t\f ", $this->char);
Chris@17	230
Chris@17	231 $this->char += $len;
Chris@17	232
Chris@17	233 return $len;
Chris@0	234 }
Chris@0	235
Chris@0	236 /**
Chris@0	237 * Returns the current line that is being consumed.
Chris@0	238 *
Chris@0	239 * @return int The current line number.
Chris@0	240 */
Chris@0	241 public function currentLine()
Chris@0	242 {
Chris@17	243 if (empty($this->EOF) \|\| 0 === $this->char) {
Chris@17	244 return 1;
Chris@17	245 }
Chris@17	246
Chris@17	247 // Add one to $this->char because we want the number for the next
Chris@17	248 // byte to be processed.
Chris@17	249 return substr_count($this->data, "\n", 0, min($this->char, $this->EOF)) + 1;
Chris@0	250 }
Chris@0	251
Chris@0	252 /**
Chris@0	253 * Read chars until something in the mask is encountered.
Chris@17	254 *
Chris@17	255 * @param string $mask
Chris@17	256 *
Chris@17	257 * @return mixed
Chris@0	258 */
Chris@0	259 public function charsUntil($mask)
Chris@0	260 {
Chris@17	261 return $this->doCharsUntil($mask);
Chris@0	262 }
Chris@0	263
Chris@0	264 /**
Chris@0	265 * Read chars as long as the mask matches.
Chris@17	266 *
Chris@17	267 * @param string $mask
Chris@17	268 *
Chris@17	269 * @return int
Chris@0	270 */
Chris@0	271 public function charsWhile($mask)
Chris@0	272 {
Chris@17	273 return $this->doCharsWhile($mask);
Chris@0	274 }
Chris@0	275
Chris@0	276 /**
Chris@0	277 * Returns the current column of the current line that the tokenizer is at.
Chris@0	278 *
Chris@0	279 * Newlines are column 0. The first char after a newline is column 1.
Chris@0	280 *
Chris@0	281 * @return int The column number.
Chris@0	282 */
Chris@0	283 public function columnOffset()
Chris@0	284 {
Chris@17	285 // Short circuit for the first char.
Chris@17	286 if (0 === $this->char) {
Chris@17	287 return 0;
Chris@17	288 }
Chris@17	289
Chris@17	290 // strrpos is weird, and the offset needs to be negative for what we
Chris@17	291 // want (i.e., the last \n before $this->char). This needs to not have
Chris@17	292 // one (to make it point to the next character, the one we want the
Chris@17	293 // position of) added to it because strrpos's behaviour includes the
Chris@17	294 // final offset byte.
Chris@17	295 $backwardFrom = $this->char - 1 - strlen($this->data);
Chris@17	296 $lastLine = strrpos($this->data, "\n", $backwardFrom);
Chris@17	297
Chris@17	298 // However, for here we want the length up until the next byte to be
Chris@17	299 // processed, so add one to the current byte ($this->char).
Chris@17	300 if (false !== $lastLine) {
Chris@17	301 $findLengthOf = substr($this->data, $lastLine + 1, $this->char - 1 - $lastLine);
Chris@17	302 } else {
Chris@17	303 // After a newline.
Chris@17	304 $findLengthOf = substr($this->data, 0, $this->char);
Chris@17	305 }
Chris@17	306
Chris@17	307 return UTF8Utils::countChars($findLengthOf);
Chris@0	308 }
Chris@0	309
Chris@0	310 /**
Chris@0	311 * Get all characters until EOF.
Chris@0	312 *
Chris@0	313 * This consumes characters until the EOF.
Chris@0	314 *
Chris@0	315 * @return int The number of characters remaining.
Chris@0	316 */
Chris@0	317 public function remainingChars()
Chris@0	318 {
Chris@17	319 if ($this->char < $this->EOF) {
Chris@17	320 $data = substr($this->data, $this->char);
Chris@17	321 $this->char = $this->EOF;
Chris@17	322
Chris@17	323 return $data;
Chris@17	324 }
Chris@17	325
Chris@17	326 return ''; // false;
Chris@17	327 }
Chris@17	328
Chris@17	329 /**
Chris@17	330 * Replace linefeed characters according to the spec.
Chris@17	331 *
Chris@17	332 * @param $data
Chris@17	333 *
Chris@17	334 * @return string
Chris@17	335 */
Chris@17	336 private function replaceLinefeeds($data)
Chris@17	337 {
Chris@17	338 /*
Chris@17	339 * U+000D CARRIAGE RETURN (CR) characters and U+000A LINE FEED (LF) characters are treated specially.
Chris@17	340 * Any CR characters that are followed by LF characters must be removed, and any CR characters not
Chris@17	341 * followed by LF characters must be converted to LF characters. Thus, newlines in HTML DOMs are
Chris@17	342 * represented by LF characters, and there are never any CR characters in the input to the tokenization
Chris@17	343 * stage.
Chris@17	344 */
Chris@17	345 $crlfTable = array(
Chris@17	346 "\0" => "\xEF\xBF\xBD",
Chris@17	347 "\r\n" => "\n",
Chris@17	348 "\r" => "\n",
Chris@17	349 );
Chris@17	350
Chris@17	351 return strtr($data, $crlfTable);
Chris@17	352 }
Chris@17	353
Chris@17	354 /**
Chris@17	355 * Read to a particular match (or until $max bytes are consumed).
Chris@17	356 *
Chris@17	357 * This operates on byte sequences, not characters.
Chris@17	358 *
Chris@17	359 * Matches as far as possible until we reach a certain set of bytes
Chris@17	360 * and returns the matched substring.
Chris@17	361 *
Chris@17	362 * @param string $bytes Bytes to match.
Chris@17	363 * @param int $max Maximum number of bytes to scan.
Chris@17	364 *
Chris@17	365 * @return mixed Index or false if no match is found. You should use strong
Chris@17	366 * equality when checking the result, since index could be 0.
Chris@17	367 */
Chris@17	368 private function doCharsUntil($bytes, $max = null)
Chris@17	369 {
Chris@17	370 if ($this->char >= $this->EOF) {
Chris@17	371 return false;
Chris@17	372 }
Chris@17	373
Chris@17	374 if (0 === $max \|\| $max) {
Chris@17	375 $len = strcspn($this->data, $bytes, $this->char, $max);
Chris@17	376 } else {
Chris@17	377 $len = strcspn($this->data, $bytes, $this->char);
Chris@17	378 }
Chris@17	379
Chris@17	380 $string = (string) substr($this->data, $this->char, $len);
Chris@17	381 $this->char += $len;
Chris@17	382
Chris@17	383 return $string;
Chris@17	384 }
Chris@17	385
Chris@17	386 /**
Chris@17	387 * Returns the string so long as $bytes matches.
Chris@17	388 *
Chris@17	389 * Matches as far as possible with a certain set of bytes
Chris@17	390 * and returns the matched substring.
Chris@17	391 *
Chris@17	392 * @param string $bytes A mask of bytes to match. If ANY byte in this mask matches the
Chris@17	393 * current char, the pointer advances and the char is part of the
Chris@17	394 * substring.
Chris@17	395 * @param int $max The max number of chars to read.
Chris@17	396 *
Chris@17	397 * @return string
Chris@17	398 */
Chris@17	399 private function doCharsWhile($bytes, $max = null)
Chris@17	400 {
Chris@17	401 if ($this->char >= $this->EOF) {
Chris@17	402 return false;
Chris@17	403 }
Chris@17	404
Chris@17	405 if (0 === $max \|\| $max) {
Chris@17	406 $len = strspn($this->data, $bytes, $this->char, $max);
Chris@17	407 } else {
Chris@17	408 $len = strspn($this->data, $bytes, $this->char);
Chris@17	409 }
Chris@17	410
Chris@17	411 $string = (string) substr($this->data, $this->char, $len);
Chris@17	412 $this->char += $len;
Chris@17	413
Chris@17	414 return $string;
Chris@0	415 }
Chris@0	416 }

Mercurial > hg > isophonics-drupal-site

annotate vendor/masterminds/html5/src/HTML5/Parser/Scanner.php @ 19:fa3358dc1485 tip