Mercurial > hg > isophonics-drupal-site
diff vendor/symfony/dom-crawler/Crawler.php @ 0:4c8ae668cc8c
Initial import (non-working)
author | Chris Cannam |
---|---|
date | Wed, 29 Nov 2017 16:09:58 +0000 |
parents | |
children | 7a779792577d |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vendor/symfony/dom-crawler/Crawler.php Wed Nov 29 16:09:58 2017 +0000 @@ -0,0 +1,1174 @@ +<?php + +/* + * This file is part of the Symfony package. + * + * (c) Fabien Potencier <fabien@symfony.com> + * + * For the full copyright and license information, please view the LICENSE + * file that was distributed with this source code. + */ + +namespace Symfony\Component\DomCrawler; + +use Symfony\Component\CssSelector\CssSelectorConverter; + +/** + * Crawler eases navigation of a list of \DOMNode objects. + * + * @author Fabien Potencier <fabien@symfony.com> + */ +class Crawler implements \Countable, \IteratorAggregate +{ + /** + * @var string The current URI + */ + protected $uri; + + /** + * @var string The default namespace prefix to be used with XPath and CSS expressions + */ + private $defaultNamespacePrefix = 'default'; + + /** + * @var array A map of manually registered namespaces + */ + private $namespaces = array(); + + /** + * @var string The base href value + */ + private $baseHref; + + /** + * @var \DOMDocument|null + */ + private $document; + + /** + * @var \DOMElement[] + */ + private $nodes = array(); + + /** + * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath). + * + * @var bool + */ + private $isHtml = true; + + /** + * @param mixed $node A Node to use as the base for the crawling + * @param string $currentUri The current URI + * @param string $baseHref The base href value + */ + public function __construct($node = null, $currentUri = null, $baseHref = null) + { + $this->uri = $currentUri; + $this->baseHref = $baseHref ?: $currentUri; + + $this->add($node); + } + + /** + * Returns the current URI. + * + * @return string + */ + public function getUri() + { + return $this->uri; + } + + /** + * Returns base href. + * + * @return string + */ + public function getBaseHref() + { + return $this->baseHref; + } + + /** + * Removes all the nodes. + */ + public function clear() + { + $this->nodes = array(); + $this->document = null; + } + + /** + * Adds a node to the current list of nodes. + * + * This method uses the appropriate specialized add*() method based + * on the type of the argument. + * + * @param \DOMNodeList|\DOMNode|array|string|null $node A node + * + * @throws \InvalidArgumentException When node is not the expected type. + */ + public function add($node) + { + if ($node instanceof \DOMNodeList) { + $this->addNodeList($node); + } elseif ($node instanceof \DOMNode) { + $this->addNode($node); + } elseif (is_array($node)) { + $this->addNodes($node); + } elseif (is_string($node)) { + $this->addContent($node); + } elseif (null !== $node) { + throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', is_object($node) ? get_class($node) : gettype($node))); + } + } + + /** + * Adds HTML/XML content. + * + * If the charset is not set via the content type, it is assumed + * to be ISO-8859-1, which is the default charset defined by the + * HTTP 1.1 specification. + * + * @param string $content A string to parse as HTML/XML + * @param null|string $type The content type of the string + */ + public function addContent($content, $type = null) + { + if (empty($type)) { + $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html'; + } + + // DOM only for HTML/XML content + if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) { + return; + } + + $charset = null; + if (false !== $pos = stripos($type, 'charset=')) { + $charset = substr($type, $pos + 8); + if (false !== $pos = strpos($charset, ';')) { + $charset = substr($charset, 0, $pos); + } + } + + // http://www.w3.org/TR/encoding/#encodings + // http://www.w3.org/TR/REC-xml/#NT-EncName + if (null === $charset && + preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) { + $charset = $matches[1]; + } + + if (null === $charset) { + $charset = 'ISO-8859-1'; + } + + if ('x' === $xmlMatches[1]) { + $this->addXmlContent($content, $charset); + } else { + $this->addHtmlContent($content, $charset); + } + } + + /** + * Adds an HTML content to the list of nodes. + * + * The libxml errors are disabled when the content is parsed. + * + * If you want to get parsing errors, be sure to enable + * internal errors via libxml_use_internal_errors(true) + * and then, get the errors via libxml_get_errors(). Be + * sure to clear errors with libxml_clear_errors() afterward. + * + * @param string $content The HTML content + * @param string $charset The charset + */ + public function addHtmlContent($content, $charset = 'UTF-8') + { + $internalErrors = libxml_use_internal_errors(true); + $disableEntities = libxml_disable_entity_loader(true); + + $dom = new \DOMDocument('1.0', $charset); + $dom->validateOnParse = true; + + set_error_handler(function () { throw new \Exception(); }); + + try { + // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML() + $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset); + } catch (\Exception $e) { + } + + restore_error_handler(); + + if ('' !== trim($content)) { + @$dom->loadHTML($content); + } + + libxml_use_internal_errors($internalErrors); + libxml_disable_entity_loader($disableEntities); + + $this->addDocument($dom); + + $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href')); + + $baseHref = current($base); + if (count($base) && !empty($baseHref)) { + if ($this->baseHref) { + $linkNode = $dom->createElement('a'); + $linkNode->setAttribute('href', $baseHref); + $link = new Link($linkNode, $this->baseHref); + $this->baseHref = $link->getUri(); + } else { + $this->baseHref = $baseHref; + } + } + } + + /** + * Adds an XML content to the list of nodes. + * + * The libxml errors are disabled when the content is parsed. + * + * If you want to get parsing errors, be sure to enable + * internal errors via libxml_use_internal_errors(true) + * and then, get the errors via libxml_get_errors(). Be + * sure to clear errors with libxml_clear_errors() afterward. + * + * @param string $content The XML content + * @param string $charset The charset + * @param int $options Bitwise OR of the libxml option constants + * LIBXML_PARSEHUGE is dangerous, see + * http://symfony.com/blog/security-release-symfony-2-0-17-released + */ + public function addXmlContent($content, $charset = 'UTF-8', $options = LIBXML_NONET) + { + // remove the default namespace if it's the only namespace to make XPath expressions simpler + if (!preg_match('/xmlns:/', $content)) { + $content = str_replace('xmlns', 'ns', $content); + } + + $internalErrors = libxml_use_internal_errors(true); + $disableEntities = libxml_disable_entity_loader(true); + + $dom = new \DOMDocument('1.0', $charset); + $dom->validateOnParse = true; + + if ('' !== trim($content)) { + @$dom->loadXML($content, $options); + } + + libxml_use_internal_errors($internalErrors); + libxml_disable_entity_loader($disableEntities); + + $this->addDocument($dom); + + $this->isHtml = false; + } + + /** + * Adds a \DOMDocument to the list of nodes. + * + * @param \DOMDocument $dom A \DOMDocument instance + */ + public function addDocument(\DOMDocument $dom) + { + if ($dom->documentElement) { + $this->addNode($dom->documentElement); + } + } + + /** + * Adds a \DOMNodeList to the list of nodes. + * + * @param \DOMNodeList $nodes A \DOMNodeList instance + */ + public function addNodeList(\DOMNodeList $nodes) + { + foreach ($nodes as $node) { + if ($node instanceof \DOMNode) { + $this->addNode($node); + } + } + } + + /** + * Adds an array of \DOMNode instances to the list of nodes. + * + * @param \DOMNode[] $nodes An array of \DOMNode instances + */ + public function addNodes(array $nodes) + { + foreach ($nodes as $node) { + $this->add($node); + } + } + + /** + * Adds a \DOMNode instance to the list of nodes. + * + * @param \DOMNode $node A \DOMNode instance + */ + public function addNode(\DOMNode $node) + { + if ($node instanceof \DOMDocument) { + $node = $node->documentElement; + } + + if (null !== $this->document && $this->document !== $node->ownerDocument) { + throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.'); + } + + if (null === $this->document) { + $this->document = $node->ownerDocument; + } + + // Don't add duplicate nodes in the Crawler + if (in_array($node, $this->nodes, true)) { + return; + } + + $this->nodes[] = $node; + } + + /** + * Returns a node given its position in the node list. + * + * @param int $position The position + * + * @return self + */ + public function eq($position) + { + if (isset($this->nodes[$position])) { + return $this->createSubCrawler($this->nodes[$position]); + } + + return $this->createSubCrawler(null); + } + + /** + * Calls an anonymous function on each node of the list. + * + * The anonymous function receives the position and the node wrapped + * in a Crawler instance as arguments. + * + * Example: + * + * $crawler->filter('h1')->each(function ($node, $i) { + * return $node->text(); + * }); + * + * @param \Closure $closure An anonymous function + * + * @return array An array of values returned by the anonymous function + */ + public function each(\Closure $closure) + { + $data = array(); + foreach ($this->nodes as $i => $node) { + $data[] = $closure($this->createSubCrawler($node), $i); + } + + return $data; + } + + /** + * Slices the list of nodes by $offset and $length. + * + * @param int $offset + * @param int $length + * + * @return self + */ + public function slice($offset = 0, $length = null) + { + return $this->createSubCrawler(array_slice($this->nodes, $offset, $length)); + } + + /** + * Reduces the list of nodes by calling an anonymous function. + * + * To remove a node from the list, the anonymous function must return false. + * + * @param \Closure $closure An anonymous function + * + * @return self + */ + public function reduce(\Closure $closure) + { + $nodes = array(); + foreach ($this->nodes as $i => $node) { + if (false !== $closure($this->createSubCrawler($node), $i)) { + $nodes[] = $node; + } + } + + return $this->createSubCrawler($nodes); + } + + /** + * Returns the first node of the current selection. + * + * @return self + */ + public function first() + { + return $this->eq(0); + } + + /** + * Returns the last node of the current selection. + * + * @return self + */ + public function last() + { + return $this->eq(count($this->nodes) - 1); + } + + /** + * Returns the siblings nodes of the current selection. + * + * @return self + * + * @throws \InvalidArgumentException When current node is empty + */ + public function siblings() + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + return $this->createSubCrawler($this->sibling($this->getNode(0)->parentNode->firstChild)); + } + + /** + * Returns the next siblings nodes of the current selection. + * + * @return self + * + * @throws \InvalidArgumentException When current node is empty + */ + public function nextAll() + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + return $this->createSubCrawler($this->sibling($this->getNode(0))); + } + + /** + * Returns the previous sibling nodes of the current selection. + * + * @return self + * + * @throws \InvalidArgumentException + */ + public function previousAll() + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + return $this->createSubCrawler($this->sibling($this->getNode(0), 'previousSibling')); + } + + /** + * Returns the parents nodes of the current selection. + * + * @return self + * + * @throws \InvalidArgumentException When current node is empty + */ + public function parents() + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + $node = $this->getNode(0); + $nodes = array(); + + while ($node = $node->parentNode) { + if (XML_ELEMENT_NODE === $node->nodeType) { + $nodes[] = $node; + } + } + + return $this->createSubCrawler($nodes); + } + + /** + * Returns the children nodes of the current selection. + * + * @return self + * + * @throws \InvalidArgumentException When current node is empty + */ + public function children() + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + $node = $this->getNode(0)->firstChild; + + return $this->createSubCrawler($node ? $this->sibling($node) : array()); + } + + /** + * Returns the attribute value of the first node of the list. + * + * @param string $attribute The attribute name + * + * @return string|null The attribute value or null if the attribute does not exist + * + * @throws \InvalidArgumentException When current node is empty + */ + public function attr($attribute) + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + $node = $this->getNode(0); + + return $node->hasAttribute($attribute) ? $node->getAttribute($attribute) : null; + } + + /** + * Returns the node name of the first node of the list. + * + * @return string The node name + * + * @throws \InvalidArgumentException When current node is empty + */ + public function nodeName() + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + return $this->getNode(0)->nodeName; + } + + /** + * Returns the node value of the first node of the list. + * + * @return string The node value + * + * @throws \InvalidArgumentException When current node is empty + */ + public function text() + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + return $this->getNode(0)->nodeValue; + } + + /** + * Returns the first node of the list as HTML. + * + * @return string The node html + * + * @throws \InvalidArgumentException When current node is empty + */ + public function html() + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + $html = ''; + foreach ($this->getNode(0)->childNodes as $child) { + $html .= $child->ownerDocument->saveHTML($child); + } + + return $html; + } + + /** + * Evaluates an XPath expression. + * + * Since an XPath expression might evaluate to either a simple type or a \DOMNodeList, + * this method will return either an array of simple types or a new Crawler instance. + * + * @param string $xpath An XPath expression + * + * @return array|Crawler An array of evaluation results or a new Crawler instance + */ + public function evaluate($xpath) + { + if (null === $this->document) { + throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.'); + } + + $data = array(); + $domxpath = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath)); + + foreach ($this->nodes as $node) { + $data[] = $domxpath->evaluate($xpath, $node); + } + + if (isset($data[0]) && $data[0] instanceof \DOMNodeList) { + return $this->createSubCrawler($data); + } + + return $data; + } + + /** + * Extracts information from the list of nodes. + * + * You can extract attributes or/and the node value (_text). + * + * Example: + * + * $crawler->filter('h1 a')->extract(array('_text', 'href')); + * + * @param array $attributes An array of attributes + * + * @return array An array of extracted values + */ + public function extract($attributes) + { + $attributes = (array) $attributes; + $count = count($attributes); + + $data = array(); + foreach ($this->nodes as $node) { + $elements = array(); + foreach ($attributes as $attribute) { + if ('_text' === $attribute) { + $elements[] = $node->nodeValue; + } else { + $elements[] = $node->getAttribute($attribute); + } + } + + $data[] = $count > 1 ? $elements : $elements[0]; + } + + return $data; + } + + /** + * Filters the list of nodes with an XPath expression. + * + * The XPath expression is evaluated in the context of the crawler, which + * is considered as a fake parent of the elements inside it. + * This means that a child selector "div" or "./div" will match only + * the div elements of the current crawler, not their children. + * + * @param string $xpath An XPath expression + * + * @return self + */ + public function filterXPath($xpath) + { + $xpath = $this->relativize($xpath); + + // If we dropped all expressions in the XPath while preparing it, there would be no match + if ('' === $xpath) { + return $this->createSubCrawler(null); + } + + return $this->filterRelativeXPath($xpath); + } + + /** + * Filters the list of nodes with a CSS selector. + * + * This method only works if you have installed the CssSelector Symfony Component. + * + * @param string $selector A CSS selector + * + * @return self + * + * @throws \RuntimeException if the CssSelector Component is not available + */ + public function filter($selector) + { + if (!class_exists('Symfony\\Component\\CssSelector\\CssSelectorConverter')) { + throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector 2.8+ is not installed (you can use filterXPath instead).'); + } + + $converter = new CssSelectorConverter($this->isHtml); + + // The CssSelector already prefixes the selector with descendant-or-self:: + return $this->filterRelativeXPath($converter->toXPath($selector)); + } + + /** + * Selects links by name or alt value for clickable images. + * + * @param string $value The link text + * + * @return self + */ + public function selectLink($value) + { + $xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', static::xpathLiteral(' '.$value.' ')). + sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', static::xpathLiteral(' '.$value.' ')); + + return $this->filterRelativeXPath($xpath); + } + + /** + * Selects images by alt value. + * + * @param string $value The image alt + * + * @return self A new instance of Crawler with the filtered list of nodes + */ + public function selectImage($value) + { + $xpath = sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', static::xpathLiteral($value)); + + return $this->filterRelativeXPath($xpath); + } + + /** + * Selects a button by name or alt value for images. + * + * @param string $value The button text + * + * @return self + */ + public function selectButton($value) + { + $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")'; + $xpath = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, static::xpathLiteral(' '.$value.' ')). + sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $translate, static::xpathLiteral(' '.$value.' '), static::xpathLiteral($value), static::xpathLiteral($value)). + sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', static::xpathLiteral(' '.$value.' '), static::xpathLiteral($value), static::xpathLiteral($value)); + + return $this->filterRelativeXPath($xpath); + } + + /** + * Returns a Link object for the first node in the list. + * + * @param string $method The method for the link (get by default) + * + * @return Link A Link instance + * + * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement + */ + public function link($method = 'get') + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + $node = $this->getNode(0); + + if (!$node instanceof \DOMElement) { + throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($node))); + } + + return new Link($node, $this->baseHref, $method); + } + + /** + * Returns an array of Link objects for the nodes in the list. + * + * @return Link[] An array of Link instances + * + * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances + */ + public function links() + { + $links = array(); + foreach ($this->nodes as $node) { + if (!$node instanceof \DOMElement) { + throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($node))); + } + + $links[] = new Link($node, $this->baseHref, 'get'); + } + + return $links; + } + + /** + * Returns an Image object for the first node in the list. + * + * @return Image An Image instance + * + * @throws \InvalidArgumentException If the current node list is empty + */ + public function image() + { + if (!count($this)) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + $node = $this->getNode(0); + + if (!$node instanceof \DOMElement) { + throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($node))); + } + + return new Image($node, $this->baseHref); + } + + /** + * Returns an array of Image objects for the nodes in the list. + * + * @return Image[] An array of Image instances + */ + public function images() + { + $images = array(); + foreach ($this as $node) { + if (!$node instanceof \DOMElement) { + throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($node))); + } + + $images[] = new Image($node, $this->baseHref); + } + + return $images; + } + + /** + * Returns a Form object for the first node in the list. + * + * @param array $values An array of values for the form fields + * @param string $method The method for the form + * + * @return Form A Form instance + * + * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement + */ + public function form(array $values = null, $method = null) + { + if (!$this->nodes) { + throw new \InvalidArgumentException('The current node list is empty.'); + } + + $node = $this->getNode(0); + + if (!$node instanceof \DOMElement) { + throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($node))); + } + + $form = new Form($node, $this->uri, $method, $this->baseHref); + + if (null !== $values) { + $form->setValues($values); + } + + return $form; + } + + /** + * Overloads a default namespace prefix to be used with XPath and CSS expressions. + * + * @param string $prefix + */ + public function setDefaultNamespacePrefix($prefix) + { + $this->defaultNamespacePrefix = $prefix; + } + + /** + * @param string $prefix + * @param string $namespace + */ + public function registerNamespace($prefix, $namespace) + { + $this->namespaces[$prefix] = $namespace; + } + + /** + * Converts string for XPath expressions. + * + * Escaped characters are: quotes (") and apostrophe ('). + * + * Examples: + * <code> + * echo Crawler::xpathLiteral('foo " bar'); + * //prints 'foo " bar' + * + * echo Crawler::xpathLiteral("foo ' bar"); + * //prints "foo ' bar" + * + * echo Crawler::xpathLiteral('a\'b"c'); + * //prints concat('a', "'", 'b"c') + * </code> + * + * @param string $s String to be escaped + * + * @return string Converted string + */ + public static function xpathLiteral($s) + { + if (false === strpos($s, "'")) { + return sprintf("'%s'", $s); + } + + if (false === strpos($s, '"')) { + return sprintf('"%s"', $s); + } + + $string = $s; + $parts = array(); + while (true) { + if (false !== $pos = strpos($string, "'")) { + $parts[] = sprintf("'%s'", substr($string, 0, $pos)); + $parts[] = "\"'\""; + $string = substr($string, $pos + 1); + } else { + $parts[] = "'$string'"; + break; + } + } + + return sprintf('concat(%s)', implode(', ', $parts)); + } + + /** + * Filters the list of nodes with an XPath expression. + * + * The XPath expression should already be processed to apply it in the context of each node. + * + * @param string $xpath + * + * @return self + */ + private function filterRelativeXPath($xpath) + { + $prefixes = $this->findNamespacePrefixes($xpath); + + $crawler = $this->createSubCrawler(null); + + foreach ($this->nodes as $node) { + $domxpath = $this->createDOMXPath($node->ownerDocument, $prefixes); + $crawler->add($domxpath->query($xpath, $node)); + } + + return $crawler; + } + + /** + * Make the XPath relative to the current context. + * + * The returned XPath will match elements matching the XPath inside the current crawler + * when running in the context of a node of the crawler. + * + * @param string $xpath + * + * @return string + */ + private function relativize($xpath) + { + $expressions = array(); + + // An expression which will never match to replace expressions which cannot match in the crawler + // We cannot simply drop + $nonMatchingExpression = 'a[name() = "b"]'; + + $xpathLen = strlen($xpath); + $openedBrackets = 0; + $startPosition = strspn($xpath, " \t\n\r\0\x0B"); + + for ($i = $startPosition; $i <= $xpathLen; ++$i) { + $i += strcspn($xpath, '"\'[]|', $i); + + if ($i < $xpathLen) { + switch ($xpath[$i]) { + case '"': + case "'": + if (false === $i = strpos($xpath, $xpath[$i], $i + 1)) { + return $xpath; // The XPath expression is invalid + } + continue 2; + case '[': + ++$openedBrackets; + continue 2; + case ']': + --$openedBrackets; + continue 2; + } + } + if ($openedBrackets) { + continue; + } + + if ($startPosition < $xpathLen && '(' === $xpath[$startPosition]) { + // If the union is inside some braces, we need to preserve the opening braces and apply + // the change only inside it. + $j = 1 + strspn($xpath, "( \t\n\r\0\x0B", $startPosition + 1); + $parenthesis = substr($xpath, $startPosition, $j); + $startPosition += $j; + } else { + $parenthesis = ''; + } + $expression = rtrim(substr($xpath, $startPosition, $i - $startPosition)); + + if (0 === strpos($expression, 'self::*/')) { + $expression = './'.substr($expression, 8); + } + + // add prefix before absolute element selector + if ('' === $expression) { + $expression = $nonMatchingExpression; + } elseif (0 === strpos($expression, '//')) { + $expression = 'descendant-or-self::'.substr($expression, 2); + } elseif (0 === strpos($expression, './/')) { + $expression = 'descendant-or-self::'.substr($expression, 3); + } elseif (0 === strpos($expression, './')) { + $expression = 'self::'.substr($expression, 2); + } elseif (0 === strpos($expression, 'child::')) { + $expression = 'self::'.substr($expression, 7); + } elseif ('/' === $expression[0] || '.' === $expression[0] || 0 === strpos($expression, 'self::')) { + $expression = $nonMatchingExpression; + } elseif (0 === strpos($expression, 'descendant::')) { + $expression = 'descendant-or-self::'.substr($expression, 12); + } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) { + // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes) + $expression = $nonMatchingExpression; + } elseif (0 !== strpos($expression, 'descendant-or-self::')) { + $expression = 'self::'.$expression; + } + $expressions[] = $parenthesis.$expression; + + if ($i === $xpathLen) { + return implode(' | ', $expressions); + } + + $i += strspn($xpath, " \t\n\r\0\x0B", $i + 1); + $startPosition = $i + 1; + } + + return $xpath; // The XPath expression is invalid + } + + /** + * @param int $position + * + * @return \DOMElement|null + */ + public function getNode($position) + { + if (isset($this->nodes[$position])) { + return $this->nodes[$position]; + } + } + + /** + * @return int + */ + public function count() + { + return count($this->nodes); + } + + /** + * @return \ArrayIterator + */ + public function getIterator() + { + return new \ArrayIterator($this->nodes); + } + + /** + * @param \DOMElement $node + * @param string $siblingDir + * + * @return array + */ + protected function sibling($node, $siblingDir = 'nextSibling') + { + $nodes = array(); + + do { + if ($node !== $this->getNode(0) && $node->nodeType === 1) { + $nodes[] = $node; + } + } while ($node = $node->$siblingDir); + + return $nodes; + } + + /** + * @param \DOMDocument $document + * @param array $prefixes + * + * @return \DOMXPath + * + * @throws \InvalidArgumentException + */ + private function createDOMXPath(\DOMDocument $document, array $prefixes = array()) + { + $domxpath = new \DOMXPath($document); + + foreach ($prefixes as $prefix) { + $namespace = $this->discoverNamespace($domxpath, $prefix); + if (null !== $namespace) { + $domxpath->registerNamespace($prefix, $namespace); + } + } + + return $domxpath; + } + + /** + * @param \DOMXPath $domxpath + * @param string $prefix + * + * @return string + * + * @throws \InvalidArgumentException + */ + private function discoverNamespace(\DOMXPath $domxpath, $prefix) + { + if (isset($this->namespaces[$prefix])) { + return $this->namespaces[$prefix]; + } + + // ask for one namespace, otherwise we'd get a collection with an item for each node + $namespaces = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $this->defaultNamespacePrefix === $prefix ? '' : $prefix)); + + if ($node = $namespaces->item(0)) { + return $node->nodeValue; + } + } + + /** + * @param string $xpath + * + * @return array + */ + private function findNamespacePrefixes($xpath) + { + if (preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches)) { + return array_unique($matches['prefix']); + } + + return array(); + } + + /** + * Creates a crawler for some subnodes. + * + * @param \DOMElement|\DOMElement[]|\DOMNodeList|null $nodes + * + * @return static + */ + private function createSubCrawler($nodes) + { + $crawler = new static($nodes, $this->uri, $this->baseHref); + $crawler->isHtml = $this->isHtml; + $crawler->document = $this->document; + $crawler->namespaces = $this->namespaces; + + return $crawler; + } +}