<?php

/*
 * For the full copyright and license information, please view the LICENSE
 * file that was distributed with this source code.
 */

namespace Symfony\Component\DomCrawler;

use Symfony\Component\CssSelector\CssSelectorConverter;

/**
 * Crawler eases navigation of a list of \DOMNode objects.
 *
 * @author Fabien Potencier
 */
class Crawler implements \Countable, \IteratorAggregate
{
    /**
     * @var string|null The current URI
     */
    protected $uri;

    /**
     * @var string The default namespace prefix to be used with XPath and CSS expressions
     */
    private $defaultNamespacePrefix = 'default';

    /**
     * @var array A map of manually registered namespaces
     */
    private $namespaces = [];

    /**
     * @var string The base href value
     */
    private $baseHref;

    /**
     * @var \DOMDocument|null
     */
    private $document;

    /**
     * @var \DOMElement[]
     */
    private $nodes = [];

    /**
     * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath).
     *
     * @var bool
     */
    private $isHtml = true;

    /**
     * Builds a crawler and immediately adds the given node(s) to it.
     *
     * When no base href is given (or it is empty), the URI is used as base href.
     *
     * @param mixed  $node     A Node to use as the base for the crawling
     * @param string $uri      The current URI
     * @param string $baseHref The base href value
     */
    public function __construct($node = null, $uri = null, $baseHref = null)
    {
        $this->uri = $uri;
        $this->baseHref = $baseHref ?: $uri;

        $this->add($node);
    }

    /**
     * Returns the current URI.
Chris@0: * Chris@0: * @return string Chris@0: */ Chris@0: public function getUri() Chris@0: { Chris@0: return $this->uri; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns base href. Chris@0: * Chris@0: * @return string Chris@0: */ Chris@0: public function getBaseHref() Chris@0: { Chris@0: return $this->baseHref; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Removes all the nodes. Chris@0: */ Chris@0: public function clear() Chris@0: { Chris@17: $this->nodes = []; Chris@0: $this->document = null; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Adds a node to the current list of nodes. Chris@0: * Chris@0: * This method uses the appropriate specialized add*() method based Chris@0: * on the type of the argument. Chris@0: * Chris@0: * @param \DOMNodeList|\DOMNode|array|string|null $node A node Chris@0: * Chris@12: * @throws \InvalidArgumentException when node is not the expected type Chris@0: */ Chris@0: public function add($node) Chris@0: { Chris@0: if ($node instanceof \DOMNodeList) { Chris@0: $this->addNodeList($node); Chris@0: } elseif ($node instanceof \DOMNode) { Chris@0: $this->addNode($node); Chris@17: } elseif (\is_array($node)) { Chris@0: $this->addNodes($node); Chris@17: } elseif (\is_string($node)) { Chris@0: $this->addContent($node); Chris@0: } elseif (null !== $node) { Chris@17: throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', \is_object($node) ? \get_class($node) : \gettype($node))); Chris@0: } Chris@0: } Chris@0: Chris@0: /** Chris@0: * Adds HTML/XML content. Chris@0: * Chris@12: * If the charset is not set via the content type, it is assumed to be UTF-8, Chris@12: * or ISO-8859-1 as a fallback, which is the default charset defined by the Chris@0: * HTTP 1.1 specification. 
Chris@0: * Chris@0: * @param string $content A string to parse as HTML/XML Chris@17: * @param string|null $type The content type of the string Chris@0: */ Chris@0: public function addContent($content, $type = null) Chris@0: { Chris@0: if (empty($type)) { Chris@0: $type = 0 === strpos($content, ']+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) { Chris@0: $charset = $matches[1]; Chris@0: } Chris@0: Chris@0: if (null === $charset) { Chris@12: $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1'; Chris@0: } Chris@0: Chris@0: if ('x' === $xmlMatches[1]) { Chris@0: $this->addXmlContent($content, $charset); Chris@0: } else { Chris@0: $this->addHtmlContent($content, $charset); Chris@0: } Chris@0: } Chris@0: Chris@0: /** Chris@0: * Adds an HTML content to the list of nodes. Chris@0: * Chris@0: * The libxml errors are disabled when the content is parsed. Chris@0: * Chris@0: * If you want to get parsing errors, be sure to enable Chris@0: * internal errors via libxml_use_internal_errors(true) Chris@0: * and then, get the errors via libxml_get_errors(). Be Chris@0: * sure to clear errors with libxml_clear_errors() afterward. 
Chris@0: * Chris@0: * @param string $content The HTML content Chris@0: * @param string $charset The charset Chris@0: */ Chris@0: public function addHtmlContent($content, $charset = 'UTF-8') Chris@0: { Chris@0: $internalErrors = libxml_use_internal_errors(true); Chris@0: $disableEntities = libxml_disable_entity_loader(true); Chris@0: Chris@0: $dom = new \DOMDocument('1.0', $charset); Chris@0: $dom->validateOnParse = true; Chris@0: Chris@0: set_error_handler(function () { throw new \Exception(); }); Chris@0: Chris@0: try { Chris@0: // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML() Chris@0: $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset); Chris@0: } catch (\Exception $e) { Chris@0: } Chris@0: Chris@0: restore_error_handler(); Chris@0: Chris@0: if ('' !== trim($content)) { Chris@0: @$dom->loadHTML($content); Chris@0: } Chris@0: Chris@0: libxml_use_internal_errors($internalErrors); Chris@0: libxml_disable_entity_loader($disableEntities); Chris@0: Chris@0: $this->addDocument($dom); Chris@0: Chris@17: $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']); Chris@0: Chris@0: $baseHref = current($base); Chris@17: if (\count($base) && !empty($baseHref)) { Chris@0: if ($this->baseHref) { Chris@0: $linkNode = $dom->createElement('a'); Chris@0: $linkNode->setAttribute('href', $baseHref); Chris@0: $link = new Link($linkNode, $this->baseHref); Chris@0: $this->baseHref = $link->getUri(); Chris@0: } else { Chris@0: $this->baseHref = $baseHref; Chris@0: } Chris@0: } Chris@0: } Chris@0: Chris@0: /** Chris@0: * Adds an XML content to the list of nodes. Chris@0: * Chris@0: * The libxml errors are disabled when the content is parsed. Chris@0: * Chris@0: * If you want to get parsing errors, be sure to enable Chris@0: * internal errors via libxml_use_internal_errors(true) Chris@0: * and then, get the errors via libxml_get_errors(). Be Chris@0: * sure to clear errors with libxml_clear_errors() afterward. 
Chris@0: * Chris@0: * @param string $content The XML content Chris@0: * @param string $charset The charset Chris@0: * @param int $options Bitwise OR of the libxml option constants Chris@0: * LIBXML_PARSEHUGE is dangerous, see Chris@0: * http://symfony.com/blog/security-release-symfony-2-0-17-released Chris@0: */ Chris@0: public function addXmlContent($content, $charset = 'UTF-8', $options = LIBXML_NONET) Chris@0: { Chris@0: // remove the default namespace if it's the only namespace to make XPath expressions simpler Chris@0: if (!preg_match('/xmlns:/', $content)) { Chris@0: $content = str_replace('xmlns', 'ns', $content); Chris@0: } Chris@0: Chris@0: $internalErrors = libxml_use_internal_errors(true); Chris@0: $disableEntities = libxml_disable_entity_loader(true); Chris@0: Chris@0: $dom = new \DOMDocument('1.0', $charset); Chris@0: $dom->validateOnParse = true; Chris@0: Chris@0: if ('' !== trim($content)) { Chris@0: @$dom->loadXML($content, $options); Chris@0: } Chris@0: Chris@0: libxml_use_internal_errors($internalErrors); Chris@0: libxml_disable_entity_loader($disableEntities); Chris@0: Chris@0: $this->addDocument($dom); Chris@0: Chris@0: $this->isHtml = false; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Adds a \DOMDocument to the list of nodes. Chris@0: * Chris@0: * @param \DOMDocument $dom A \DOMDocument instance Chris@0: */ Chris@0: public function addDocument(\DOMDocument $dom) Chris@0: { Chris@0: if ($dom->documentElement) { Chris@0: $this->addNode($dom->documentElement); Chris@0: } Chris@0: } Chris@0: Chris@0: /** Chris@0: * Adds a \DOMNodeList to the list of nodes. Chris@0: * Chris@0: * @param \DOMNodeList $nodes A \DOMNodeList instance Chris@0: */ Chris@0: public function addNodeList(\DOMNodeList $nodes) Chris@0: { Chris@0: foreach ($nodes as $node) { Chris@0: if ($node instanceof \DOMNode) { Chris@0: $this->addNode($node); Chris@0: } Chris@0: } Chris@0: } Chris@0: Chris@0: /** Chris@0: * Adds an array of \DOMNode instances to the list of nodes. 
Chris@0: * Chris@0: * @param \DOMNode[] $nodes An array of \DOMNode instances Chris@0: */ Chris@0: public function addNodes(array $nodes) Chris@0: { Chris@0: foreach ($nodes as $node) { Chris@0: $this->add($node); Chris@0: } Chris@0: } Chris@0: Chris@0: /** Chris@0: * Adds a \DOMNode instance to the list of nodes. Chris@0: * Chris@0: * @param \DOMNode $node A \DOMNode instance Chris@0: */ Chris@0: public function addNode(\DOMNode $node) Chris@0: { Chris@0: if ($node instanceof \DOMDocument) { Chris@0: $node = $node->documentElement; Chris@0: } Chris@0: Chris@0: if (null !== $this->document && $this->document !== $node->ownerDocument) { Chris@0: throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.'); Chris@0: } Chris@0: Chris@0: if (null === $this->document) { Chris@0: $this->document = $node->ownerDocument; Chris@0: } Chris@0: Chris@0: // Don't add duplicate nodes in the Crawler Chris@17: if (\in_array($node, $this->nodes, true)) { Chris@0: return; Chris@0: } Chris@0: Chris@0: $this->nodes[] = $node; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns a node given its position in the node list. Chris@0: * Chris@0: * @param int $position The position Chris@0: * Chris@0: * @return self Chris@0: */ Chris@0: public function eq($position) Chris@0: { Chris@0: if (isset($this->nodes[$position])) { Chris@0: return $this->createSubCrawler($this->nodes[$position]); Chris@0: } Chris@0: Chris@0: return $this->createSubCrawler(null); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Calls an anonymous function on each node of the list. Chris@0: * Chris@0: * The anonymous function receives the position and the node wrapped Chris@0: * in a Crawler instance as arguments. 
Chris@0: * Chris@0: * Example: Chris@0: * Chris@0: * $crawler->filter('h1')->each(function ($node, $i) { Chris@0: * return $node->text(); Chris@0: * }); Chris@0: * Chris@0: * @param \Closure $closure An anonymous function Chris@0: * Chris@0: * @return array An array of values returned by the anonymous function Chris@0: */ Chris@0: public function each(\Closure $closure) Chris@0: { Chris@17: $data = []; Chris@0: foreach ($this->nodes as $i => $node) { Chris@0: $data[] = $closure($this->createSubCrawler($node), $i); Chris@0: } Chris@0: Chris@0: return $data; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Slices the list of nodes by $offset and $length. Chris@0: * Chris@0: * @param int $offset Chris@0: * @param int $length Chris@0: * Chris@0: * @return self Chris@0: */ Chris@0: public function slice($offset = 0, $length = null) Chris@0: { Chris@17: return $this->createSubCrawler(\array_slice($this->nodes, $offset, $length)); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Reduces the list of nodes by calling an anonymous function. Chris@0: * Chris@0: * To remove a node from the list, the anonymous function must return false. Chris@0: * Chris@0: * @param \Closure $closure An anonymous function Chris@0: * Chris@0: * @return self Chris@0: */ Chris@0: public function reduce(\Closure $closure) Chris@0: { Chris@17: $nodes = []; Chris@0: foreach ($this->nodes as $i => $node) { Chris@0: if (false !== $closure($this->createSubCrawler($node), $i)) { Chris@0: $nodes[] = $node; Chris@0: } Chris@0: } Chris@0: Chris@0: return $this->createSubCrawler($nodes); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the first node of the current selection. Chris@0: * Chris@0: * @return self Chris@0: */ Chris@0: public function first() Chris@0: { Chris@0: return $this->eq(0); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the last node of the current selection. 
Chris@0: * Chris@0: * @return self Chris@0: */ Chris@0: public function last() Chris@0: { Chris@17: return $this->eq(\count($this->nodes) - 1); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the siblings nodes of the current selection. Chris@0: * Chris@0: * @return self Chris@0: * Chris@0: * @throws \InvalidArgumentException When current node is empty Chris@0: */ Chris@0: public function siblings() Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: return $this->createSubCrawler($this->sibling($this->getNode(0)->parentNode->firstChild)); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the next siblings nodes of the current selection. Chris@0: * Chris@0: * @return self Chris@0: * Chris@0: * @throws \InvalidArgumentException When current node is empty Chris@0: */ Chris@0: public function nextAll() Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: return $this->createSubCrawler($this->sibling($this->getNode(0))); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the previous sibling nodes of the current selection. Chris@0: * Chris@0: * @return self Chris@0: * Chris@0: * @throws \InvalidArgumentException Chris@0: */ Chris@0: public function previousAll() Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: return $this->createSubCrawler($this->sibling($this->getNode(0), 'previousSibling')); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the parents nodes of the current selection. 
Chris@0: * Chris@0: * @return self Chris@0: * Chris@0: * @throws \InvalidArgumentException When current node is empty Chris@0: */ Chris@0: public function parents() Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: $node = $this->getNode(0); Chris@17: $nodes = []; Chris@0: Chris@0: while ($node = $node->parentNode) { Chris@0: if (XML_ELEMENT_NODE === $node->nodeType) { Chris@0: $nodes[] = $node; Chris@0: } Chris@0: } Chris@0: Chris@0: return $this->createSubCrawler($nodes); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the children nodes of the current selection. Chris@0: * Chris@0: * @return self Chris@0: * Chris@0: * @throws \InvalidArgumentException When current node is empty Chris@0: */ Chris@0: public function children() Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: $node = $this->getNode(0)->firstChild; Chris@0: Chris@17: return $this->createSubCrawler($node ? $this->sibling($node) : []); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the attribute value of the first node of the list. Chris@0: * Chris@0: * @param string $attribute The attribute name Chris@0: * Chris@0: * @return string|null The attribute value or null if the attribute does not exist Chris@0: * Chris@0: * @throws \InvalidArgumentException When current node is empty Chris@0: */ Chris@0: public function attr($attribute) Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: $node = $this->getNode(0); Chris@0: Chris@0: return $node->hasAttribute($attribute) ? $node->getAttribute($attribute) : null; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the node name of the first node of the list. 
Chris@0: * Chris@0: * @return string The node name Chris@0: * Chris@0: * @throws \InvalidArgumentException When current node is empty Chris@0: */ Chris@0: public function nodeName() Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: return $this->getNode(0)->nodeName; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the node value of the first node of the list. Chris@0: * Chris@0: * @return string The node value Chris@0: * Chris@0: * @throws \InvalidArgumentException When current node is empty Chris@0: */ Chris@0: public function text() Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: return $this->getNode(0)->nodeValue; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns the first node of the list as HTML. Chris@0: * Chris@0: * @return string The node html Chris@0: * Chris@0: * @throws \InvalidArgumentException When current node is empty Chris@0: */ Chris@0: public function html() Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: $html = ''; Chris@0: foreach ($this->getNode(0)->childNodes as $child) { Chris@0: $html .= $child->ownerDocument->saveHTML($child); Chris@0: } Chris@0: Chris@0: return $html; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Evaluates an XPath expression. Chris@0: * Chris@0: * Since an XPath expression might evaluate to either a simple type or a \DOMNodeList, Chris@0: * this method will return either an array of simple types or a new Crawler instance. 
Chris@0: * Chris@0: * @param string $xpath An XPath expression Chris@0: * Chris@0: * @return array|Crawler An array of evaluation results or a new Crawler instance Chris@0: */ Chris@0: public function evaluate($xpath) Chris@0: { Chris@0: if (null === $this->document) { Chris@0: throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.'); Chris@0: } Chris@0: Chris@17: $data = []; Chris@0: $domxpath = $this->createDOMXPath($this->document, $this->findNamespacePrefixes($xpath)); Chris@0: Chris@0: foreach ($this->nodes as $node) { Chris@0: $data[] = $domxpath->evaluate($xpath, $node); Chris@0: } Chris@0: Chris@0: if (isset($data[0]) && $data[0] instanceof \DOMNodeList) { Chris@0: return $this->createSubCrawler($data); Chris@0: } Chris@0: Chris@0: return $data; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Extracts information from the list of nodes. Chris@0: * Chris@0: * You can extract attributes or/and the node value (_text). Chris@0: * Chris@0: * Example: Chris@0: * Chris@17: * $crawler->filter('h1 a')->extract(['_text', 'href']); Chris@0: * Chris@0: * @param array $attributes An array of attributes Chris@0: * Chris@0: * @return array An array of extracted values Chris@0: */ Chris@0: public function extract($attributes) Chris@0: { Chris@0: $attributes = (array) $attributes; Chris@17: $count = \count($attributes); Chris@0: Chris@17: $data = []; Chris@0: foreach ($this->nodes as $node) { Chris@17: $elements = []; Chris@0: foreach ($attributes as $attribute) { Chris@0: if ('_text' === $attribute) { Chris@0: $elements[] = $node->nodeValue; Chris@0: } else { Chris@0: $elements[] = $node->getAttribute($attribute); Chris@0: } Chris@0: } Chris@0: Chris@13: $data[] = 1 === $count ? $elements[0] : $elements; Chris@0: } Chris@0: Chris@0: return $data; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Filters the list of nodes with an XPath expression. 
Chris@0: * Chris@0: * The XPath expression is evaluated in the context of the crawler, which Chris@0: * is considered as a fake parent of the elements inside it. Chris@0: * This means that a child selector "div" or "./div" will match only Chris@0: * the div elements of the current crawler, not their children. Chris@0: * Chris@0: * @param string $xpath An XPath expression Chris@0: * Chris@0: * @return self Chris@0: */ Chris@0: public function filterXPath($xpath) Chris@0: { Chris@0: $xpath = $this->relativize($xpath); Chris@0: Chris@0: // If we dropped all expressions in the XPath while preparing it, there would be no match Chris@0: if ('' === $xpath) { Chris@0: return $this->createSubCrawler(null); Chris@0: } Chris@0: Chris@0: return $this->filterRelativeXPath($xpath); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Filters the list of nodes with a CSS selector. Chris@0: * Chris@0: * This method only works if you have installed the CssSelector Symfony Component. Chris@0: * Chris@0: * @param string $selector A CSS selector Chris@0: * Chris@0: * @return self Chris@0: * Chris@0: * @throws \RuntimeException if the CssSelector Component is not available Chris@0: */ Chris@0: public function filter($selector) Chris@0: { Chris@12: if (!class_exists(CssSelectorConverter::class)) { Chris@12: throw new \RuntimeException('To filter with a CSS selector, install the CssSelector component ("composer require symfony/css-selector"). Or use filterXpath instead.'); Chris@0: } Chris@0: Chris@0: $converter = new CssSelectorConverter($this->isHtml); Chris@0: Chris@0: // The CssSelector already prefixes the selector with descendant-or-self:: Chris@0: return $this->filterRelativeXPath($converter->toXPath($selector)); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Selects links by name or alt value for clickable images. 
Chris@0: * Chris@0: * @param string $value The link text Chris@0: * Chris@0: * @return self Chris@0: */ Chris@0: public function selectLink($value) Chris@0: { Chris@0: $xpath = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', static::xpathLiteral(' '.$value.' ')). Chris@0: sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', static::xpathLiteral(' '.$value.' ')); Chris@0: Chris@0: return $this->filterRelativeXPath($xpath); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Selects images by alt value. Chris@0: * Chris@0: * @param string $value The image alt Chris@0: * Chris@0: * @return self A new instance of Crawler with the filtered list of nodes Chris@0: */ Chris@0: public function selectImage($value) Chris@0: { Chris@0: $xpath = sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', static::xpathLiteral($value)); Chris@0: Chris@0: return $this->filterRelativeXPath($xpath); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Selects a button by name or alt value for images. Chris@0: * Chris@0: * @param string $value The button text Chris@0: * Chris@0: * @return self Chris@0: */ Chris@0: public function selectButton($value) Chris@0: { Chris@0: $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")'; Chris@12: $xpath = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%1$s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, static::xpathLiteral(' '.$value.' ')). Chris@0: sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $translate, static::xpathLiteral(' '.$value.' '), static::xpathLiteral($value), static::xpathLiteral($value)). 
Chris@0: sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', static::xpathLiteral(' '.$value.' '), static::xpathLiteral($value), static::xpathLiteral($value)); Chris@0: Chris@0: return $this->filterRelativeXPath($xpath); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns a Link object for the first node in the list. Chris@0: * Chris@0: * @param string $method The method for the link (get by default) Chris@0: * Chris@0: * @return Link A Link instance Chris@0: * Chris@0: * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement Chris@0: */ Chris@0: public function link($method = 'get') Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: $node = $this->getNode(0); Chris@0: Chris@0: if (!$node instanceof \DOMElement) { Chris@17: throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', \get_class($node))); Chris@0: } Chris@0: Chris@0: return new Link($node, $this->baseHref, $method); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns an array of Link objects for the nodes in the list. 
Chris@0: * Chris@0: * @return Link[] An array of Link instances Chris@0: * Chris@0: * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances Chris@0: */ Chris@0: public function links() Chris@0: { Chris@17: $links = []; Chris@0: foreach ($this->nodes as $node) { Chris@0: if (!$node instanceof \DOMElement) { Chris@17: throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', \get_class($node))); Chris@0: } Chris@0: Chris@0: $links[] = new Link($node, $this->baseHref, 'get'); Chris@0: } Chris@0: Chris@0: return $links; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns an Image object for the first node in the list. Chris@0: * Chris@0: * @return Image An Image instance Chris@0: * Chris@0: * @throws \InvalidArgumentException If the current node list is empty Chris@0: */ Chris@0: public function image() Chris@0: { Chris@17: if (!\count($this)) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: $node = $this->getNode(0); Chris@0: Chris@0: if (!$node instanceof \DOMElement) { Chris@17: throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', \get_class($node))); Chris@0: } Chris@0: Chris@0: return new Image($node, $this->baseHref); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns an array of Image objects for the nodes in the list. 
Chris@0: * Chris@0: * @return Image[] An array of Image instances Chris@0: */ Chris@0: public function images() Chris@0: { Chris@17: $images = []; Chris@0: foreach ($this as $node) { Chris@0: if (!$node instanceof \DOMElement) { Chris@17: throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', \get_class($node))); Chris@0: } Chris@0: Chris@0: $images[] = new Image($node, $this->baseHref); Chris@0: } Chris@0: Chris@0: return $images; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Returns a Form object for the first node in the list. Chris@0: * Chris@0: * @param array $values An array of values for the form fields Chris@0: * @param string $method The method for the form Chris@0: * Chris@0: * @return Form A Form instance Chris@0: * Chris@0: * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement Chris@0: */ Chris@0: public function form(array $values = null, $method = null) Chris@0: { Chris@0: if (!$this->nodes) { Chris@0: throw new \InvalidArgumentException('The current node list is empty.'); Chris@0: } Chris@0: Chris@0: $node = $this->getNode(0); Chris@0: Chris@0: if (!$node instanceof \DOMElement) { Chris@17: throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', \get_class($node))); Chris@0: } Chris@0: Chris@0: $form = new Form($node, $this->uri, $method, $this->baseHref); Chris@0: Chris@0: if (null !== $values) { Chris@0: $form->setValues($values); Chris@0: } Chris@0: Chris@0: return $form; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Overloads a default namespace prefix to be used with XPath and CSS expressions. 
Chris@0: * Chris@0: * @param string $prefix Chris@0: */ Chris@0: public function setDefaultNamespacePrefix($prefix) Chris@0: { Chris@0: $this->defaultNamespacePrefix = $prefix; Chris@0: } Chris@0: Chris@0: /** Chris@0: * @param string $prefix Chris@0: * @param string $namespace Chris@0: */ Chris@0: public function registerNamespace($prefix, $namespace) Chris@0: { Chris@0: $this->namespaces[$prefix] = $namespace; Chris@0: } Chris@0: Chris@0: /** Chris@0: * Converts string for XPath expressions. Chris@0: * Chris@0: * Escaped characters are: quotes (") and apostrophe ('). Chris@0: * Chris@0: * Examples: Chris@17: * Chris@0: * echo Crawler::xpathLiteral('foo " bar'); Chris@0: * //prints 'foo " bar' Chris@0: * Chris@0: * echo Crawler::xpathLiteral("foo ' bar"); Chris@0: * //prints "foo ' bar" Chris@0: * Chris@0: * echo Crawler::xpathLiteral('a\'b"c'); Chris@0: * //prints concat('a', "'", 'b"c') Chris@17: * Chris@0: * Chris@0: * @param string $s String to be escaped Chris@0: * Chris@0: * @return string Converted string Chris@0: */ Chris@0: public static function xpathLiteral($s) Chris@0: { Chris@0: if (false === strpos($s, "'")) { Chris@0: return sprintf("'%s'", $s); Chris@0: } Chris@0: Chris@0: if (false === strpos($s, '"')) { Chris@0: return sprintf('"%s"', $s); Chris@0: } Chris@0: Chris@0: $string = $s; Chris@17: $parts = []; Chris@0: while (true) { Chris@0: if (false !== $pos = strpos($string, "'")) { Chris@0: $parts[] = sprintf("'%s'", substr($string, 0, $pos)); Chris@0: $parts[] = "\"'\""; Chris@0: $string = substr($string, $pos + 1); Chris@0: } else { Chris@0: $parts[] = "'$string'"; Chris@0: break; Chris@0: } Chris@0: } Chris@0: Chris@0: return sprintf('concat(%s)', implode(', ', $parts)); Chris@0: } Chris@0: Chris@0: /** Chris@0: * Filters the list of nodes with an XPath expression. Chris@0: * Chris@0: * The XPath expression should already be processed to apply it in the context of each node. 
*
     * @param string $xpath
     *
     * @return self
     */
    private function filterRelativeXPath($xpath)
    {
        // Namespace prefixes used in the expression must be registered on every DOMXPath instance.
        $prefixes = $this->findNamespacePrefixes($xpath);

        // Start from an empty sub-crawler and accumulate the matches of each context node.
        $crawler = $this->createSubCrawler(null);

        foreach ($this->nodes as $node) {
            $domxpath = $this->createDOMXPath($node->ownerDocument, $prefixes);
            // Evaluate the expression with the current node as the XPath context node.
            $crawler->add($domxpath->query($xpath, $node));
        }

        return $crawler;
    }

    /**
     * Make the XPath relative to the current context.
     *
     * The returned XPath will match elements matching the XPath inside the current crawler
     * when running in the context of a node of the crawler.
     *
     * The expression is split on its top-level union operators ("|"), i.e. the ones that are
     * neither inside a quoted string nor inside a "[...]" predicate. Each member is rewritten
     * on its own and the members are joined again with " | ".
     *
     * @param string $xpath
     *
     * @return string The rewritten expression, or $xpath unchanged when it cannot be parsed
     */
    private function relativize($xpath)
    {
        $expressions = [];

        // An expression which will never match, used to replace expressions which cannot match in the crawler.
        // Such a member is replaced rather than dropped; presumably because its slot in the union (and any
        // opening parenthesis collected together with it) must be kept for the result to stay well-formed.
        $nonMatchingExpression = 'a[name() = "b"]';

        $xpathLen = \strlen($xpath);
        $openedBrackets = 0;
        // Skip leading whitespace.
        $startPosition = strspn($xpath, " \t\n\r\0\x0B");

        for ($i = $startPosition; $i <= $xpathLen; ++$i) {
            // Jump to the next character of interest (quote, bracket or union operator) or to the end.
            $i += strcspn($xpath, '"\'[]|', $i);

            if ($i < $xpathLen) {
                switch ($xpath[$i]) {
                    case '"':
                    case "'":
                        // Move to the matching closing quote; the loop increment then steps past it.
                        if (false === $i = strpos($xpath, $xpath[$i], $i + 1)) {
                            return $xpath; // The XPath expression is invalid
                        }
                        continue 2;
                    case '[':
                        ++$openedBrackets;
                        continue 2;
                    case ']':
                        --$openedBrackets;
                        continue 2;
                }
            }
            // From here on, $i points at a "|" or at the end of the string.
            if ($openedBrackets) {
                // A "|" inside a predicate does not separate top-level union members.
                continue;
            }

            if ($startPosition < $xpathLen && '(' === $xpath[$startPosition]) {
                // If the union is inside some braces, we need to preserve the opening braces and apply
                // the change only inside it.
                $j = 1 + strspn($xpath, "( \t\n\r\0\x0B", $startPosition + 1);
                $parenthesis = substr($xpath, $startPosition, $j);
                $startPosition += $j;
            } else {
                $parenthesis = '';
            }
            $expression = rtrim(substr($xpath, $startPosition, $i - $startPosition));

            // Normalize "self::*/" to "./" so that it is handled by the "./" branches below.
            if (0 === strpos($expression, 'self::*/')) {
                $expression = './'.substr($expression, 8);
            }

            // add prefix before absolute element selector
            if ('' === $expression) {
                $expression = $nonMatchingExpression;
            } elseif (0 === strpos($expression, '//')) {
                $expression = 'descendant-or-self::'.substr($expression, 2);
            } elseif (0 === strpos($expression, './/')) {
                $expression = 'descendant-or-self::'.substr($expression, 3);
            } elseif (0 === strpos($expression, './')) {
                $expression = 'self::'.substr($expression, 2);
            } elseif (0 === strpos($expression, 'child::')) {
                $expression = 'self::'.substr($expression, 7);
            } elseif ('/' === $expression[0] || '.' === $expression[0] || 0 === strpos($expression, 'self::')) {
                $expression = $nonMatchingExpression;
            } elseif (0 === strpos($expression, 'descendant::')) {
                $expression = 'descendant-or-self::'.substr($expression, 12);
            } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) {
                // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
                $expression = $nonMatchingExpression;
            } elseif (0 !== strpos($expression, 'descendant-or-self::')) {
                $expression = 'self::'.$expression;
            }
            $expressions[] = $parenthesis.$expression;

            if ($i === $xpathLen) {
                // The last member has been processed: rebuild the union.
                return implode(' | ', $expressions);
            }

            // Skip the whitespace following the "|"; the next member starts right after it.
            $i += strspn($xpath, " \t\n\r\0\x0B", $i + 1);
            $startPosition = $i + 1;
        }

        return $xpath; // The XPath expression is invalid
    }

    /**
     * Returns the node stored at the given position of the node list.
     *
     * @param int $position
     *
     * @return \DOMElement|null The node, or null when nothing is stored at that position
     */
    public function getNode($position)
    {
        return isset($this->nodes[$position]) ? $this->nodes[$position] : null;
    }

    /**
     * Returns the number of nodes held by the crawler.
     *
     * @return int
     */
    public function count()
    {
        return \count($this->nodes);
    }

    /**
     * Returns an iterator over the nodes held by the crawler.
     *
     * @return \ArrayIterator|\DOMElement[]
     */
    public function getIterator()
    {
        return new \ArrayIterator($this->nodes);
    }

    /**
     * Collects the element nodes found by walking from $node in the given direction.
     *
     * The walk starts with $node itself and follows the $siblingDir property until it is empty.
     * Only element nodes are kept, and the first node of the crawler is always skipped.
     *
     * @param \DOMElement $node
     * @param string      $siblingDir Name of the node property to follow (e.g. "nextSibling")
     *
     * @return array
     */
    protected function sibling($node, $siblingDir = 'nextSibling')
    {
        $nodes = [];

        // The node list is not modified while walking, so the first node is looked up only once.
        $currentNode = $this->getNode(0);

        do {
            if ($node !== $currentNode && \XML_ELEMENT_NODE === $node->nodeType) {
                $nodes[] = $node;
            }
        } while ($node = $node->$siblingDir);

        return $nodes;
    }

    /**
     * Creates a DOMXPath for the document and registers the namespaces of the given prefixes.
     *
     * Prefixes for which no namespace can be discovered are left unregistered.
     *
     * @param \DOMDocument $document
     * @param array        $prefixes
     *
     * @return \DOMXPath
     *
     * @throws \InvalidArgumentException
     */
    private function createDOMXPath(\DOMDocument $document, array $prefixes = [])
    {
        $domxpath = new \DOMXPath($document);

        foreach ($prefixes as $prefix) {
            $namespace = $this->discoverNamespace($domxpath, $prefix);
            if (null !== $namespace) {
                $domxpath->registerNamespace($prefix, $namespace);
            }
        }

        return $domxpath;
    }

    /**
     * Resolves the namespace URI bound to a prefix.
     *
     * Manually registered namespaces take precedence; otherwise the document is queried
     * for a namespace node with that name. The default namespace prefix is looked up as
     * the namespace node with an empty name.
     *
     * @param \DOMXPath $domxpath
     * @param string    $prefix
     *
     * @return string|null The namespace URI, or null when none is found
     *
     * @throws \InvalidArgumentException
     */
    private function discoverNamespace(\DOMXPath $domxpath, $prefix)
    {
        if (isset($this->namespaces[$prefix])) {
            return $this->namespaces[$prefix];
        }

        // ask for one namespace, otherwise we'd get a collection with an item for each node
        $namespaces = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $this->defaultNamespacePrefix === $prefix ? '' : $prefix));

        if ($node = $namespaces->item(0)) {
            return $node->nodeValue;
        }

        return null;
    }

    /**
     * Extracts the namespace prefixes used in an XPath expression.
     *
     * A prefix is a name followed by a single colon. The character after the colon must not
     * be a double quote, a slash or another colon, so axis specifiers ("axis::") are not
     * reported as prefixes.
     *
     * @param string $xpath
     *
     * @return array The distinct prefixes found, or an empty array when there is none
     */
    private function findNamespacePrefixes($xpath)
    {
        // The named group "prefix" is what fills $matches['prefix'] below.
        if (preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches)) {
            return array_unique($matches['prefix']);
        }

        return [];
    }

    /**
     * Creates a crawler for some subnodes.
     *
     * The new crawler shares this crawler's URI, base href, HTML/XML mode, document
     * and registered namespaces.
     *
     * @param \DOMElement|\DOMElement[]|\DOMNodeList|null $nodes
     *
     * @return static
     */
    private function createSubCrawler($nodes)
    {
        $crawler = new static($nodes, $this->uri, $this->baseHref);
        $crawler->isHtml = $this->isHtml;
        $crawler->document = $this->document;
        $crawler->namespaces = $this->namespaces;

        return $crawler;
    }
}