Mercurial > hg > cmmr2012-drupal-site
comparison vendor/symfony/dom-crawler/Crawler.php @ 0:c75dbcec494b
Initial commit from drush-created site
author | Chris Cannam |
---|---|
date | Thu, 05 Jul 2018 14:24:15 +0000 |
parents | |
children | a9cd425dd02b |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c75dbcec494b |
---|---|
1 <?php | |
2 | |
3 /* | |
4 * This file is part of the Symfony package. | |
5 * | |
6 * (c) Fabien Potencier <fabien@symfony.com> | |
7 * | |
8 * For the full copyright and license information, please view the LICENSE | |
9 * file that was distributed with this source code. | |
10 */ | |
11 | |
12 namespace Symfony\Component\DomCrawler; | |
13 | |
14 use Symfony\Component\CssSelector\CssSelectorConverter; | |
15 | |
16 /** | |
17 * Crawler eases navigation of a list of \DOMNode objects. | |
18 * | |
19 * @author Fabien Potencier <fabien@symfony.com> | |
20 */ | |
21 class Crawler implements \Countable, \IteratorAggregate | |
22 { | |
/**
 * @var string|null The current URI as passed to the constructor (returned unchanged by getUri())
 */
23 protected $uri; | |
24 | |
25 /** | |
26 * @var string The default namespace prefix to be used with XPath and CSS expressions | |
27 */ | |
28 private $defaultNamespacePrefix = 'default'; | |
29 | |
30 /** | |
31 * @var array A map of manually registered namespaces | |
32 */ | |
33 private $namespaces = array(); | |
34 | |
35 /** | |
36 * @var string The base href value | |
37 */ | |
38 private $baseHref; | |
39 | |
40 /** | |
41 * @var \DOMDocument|null | |
42 */ | |
43 private $document; | |
44 | |
45 /** | |
46 * @var \DOMElement[] | |
47 */ | |
48 private $nodes = array(); | |
49 | |
50 /** | |
51 * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath). | |
52 * | |
53 * @var bool | |
54 */ | |
55 private $isHtml = true; | |
56 | |
/**
 * Creates a crawler and seeds it with the given content.
 *
 * @param mixed  $node     Initial content, forwarded unchanged to add()
 * @param string $uri      The current URI
 * @param string $baseHref The base href value; when empty, $uri is used instead
 */
public function __construct($node = null, $uri = null, $baseHref = null)
{
    $this->uri = $uri;

    // A missing (falsy) base href falls back to the current URI.
    if ($baseHref) {
        $this->baseHref = $baseHref;
    } else {
        $this->baseHref = $uri;
    }

    $this->add($node);
}
69 | |
/**
 * Gives back the URI this crawler was constructed with.
 *
 * @return string The current URI
 */
public function getUri()
{
    return $this->uri;
}
79 | |
/**
 * Gives back the base href currently stored on this crawler.
 *
 * @return string The base href value
 */
public function getBaseHref()
{
    return $this->baseHref;
}
89 | |
/**
 * Empties the crawler: forgets the owning document and drops every node.
 */
public function clear()
{
    $this->document = null;
    $this->nodes = [];
}
98 | |
/**
 * Adds content to the current list of nodes.
 *
 * Dispatches to the specialized add*() method matching the argument type;
 * null is accepted and ignored.
 *
 * @param \DOMNodeList|\DOMNode|array|string|null $node A node
 *
 * @throws \InvalidArgumentException when node is not the expected type
 */
public function add($node)
{
    if (null === $node) {
        return;
    }

    if ($node instanceof \DOMNodeList) {
        $this->addNodeList($node);

        return;
    }

    if ($node instanceof \DOMNode) {
        $this->addNode($node);

        return;
    }

    if (is_array($node)) {
        $this->addNodes($node);

        return;
    }

    if (is_string($node)) {
        $this->addContent($node);

        return;
    }

    $actualType = is_object($node) ? get_class($node) : gettype($node);

    throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', $actualType));
}
123 | |
/**
 * Adds HTML/XML content.
 *
 * If the charset is not set via the content type, it is assumed to be UTF-8,
 * or ISO-8859-1 as a fallback, which is the default charset defined by the
 * HTTP 1.1 specification.
 *
 * The content type is matched case-insensitively ("TEXT/XML" is parsed as
 * XML, like "text/xml"). A charset parameter may be quoted
 * (charset="utf-8"); the quotes and surrounding whitespace are ignored.
 * Content whose type mentions neither "xml" nor "html" is silently skipped.
 *
 * @param string      $content A string to parse as HTML/XML
 * @param null|string $type    The content type of the string
 */
public function addContent($content, $type = null)
{
    if (empty($type)) {
        $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
    }

    // DOM only for HTML/XML content
    if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
        return;
    }

    $charset = null;
    if (false !== $pos = stripos($type, 'charset=')) {
        $charset = substr($type, $pos + 8);
        if (false !== $pos = strpos($charset, ';')) {
            $charset = substr($charset, 0, $pos);
        }

        // Parameter values may be quoted strings; a quoted or padded charset is not a usable encoding name.
        $charset = trim($charset, " \t\r\n\"'");
        if ('' === $charset) {
            // Nothing usable was declared: fall through to the detection below.
            $charset = null;
        }
    }

    // http://www.w3.org/TR/encoding/#encodings
    // http://www.w3.org/TR/REC-xml/#NT-EncName
    if (null === $charset &&
        preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
        $charset = $matches[1];
    }

    if (null === $charset) {
        $charset = preg_match('//u', $content) ? 'UTF-8' : 'ISO-8859-1';
    }

    // The pattern above is case-insensitive, so the captured group may be "X" or "HT" as well.
    if ('x' === strtolower($xmlMatches[1])) {
        $this->addXmlContent($content, $charset);
    } else {
        $this->addHtmlContent($content, $charset);
    }
}
170 | |
/**
 * Adds an HTML content to the list of nodes.
 *
 * The libxml errors are disabled when the content is parsed.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * The charset conversion is best-effort: when it fails (including for an
 * unknown charset name), the content is parsed unconverted. The temporary
 * error handler and the previous libxml settings are restored even when
 * something unexpected is thrown.
 *
 * If the parsed document contains a <base> element with a non-empty href,
 * the crawler's base href is updated from it (resolved against the current
 * base href when there is one).
 *
 * @param string $content The HTML content
 * @param string $charset The charset
 */
public function addHtmlContent($content, $charset = 'UTF-8')
{
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntities = libxml_disable_entity_loader(true);

    try {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        // Turn warnings/notices raised by the conversion into exceptions so they can be swallowed below.
        set_error_handler(function () { throw new \Exception(); });

        try {
            // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
            $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
        } catch (\Exception $e) {
            // Conversion failed: keep the original content.
        } catch (\ValueError $e) {
            // PHP >= 8 throws ValueError (not an \Exception) for an unknown encoding name;
            // treat it like any other failed conversion. On older PHP this clause never matches.
        } finally {
            // Always unregister the temporary handler, whatever happened above.
            restore_error_handler();
        }

        if ('' !== trim($content)) {
            @$dom->loadHTML($content);
        }
    } finally {
        libxml_use_internal_errors($internalErrors);
        libxml_disable_entity_loader($disableEntities);
    }

    $this->addDocument($dom);

    $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));

    $baseHref = current($base);
    if (count($base) && !empty($baseHref)) {
        if ($this->baseHref) {
            // Resolve the document's <base href> against the base href already known.
            $linkNode = $dom->createElement('a');
            $linkNode->setAttribute('href', $baseHref);
            $link = new Link($linkNode, $this->baseHref);
            $this->baseHref = $link->getUri();
        } else {
            $this->baseHref = $baseHref;
        }
    }
}
225 | |
/**
 * Adds an XML content to the list of nodes.
 *
 * The libxml errors are disabled when the content is parsed.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * After this call the crawler is flagged as holding XML (not HTML) content.
 *
 * @param string $content The XML content
 * @param string $charset The charset
 * @param int    $options Bitwise OR of the libxml option constants
 *                        LIBXML_PARSEHUGE is dangerous, see
 *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
 */
public function addXmlContent($content, $charset = 'UTF-8', $options = LIBXML_NONET)
{
    // When no prefixed namespace ("xmlns:") is declared, rename every "xmlns" occurrence to "ns"
    // so that the default namespace does not get in the way of XPath expressions.
    $declaresPrefixedNamespace = preg_match('/xmlns:/', $content);
    if (!$declaresPrefixedNamespace) {
        $content = str_replace('xmlns', 'ns', $content);
    }

    $previousInternalErrors = libxml_use_internal_errors(true);
    $previousEntityLoader = libxml_disable_entity_loader(true);

    $document = new \DOMDocument('1.0', $charset);
    $document->validateOnParse = true;

    $isBlank = '' === trim($content);
    if (!$isBlank) {
        @$document->loadXML($content, $options);
    }

    // Put the libxml settings back the way they were before parsing.
    libxml_use_internal_errors($previousInternalErrors);
    libxml_disable_entity_loader($previousEntityLoader);

    $this->addDocument($document);

    $this->isHtml = false;
}
266 | |
/**
 * Adds the root element of a \DOMDocument to the list of nodes.
 *
 * A document without a root element adds nothing.
 *
 * @param \DOMDocument $dom A \DOMDocument instance
 */
public function addDocument(\DOMDocument $dom)
{
    $root = $dom->documentElement;
    if (!$root) {
        return;
    }

    $this->addNode($root);
}
278 | |
/**
 * Adds every \DOMNode found in a \DOMNodeList to the list of nodes.
 *
 * Entries that are not \DOMNode instances are skipped.
 *
 * @param \DOMNodeList $nodes A \DOMNodeList instance
 */
public function addNodeList(\DOMNodeList $nodes)
{
    foreach ($nodes as $candidate) {
        if (!$candidate instanceof \DOMNode) {
            continue;
        }

        $this->addNode($candidate);
    }
}
292 | |
/**
 * Adds each entry of an array to the list of nodes.
 *
 * Every entry is passed to add(), so the type checks of add() apply to it.
 *
 * @param \DOMNode[] $nodes An array of \DOMNode instances
 */
public function addNodes(array $nodes)
{
    foreach ($nodes as $entry) {
        $this->add($entry);
    }
}
304 | |
/**
 * Adds a \DOMNode instance to the list of nodes.
 *
 * A \DOMDocument is replaced by its root element; a document without a
 * root element adds nothing. The first node added binds the crawler to
 * that node's owner document. A node already present is not added twice.
 *
 * @param \DOMNode $node A \DOMNode instance
 *
 * @throws \InvalidArgumentException When the node belongs to another document than the nodes already added
 */
public function addNode(\DOMNode $node)
{
    if ($node instanceof \DOMDocument) {
        $node = $node->documentElement;

        // An empty document has no root element: there is nothing to add, and
        // dereferencing null below would store a null entry in the node list.
        if (null === $node) {
            return;
        }
    }

    if (null !== $this->document && $this->document !== $node->ownerDocument) {
        throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
    }

    if (null === $this->document) {
        $this->document = $node->ownerDocument;
    }

    // Don't add duplicate nodes in the Crawler
    if (in_array($node, $this->nodes, true)) {
        return;
    }

    $this->nodes[] = $node;
}
331 | |
/**
 * Returns a crawler for the node at the given position in the node list.
 *
 * When no node exists at that position, the sub-crawler is created from null.
 *
 * @param int $position The position
 *
 * @return self
 */
public function eq($position)
{
    $selected = isset($this->nodes[$position]) ? $this->nodes[$position] : null;

    return $this->createSubCrawler($selected);
}
347 | |
/**
 * Calls an anonymous function on each node of the list.
 *
 * The anonymous function receives the node wrapped in a Crawler instance
 * as first argument and its position in the list as second argument.
 *
 * Example:
 *
 *     $crawler->filter('h1')->each(function ($node, $i) {
 *         return $node->text();
 *     });
 *
 * @param \Closure $closure An anonymous function
 *
 * @return array An array of values returned by the anonymous function
 */
public function each(\Closure $closure)
{
    $results = [];
    foreach ($this->nodes as $index => $domNode) {
        $wrapped = $this->createSubCrawler($domNode);
        $results[] = $closure($wrapped, $index);
    }

    return $results;
}
373 | |
/**
 * Returns a crawler for a slice of the node list.
 *
 * $offset and $length are handed to array_slice() as they are.
 *
 * @param int $offset
 * @param int $length
 *
 * @return self
 */
public function slice($offset = 0, $length = null)
{
    $selection = array_slice($this->nodes, $offset, $length);

    return $this->createSubCrawler($selection);
}
386 | |
/**
 * Reduces the list of nodes by calling an anonymous function.
 *
 * The function receives each node wrapped in a Crawler and its position.
 * A node is removed only when the function returns exactly false.
 *
 * @param \Closure $closure An anonymous function
 *
 * @return self
 */
public function reduce(\Closure $closure)
{
    $kept = [];
    foreach ($this->nodes as $index => $domNode) {
        $verdict = $closure($this->createSubCrawler($domNode), $index);
        if (false === $verdict) {
            continue;
        }

        $kept[] = $domNode;
    }

    return $this->createSubCrawler($kept);
}
407 | |
/**
 * Returns a crawler for the first node of the current selection.
 *
 * Shortcut for eq(0).
 *
 * @return self
 */
public function first()
{
    $firstIndex = 0;

    return $this->eq($firstIndex);
}
417 | |
/**
 * Returns a crawler for the last node of the current selection.
 *
 * Shortcut for eq() with the highest position of the node list.
 *
 * @return self
 */
public function last()
{
    $lastIndex = count($this->nodes) - 1;

    return $this->eq($lastIndex);
}
427 | |
/**
 * Returns the sibling nodes of the first node of the current selection.
 *
 * The siblings are collected by sibling(), starting at the first child of
 * that node's parent.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function siblings()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstOfParent = $this->getNode(0)->parentNode->firstChild;
    $collected = $this->sibling($firstOfParent);

    return $this->createSubCrawler($collected);
}
443 | |
/**
 * Returns the next sibling nodes of the first node of the current selection.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nextAll()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $start = $this->getNode(0);
    $collected = $this->sibling($start);

    return $this->createSubCrawler($collected);
}
459 | |
/**
 * Returns the previous sibling nodes of the first node of the current selection.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function previousAll()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $start = $this->getNode(0);
    // Walk the sibling chain through the "previousSibling" property.
    $collected = $this->sibling($start, 'previousSibling');

    return $this->createSubCrawler($collected);
}
475 | |
/**
 * Returns the ancestor elements of the first node of the current selection.
 *
 * Ancestors are listed from the closest one outwards; only element nodes
 * are kept.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function parents()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $ancestors = [];

    for ($current = $this->getNode(0)->parentNode; $current; $current = $current->parentNode) {
        if (XML_ELEMENT_NODE === $current->nodeType) {
            $ancestors[] = $current;
        }
    }

    return $this->createSubCrawler($ancestors);
}
500 | |
/**
 * Returns the children of the first node of the current selection.
 *
 * The children are collected by sibling(), starting at the node's first
 * child; a node without children yields a crawler built from an empty array.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function children()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstChild = $this->getNode(0)->firstChild;
    if (!$firstChild) {
        return $this->createSubCrawler([]);
    }

    return $this->createSubCrawler($this->sibling($firstChild));
}
518 | |
/**
 * Returns the value of an attribute of the first node of the list.
 *
 * @param string $attribute The attribute name
 *
 * @return string|null The attribute value or null if the attribute does not exist
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function attr($attribute)
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);
    if (!$element->hasAttribute($attribute)) {
        return null;
    }

    return $element->getAttribute($attribute);
}
538 | |
/**
 * Returns the name of the first node of the list.
 *
 * @return string The node name
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nodeName()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    return $firstNode->nodeName;
}
554 | |
/**
 * Returns the value (nodeValue) of the first node of the list.
 *
 * @return string The node value
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function text()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    return $firstNode->nodeValue;
}
570 | |
/**
 * Returns the content of the first node of the list as HTML.
 *
 * The result is the concatenation of the serialized child nodes; the node's
 * own tag is not part of it.
 *
 * @return string The node html
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function html()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $fragments = [];
    foreach ($this->getNode(0)->childNodes as $childNode) {
        $fragments[] = $childNode->ownerDocument->saveHTML($childNode);
    }

    return implode('', $fragments);
}
591 | |
/**
 * Evaluates an XPath expression against every node of the list.
 *
 * Since an XPath expression might evaluate to either a simple type or a \DOMNodeList,
 * this method will return either an array of simple types or a new Crawler instance.
 * The decision is taken on the result obtained for the first node.
 *
 * @param string $xpath An XPath expression
 *
 * @return array|Crawler An array of evaluation results or a new Crawler instance
 *
 * @throws \LogicException When the crawler holds no document yet
 */
public function evaluate($xpath)
{
    if (null === $this->document) {
        throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
    }

    $prefixes = $this->findNamespacePrefixes($xpath);
    $domxpath = $this->createDOMXPath($this->document, $prefixes);

    $results = [];
    foreach ($this->nodes as $contextNode) {
        $results[] = $domxpath->evaluate($xpath, $contextNode);
    }

    $yieldsNodeLists = isset($results[0]) && $results[0] instanceof \DOMNodeList;

    return $yieldsNodeLists ? $this->createSubCrawler($results) : $results;
}
621 | |
/**
 * Extracts information from the list of nodes.
 *
 * You can extract attributes or/and the node value (_text).
 * When exactly one attribute is requested, each entry of the result is the
 * value itself; otherwise each entry is an array of values in the requested order.
 *
 * Example:
 *
 *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
 *
 * @param array $attributes An array of attributes
 *
 * @return array An array of extracted values
 */
public function extract($attributes)
{
    $attributes = (array) $attributes;
    $singleValue = 1 === count($attributes);

    $rows = [];
    foreach ($this->nodes as $node) {
        $row = [];
        foreach ($attributes as $name) {
            $row[] = '_text' === $name ? $node->nodeValue : $node->getAttribute($name);
        }

        $rows[] = $singleValue ? $row[0] : $row;
    }

    return $rows;
}
656 | |
/**
 * Filters the list of nodes with an XPath expression.
 *
 * The XPath expression is evaluated in the context of the crawler, which
 * is considered as a fake parent of the elements inside it.
 * This means that a child selector "div" or "./div" will match only
 * the div elements of the current crawler, not their children.
 *
 * @param string $xpath An XPath expression
 *
 * @return self
 */
public function filterXPath($xpath)
{
    $relativeXPath = $this->relativize($xpath);

    if ('' !== $relativeXPath) {
        return $this->filterRelativeXPath($relativeXPath);
    }

    // Nothing is left of the expression once it has been prepared, so nothing can match.
    return $this->createSubCrawler(null);
}
680 | |
/**
 * Filters the list of nodes with a CSS selector.
 *
 * This method only works if you have installed the CssSelector Symfony Component.
 * The selector is converted to XPath, in HTML or XML mode depending on the
 * kind of content held by the crawler.
 *
 * @param string $selector A CSS selector
 *
 * @return self
 *
 * @throws \RuntimeException if the CssSelector Component is not available
 */
public function filter($selector)
{
    $converterAvailable = class_exists(CssSelectorConverter::class);
    if (!$converterAvailable) {
        throw new \RuntimeException('To filter with a CSS selector, install the CssSelector component ("composer require symfony/css-selector"). Or use filterXpath instead.');
    }

    $converter = new CssSelectorConverter($this->isHtml);

    // The CssSelector already prefixes the selector with descendant-or-self::
    $xpath = $converter->toXPath($selector);

    return $this->filterRelativeXPath($xpath);
}
703 | |
/**
 * Selects links by name or alt value for clickable images.
 *
 * Matches <a> elements whose whitespace-normalized text contains the value
 * as a whole space-delimited chunk, or which have an <img> child whose alt
 * does.
 *
 * @param string $value The link text
 *
 * @return self
 */
public function selectLink($value)
{
    // The value is padded with spaces so that it is matched between space boundaries.
    $needle = static::xpathLiteral(' '.$value.' ');

    $xpath = 'descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), '.$needle.') '
        .'or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), '.$needle.')]]';

    return $this->filterRelativeXPath($xpath);
}
718 | |
/**
 * Selects images whose whitespace-normalized alt attribute contains the value.
 *
 * @param string $value The image alt
 *
 * @return self A new instance of Crawler with the filtered list of nodes
 */
public function selectImage($value)
{
    $needle = static::xpathLiteral($value);

    return $this->filterRelativeXPath('descendant-or-self::img[contains(normalize-space(string(@alt)), '.$needle.')]');
}
732 | |
/**
 * Selects a button by name or alt value for images.
 *
 * Matches <input> elements of type submit/button (by value) or image (by
 * alt), and <button> elements (by text). In every case an element whose id
 * or name equals the value matches as well. The type comparison is
 * case-insensitive.
 *
 * @param string $value The button text
 *
 * @return self
 */
public function selectButton($value)
{
    // Lower-cased @type, so that the type tests below ignore case.
    $type = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';
    // Space-padded literal for the "contains" tests, exact literal for the id/name tests.
    $padded = static::xpathLiteral(' '.$value.' ');
    $exact = static::xpathLiteral($value);

    $xpath = sprintf(
        'descendant-or-self::input[((contains(%1$s, "submit") or contains(%1$s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %2$s)) '
        .'or (contains(%1$s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %2$s)) or @id=%3$s or @name=%3$s] '
        .'| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %2$s) or @id=%3$s or @name=%3$s]',
        $type,
        $padded,
        $exact
    );

    return $this->filterRelativeXPath($xpath);
}
749 | |
/**
 * Returns a Link object for the first node in the list.
 *
 * The link is created with the crawler's base href as its current URI.
 *
 * @param string $method The method for the link (get by default)
 *
 * @return Link A Link instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function link($method = 'get')
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $node = $this->getNode(0);

    if ($node instanceof \DOMElement) {
        return new Link($node, $this->baseHref, $method);
    }

    throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($node)));
}
773 | |
/**
 * Returns an array of Link objects, one per node in the list.
 *
 * Every link is created with the "get" method and the crawler's base href.
 *
 * @return Link[] An array of Link instances
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function links()
{
    $links = [];
    foreach ($this->nodes as $node) {
        if ($node instanceof \DOMElement) {
            $links[] = new Link($node, $this->baseHref, 'get');

            continue;
        }

        throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($node)));
    }

    return $links;
}
794 | |
/**
 * Returns an Image object for the first node in the list.
 *
 * The image is created with the crawler's base href as its current URI.
 *
 * @return Image An Image instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function image()
{
    if (!count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $node = $this->getNode(0);

    if ($node instanceof \DOMElement) {
        return new Image($node, $this->baseHref);
    }

    throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($node)));
}
816 | |
/**
 * Returns an array of Image objects, one per node in the list.
 *
 * Every image is created with the crawler's base href as its current URI.
 *
 * @return Image[] An array of Image instances
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function images()
{
    $images = [];
    foreach ($this as $node) {
        if ($node instanceof \DOMElement) {
            $images[] = new Image($node, $this->baseHref);

            continue;
        }

        throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($node)));
    }

    return $images;
}
835 | |
/**
 * Returns a Form object for the first node in the list.
 *
 * The form is created with the crawler's URI and base href. When $values
 * is given, it is applied to the form through setValues().
 *
 * @param array  $values An array of values for the form fields
 * @param string $method The method for the form
 *
 * @return Form A Form instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function form(array $values = null, $method = null)
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $node = $this->getNode(0);

    if (!$node instanceof \DOMElement) {
        throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($node)));
    }

    $form = new Form($node, $this->uri, $method, $this->baseHref);

    if (null === $values) {
        return $form;
    }

    $form->setValues($values);

    return $form;
}
866 | |
/**
 * Replaces the default namespace prefix used with XPath and CSS expressions.
 *
 * The initial prefix is "default".
 *
 * @param string $prefix
 */
public function setDefaultNamespacePrefix($prefix)
{
    $this->defaultNamespacePrefix = $prefix;
}
876 | |
/**
 * Registers a namespace under a prefix in the crawler's map of manually
 * registered namespaces; an existing entry for the same prefix is overwritten.
 *
 * @param string $prefix
 * @param string $namespace
 */
public function registerNamespace($prefix, $namespace)
{
    $this->namespaces[$prefix] = $namespace;
}
885 | |
/**
 * Converts string for XPath expressions.
 *
 * Escaped characters are: quotes (") and apostrophe (').
 *
 * A string without apostrophe is wrapped in apostrophes; otherwise a string
 * without quote is wrapped in quotes; a string holding both is rebuilt with
 * concat() from apostrophe-delimited chunks.
 *
 *  Examples:
 *  <code>
 *     echo Crawler::xpathLiteral('foo " bar');
 *     //prints 'foo " bar'
 *
 *     echo Crawler::xpathLiteral("foo ' bar");
 *     //prints "foo ' bar"
 *
 *     echo Crawler::xpathLiteral('a\'b"c');
 *     //prints concat('a', "'", 'b"c')
 *  </code>
 *
 * @param string $s String to be escaped
 *
 * @return string Converted string
 */
public static function xpathLiteral($s)
{
    $hasApostrophe = false !== strpos($s, "'");
    if (!$hasApostrophe) {
        return "'".$s."'";
    }

    $hasQuote = false !== strpos($s, '"');
    if (!$hasQuote) {
        return '"'.$s.'"';
    }

    // Both kinds of quotes are present: cut the string at each apostrophe and
    // put a quoted apostrophe ("'") back between the chunks.
    $chunks = explode("'", $s);
    $lastChunk = array_pop($chunks);

    $parts = [];
    foreach ($chunks as $chunk) {
        $parts[] = "'".$chunk."'";
        $parts[] = '"\'"';
    }
    $parts[] = "'".$lastChunk."'";

    return 'concat('.implode(', ', $parts).')';
}
932 | |
/**
 * Filters the list of nodes with an XPath expression.
 *
 * The XPath expression should already be processed to apply it in the context of each node.
 * It is run once per node of the list, and all results are added to one new crawler.
 *
 * @param string $xpath
 *
 * @return self
 */
private function filterRelativeXPath($xpath)
{
    $prefixes = $this->findNamespacePrefixes($xpath);

    $result = $this->createSubCrawler(null);

    foreach ($this->nodes as $contextNode) {
        $matches = $this->createDOMXPath($contextNode->ownerDocument, $prefixes)->query($xpath, $contextNode);
        $result->add($matches);
    }

    return $result;
}
955 | |
/**
 * Make the XPath relative to the current context.
 *
 * The returned XPath will match elements matching the XPath inside the current crawler
 * when running in the context of a node of the crawler.
 *
 * The expression is cut into its top-level union members (separated by "|"
 * outside of quotes and predicates); each member is rewritten on its own and
 * the members are joined again with " | ". An expression which cannot be
 * scanned (unterminated quote, unbalanced brackets) is returned unchanged.
 *
 * @param string $xpath
 *
 * @return string
 */
private function relativize($xpath)
{
    // An expression which will never match, used to replace members which cannot match in the crawler.
    $neverMatching = 'a[name() = "b"]';
    $whitespace = " \t\n\r\0\x0B";

    $rewritten = [];
    $length = strlen($xpath);
    $bracketDepth = 0;
    $start = strspn($xpath, $whitespace);

    for ($cursor = $start; $cursor <= $length; ++$cursor) {
        // Jump to the next quote, bracket or union operator (or to the end of the string).
        $cursor += strcspn($xpath, '"\'[]|', $cursor);

        if ($cursor < $length) {
            $char = $xpath[$cursor];

            if ('"' === $char || "'" === $char) {
                // Move onto the closing quote; the loop increment then steps past it.
                $cursor = strpos($xpath, $char, $cursor + 1);
                if (false === $cursor) {
                    return $xpath; // The XPath expression is invalid
                }

                continue;
            }

            if ('[' === $char) {
                ++$bracketDepth;

                continue;
            }

            if (']' === $char) {
                --$bracketDepth;

                continue;
            }
        }

        // Still inside a predicate: this position does not end a top-level member.
        if ($bracketDepth) {
            continue;
        }

        $parenthesis = '';
        if ($start < $length && '(' === $xpath[$start]) {
            // If the union is inside some braces, we need to preserve the opening braces and apply
            // the change only inside it.
            $consumed = 1 + strspn($xpath, '('.$whitespace, $start + 1);
            $parenthesis = substr($xpath, $start, $consumed);
            $start += $consumed;
        }

        $member = rtrim(substr($xpath, $start, $cursor - $start));
        $rewritten[] = $parenthesis.$this->relativizeUnionMember($member, $neverMatching);

        if ($cursor === $length) {
            return implode(' | ', $rewritten);
        }

        // Skip the whitespace following the "|" and start the next member right after it.
        $cursor += strspn($xpath, $whitespace, $cursor + 1);
        $start = $cursor + 1;
    }

    return $xpath; // The XPath expression is invalid
}

/**
 * Rewrites a single union member so that it applies to the nodes of the crawler.
 *
 * @param string $expression    The trimmed union member
 * @param string $neverMatching Expression to return for members which cannot match
 *
 * @return string
 */
private function relativizeUnionMember($expression, $neverMatching)
{
    if (0 === strpos($expression, 'self::*/')) {
        $expression = './'.substr($expression, 8);
    }

    if ('' === $expression) {
        return $neverMatching;
    }

    // Leading step => replacement; checked in this order (".//" has to come before "./").
    $prefixRewrites = [
        '//' => 'descendant-or-self::',
        './/' => 'descendant-or-self::',
        './' => 'self::',
        'child::' => 'self::',
    ];
    foreach ($prefixRewrites as $prefix => $replacement) {
        if (0 === strpos($expression, $prefix)) {
            return $replacement.substr($expression, strlen($prefix));
        }
    }

    if ('/' === $expression[0] || '.' === $expression[0] || 0 === strpos($expression, 'self::')) {
        return $neverMatching;
    }

    if (0 === strpos($expression, 'descendant::')) {
        return 'descendant-or-self::'.substr($expression, 12);
    }

    if (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $expression)) {
        // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
        return $neverMatching;
    }

    if (0 !== strpos($expression, 'descendant-or-self::')) {
        return 'self::'.$expression;
    }

    return $expression;
}
1049 | |
/**
 * Returns the node stored at the given position in this crawler's node list.
 *
 * @param int $position
 *
 * @return \DOMElement|null The node, or null when nothing is stored at that position
 */
public function getNode($position)
{
    return isset($this->nodes[$position]) ? $this->nodes[$position] : null;
}
1061 | |
/**
 * Returns the number of nodes held by this crawler.
 *
 * @return int
 */
public function count()
{
    // Fully qualified so the global function is clearly the one being called.
    return \count($this->nodes);
}
1069 | |
/**
 * Returns an iterator over the nodes held by this crawler.
 *
 * @return \ArrayIterator|\DOMElement[]
 */
public function getIterator()
{
    $iterator = new \ArrayIterator($this->nodes);

    return $iterator;
}
1077 | |
/**
 * Collects element nodes by following a sibling property starting from $node.
 *
 * $node itself is examined first, then every node reached by repeatedly reading
 * the $siblingDir property, until that property yields a falsy value. A node is
 * kept when it is an element and is not the first node of this crawler.
 *
 * @param \DOMElement $node       The node the walk starts from
 * @param string      $siblingDir Name of the property to follow (e.g. 'nextSibling')
 *
 * @return array The matching nodes, in the order they were visited
 */
protected function sibling($node, $siblingDir = 'nextSibling')
{
    $nodes = array();

    // The first node of the crawler does not change while walking: look it up once
    // instead of once per visited sibling.
    $currentNode = $this->getNode(0);

    do {
        if ($node !== $currentNode && \XML_ELEMENT_NODE === $node->nodeType) {
            $nodes[] = $node;
        }
    } while ($node = $node->$siblingDir);

    return $nodes;
}
1096 | |
/**
 * Builds a \DOMXPath for the document and registers a namespace for each given prefix.
 *
 * Prefixes for which no namespace can be discovered are left unregistered.
 *
 * @param \DOMDocument $document
 * @param array        $prefixes Namespace prefixes to register
 *
 * @return \DOMXPath
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
{
    $xpathEngine = new \DOMXPath($document);

    foreach ($prefixes as $prefix) {
        $namespaceUri = $this->discoverNamespace($xpathEngine, $prefix);
        if (null === $namespaceUri) {
            continue;
        }

        $xpathEngine->registerNamespace($prefix, $namespaceUri);
    }

    return $xpathEngine;
}
1118 | |
/**
 * Resolves the namespace bound to a prefix.
 *
 * Manually registered namespaces take precedence; otherwise the document behind
 * $domxpath is searched for a namespace node carrying that name. The crawler's
 * default namespace prefix is looked up as the unnamed (default) namespace.
 *
 * @param \DOMXPath $domxpath
 * @param string    $prefix
 *
 * @return string|null The namespace, or null when none is found
 */
private function discoverNamespace(\DOMXPath $domxpath, $prefix)
{
    if (isset($this->namespaces[$prefix])) {
        return $this->namespaces[$prefix];
    }

    $lookupName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;

    // "[last()]" narrows the result to a single namespace node; without it the
    // query would return one item per node of the document.
    $query = sprintf('(//namespace::*[name()="%s"])[last()]', $lookupName);
    $match = $domxpath->query($query)->item(0);

    return $match ? $match->nodeValue : null;
}
1140 | |
/**
 * Extracts the namespace prefixes used in an XPath expression.
 *
 * A prefix is a name directly followed by a single colon; a name followed by
 * "::" (an axis) is not reported.
 *
 * @param string $xpath
 *
 * @return array The distinct prefixes, or an empty array when there are none
 */
private function findNamespacePrefixes($xpath)
{
    $found = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $captures);

    if (!$found) {
        return array();
    }

    return array_unique($captures['prefix']);
}
1154 | |
/**
 * Creates a new crawler of the same class for the given nodes.
 *
 * The new crawler receives this crawler's URI and base href through its
 * constructor, then its HTML flag, document and registered namespaces are
 * copied over from this crawler.
 *
 * @param \DOMElement|\DOMElement[]|\DOMNodeList|null $nodes
 *
 * @return static
 */
private function createSubCrawler($nodes)
{
    $subCrawler = new static($nodes, $this->uri, $this->baseHref);

    // State that is not passed through the constructor.
    $subCrawler->namespaces = $this->namespaces;
    $subCrawler->document = $this->document;
    $subCrawler->isHtml = $this->isHtml;

    return $subCrawler;
}
1171 } |