Mercurial > hg > isophonics-drupal-site
comparison vendor/symfony/dom-crawler/Crawler.php @ 0:4c8ae668cc8c
Initial import (non-working)
author | Chris Cannam |
---|---|
date | Wed, 29 Nov 2017 16:09:58 +0000 |
parents | |
children | 7a779792577d |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4c8ae668cc8c |
---|---|
1 <?php | |
2 | |
3 /* | |
4 * This file is part of the Symfony package. | |
5 * | |
6 * (c) Fabien Potencier <fabien@symfony.com> | |
7 * | |
8 * For the full copyright and license information, please view the LICENSE | |
9 * file that was distributed with this source code. | |
10 */ | |
11 | |
12 namespace Symfony\Component\DomCrawler; | |
13 | |
14 use Symfony\Component\CssSelector\CssSelectorConverter; | |
15 | |
16 /** | |
17 * Crawler eases navigation of a list of \DOMNode objects. | |
18 * | |
19 * @author Fabien Potencier <fabien@symfony.com> | |
20 */ | |
21 class Crawler implements \Countable, \IteratorAggregate | |
22 { | |
23 /** | |
24 * @var string The current URI | |
25 */ | |
26 protected $uri; | |
27 | |
28 /** | |
29 * @var string The default namespace prefix to be used with XPath and CSS expressions | |
30 */ | |
31 private $defaultNamespacePrefix = 'default'; | |
32 | |
33 /** | |
34 * @var array A map of manually registered namespaces | |
35 */ | |
36 private $namespaces = array(); | |
37 | |
38 /** | |
39 * @var string The base href value | |
40 */ | |
41 private $baseHref; | |
42 | |
43 /** | |
44 * @var \DOMDocument|null | |
45 */ | |
46 private $document; | |
47 | |
48 /** | |
49 * @var \DOMElement[] | |
50 */ | |
51 private $nodes = array(); | |
52 | |
53 /** | |
54 * Whether the Crawler contains HTML or XML content (used when converting CSS to XPath). | |
55 * | |
56 * @var bool | |
57 */ | |
58 private $isHtml = true; | |
59 | |
/**
 * Sets up the crawler and optionally seeds it with initial content.
 *
 * @param mixed  $node       Initial content, handed unchanged to add()
 * @param string $currentUri The current URI
 * @param string $baseHref   The base href value; when empty, the current URI is used instead
 */
public function __construct($node = null, $currentUri = null, $baseHref = null)
{
    $this->uri = $currentUri;

    if ($baseHref) {
        $this->baseHref = $baseHref;
    } else {
        $this->baseHref = $currentUri;
    }

    $this->add($node);
}
72 | |
/**
 * Gives back the URI this crawler was created with.
 *
 * @return string The current URI
 */
public function getUri()
{
    return $this->uri;
}
82 | |
/**
 * Gives back the base href currently stored on the crawler.
 *
 * @return string The base href value
 */
public function getBaseHref()
{
    return $this->baseHref;
}
92 | |
/**
 * Empties the crawler: forgets the owning document and drops every node.
 */
public function clear()
{
    $this->document = null;
    $this->nodes = array();
}
101 | |
/**
 * Adds content to the current list of nodes.
 *
 * Dispatches to the specialized add*() method matching the argument type;
 * null is accepted and adds nothing.
 *
 * @param \DOMNodeList|\DOMNode|array|string|null $node A node
 *
 * @throws \InvalidArgumentException When node is not the expected type.
 */
public function add($node)
{
    if (null === $node) {
        return;
    }

    if ($node instanceof \DOMNodeList) {
        $this->addNodeList($node);

        return;
    }

    if ($node instanceof \DOMNode) {
        $this->addNode($node);

        return;
    }

    if (is_array($node)) {
        $this->addNodes($node);

        return;
    }

    if (is_string($node)) {
        $this->addContent($node);

        return;
    }

    $actualType = is_object($node) ? get_class($node) : gettype($node);

    throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', $actualType));
}
126 | |
/**
 * Adds HTML/XML content, choosing the parser from the content type.
 *
 * When no type is given it is guessed: content starting with "<?xml" is
 * treated as "application/xml", anything else as "text/html". Types that
 * mention neither "xml" nor "html" are silently ignored.
 *
 * The charset is read from the "charset=" parameter of the type, else from
 * a <meta ... charset=...> tag in the content, else it is assumed to be
 * ISO-8859-1, which is the default charset defined by the HTTP 1.1
 * specification.
 *
 * @param string      $content A string to parse as HTML/XML
 * @param null|string $type    The content type of the string
 */
public function addContent($content, $type = null)
{
    if (empty($type)) {
        $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
    }

    // DOM only for HTML/XML content
    if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
        return;
    }

    $charset = null;
    if (false !== $pos = stripos($type, 'charset=')) {
        $charset = substr($type, $pos + 8);
        // Drop any further parameters following the charset
        if (false !== $pos = strpos($charset, ';')) {
            $charset = substr($charset, 0, $pos);
        }
    }

    // http://www.w3.org/TR/encoding/#encodings
    // http://www.w3.org/TR/REC-xml/#NT-EncName
    if (null === $charset &&
        preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
        $charset = $matches[1];
    }

    if (null === $charset) {
        $charset = 'ISO-8859-1';
    }

    // The type was matched case-insensitively, so the captured prefix may be
    // upper case (e.g. "TEXT/XML"); normalise it before comparing.
    if ('x' === strtolower($xmlMatches[1])) {
        $this->addXmlContent($content, $charset);
    } else {
        $this->addHtmlContent($content, $charset);
    }
}
173 | |
/**
 * Adds an HTML content to the list of nodes.
 *
 * The libxml errors are disabled when the content is parsed.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * If the parsed document contains a <base> element with a non-empty href,
 * the crawler's base href is updated from it (resolved against the current
 * base href when one is already set).
 *
 * @param string $content The HTML content
 * @param string $charset The charset
 */
public function addHtmlContent($content, $charset = 'UTF-8')
{
    $internalErrors = libxml_use_internal_errors(true);
    $disableEntities = libxml_disable_entity_loader(true);

    $dom = new \DOMDocument('1.0', $charset);
    $dom->validateOnParse = true;

    // Promote conversion warnings to exceptions so a failure can be caught below
    set_error_handler(function () { throw new \Exception(); });

    try {
        // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
        $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
    } catch (\Exception $e) {
        // Best effort: on failure the content is parsed unconverted
    } catch (\ValueError $e) {
        // PHP 8+ reports an unknown charset with a ValueError instead of a warning;
        // treat it exactly like the warning case above
    } finally {
        // Always put the previous handler back, whatever was thrown
        restore_error_handler();
    }

    if ('' !== trim($content)) {
        @$dom->loadHTML($content);
    }

    libxml_use_internal_errors($internalErrors);
    libxml_disable_entity_loader($disableEntities);

    $this->addDocument($dom);

    $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));

    $baseHref = current($base);
    if (count($base) && !empty($baseHref)) {
        if ($this->baseHref) {
            // Resolve the document's <base href> against the base href already known
            $linkNode = $dom->createElement('a');
            $linkNode->setAttribute('href', $baseHref);
            $link = new Link($linkNode, $this->baseHref);
            $this->baseHref = $link->getUri();
        } else {
            $this->baseHref = $baseHref;
        }
    }
}
228 | |
/**
 * Parses XML content and adds the resulting document to the list of nodes.
 *
 * libxml errors are collected internally while parsing rather than emitted.
 *
 * If you want to get parsing errors, be sure to enable
 * internal errors via libxml_use_internal_errors(true)
 * and then, get the errors via libxml_get_errors(). Be
 * sure to clear errors with libxml_clear_errors() afterward.
 *
 * After this call the crawler is flagged as holding XML rather than HTML.
 *
 * @param string $content The XML content
 * @param string $charset The charset
 * @param int    $options Bitwise OR of the libxml option constants
 *                        LIBXML_PARSEHUGE is dangerous, see
 *                        http://symfony.com/blog/security-release-symfony-2-0-17-released
 */
public function addXmlContent($content, $charset = 'UTF-8', $options = LIBXML_NONET)
{
    // remove the default namespace if it's the only namespace to make XPath expressions simpler
    $hasPrefixedNamespace = false !== strpos($content, 'xmlns:');
    if (!$hasPrefixedNamespace) {
        $content = str_replace('xmlns', 'ns', $content);
    }

    $previousInternalErrors = libxml_use_internal_errors(true);
    $previousEntityLoader = libxml_disable_entity_loader(true);

    $document = new \DOMDocument('1.0', $charset);
    $document->validateOnParse = true;

    $isBlank = '' === trim($content);
    if (!$isBlank) {
        @$document->loadXML($content, $options);
    }

    // Put the libxml settings back the way they were found
    libxml_use_internal_errors($previousInternalErrors);
    libxml_disable_entity_loader($previousEntityLoader);

    $this->addDocument($document);

    $this->isHtml = false;
}
269 | |
/**
 * Adds the root element of a \DOMDocument to the list of nodes.
 *
 * A document without a root element adds nothing.
 *
 * @param \DOMDocument $dom A \DOMDocument instance
 */
public function addDocument(\DOMDocument $dom)
{
    if (!$dom->documentElement) {
        return;
    }

    $this->addNode($dom->documentElement);
}
281 | |
/**
 * Adds every \DOMNode contained in a \DOMNodeList to the list of nodes.
 *
 * Entries that are not \DOMNode instances are skipped.
 *
 * @param \DOMNodeList $nodes A \DOMNodeList instance
 */
public function addNodeList(\DOMNodeList $nodes)
{
    foreach ($nodes as $candidate) {
        if (!$candidate instanceof \DOMNode) {
            continue;
        }

        $this->addNode($candidate);
    }
}
295 | |
/**
 * Adds each entry of an array to the list of nodes.
 *
 * Every entry goes through add(), so the same type dispatch applies to it.
 *
 * @param \DOMNode[] $nodes An array of \DOMNode instances
 */
public function addNodes(array $nodes)
{
    foreach ($nodes as $entry) {
        $this->add($entry);
    }
}
307 | |
/**
 * Adds a \DOMNode instance to the list of nodes.
 *
 * A \DOMDocument is replaced by its root element; an empty document (no
 * root element) adds nothing. The first node added binds the crawler to
 * that node's owner document, and a node that is already present is not
 * added twice.
 *
 * @param \DOMNode $node A \DOMNode instance
 *
 * @throws \InvalidArgumentException When the node belongs to a different document than the nodes already attached
 */
public function addNode(\DOMNode $node)
{
    if ($node instanceof \DOMDocument) {
        $node = $node->documentElement;

        // An empty document has no root element: there is nothing to add,
        // and dereferencing null below would store a null "node".
        if (null === $node) {
            return;
        }
    }

    if (null !== $this->document && $this->document !== $node->ownerDocument) {
        throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
    }

    if (null === $this->document) {
        $this->document = $node->ownerDocument;
    }

    // Don't add duplicate nodes in the Crawler
    if (in_array($node, $this->nodes, true)) {
        return;
    }

    $this->nodes[] = $node;
}
334 | |
/**
 * Returns a crawler for the node at the given position in the node list.
 *
 * When no node is set at that position, the sub-crawler is created from null.
 *
 * @param int $position The position
 *
 * @return self
 */
public function eq($position)
{
    $selected = isset($this->nodes[$position]) ? $this->nodes[$position] : null;

    return $this->createSubCrawler($selected);
}
350 | |
/**
 * Runs an anonymous function against every node of the list.
 *
 * The function is called with the node wrapped in a sub-crawler as first
 * argument and the node's position as second argument.
 *
 * Example:
 *
 *     $crawler->filter('h1')->each(function ($node, $i) {
 *         return $node->text();
 *     });
 *
 * @param \Closure $closure An anonymous function
 *
 * @return array The values returned by the anonymous function, in node order
 */
public function each(\Closure $closure)
{
    $results = array();

    foreach ($this->nodes as $index => $element) {
        $wrapped = $this->createSubCrawler($element);
        $results[] = $closure($wrapped, $index);
    }

    return $results;
}
376 | |
/**
 * Returns a crawler over a slice of the node list.
 *
 * $offset and $length are passed straight to array_slice().
 *
 * @param int $offset
 * @param int $length
 *
 * @return self
 */
public function slice($offset = 0, $length = null)
{
    $portion = array_slice($this->nodes, $offset, $length);

    return $this->createSubCrawler($portion);
}
389 | |
/**
 * Filters the list of nodes through an anonymous function.
 *
 * The function receives each node wrapped in a sub-crawler plus its
 * position. A node is dropped only when the function returns exactly false.
 *
 * @param \Closure $closure An anonymous function
 *
 * @return self
 */
public function reduce(\Closure $closure)
{
    $kept = array();

    foreach ($this->nodes as $index => $element) {
        if (false === $closure($this->createSubCrawler($element), $index)) {
            continue;
        }

        $kept[] = $element;
    }

    return $this->createSubCrawler($kept);
}
410 | |
/**
 * Returns a crawler for the node at position 0 of the current selection.
 *
 * @return self
 */
public function first()
{
    $firstIndex = 0;

    return $this->eq($firstIndex);
}
420 | |
/**
 * Returns a crawler for the node at the final position of the current selection.
 *
 * @return self
 */
public function last()
{
    $lastIndex = count($this->nodes) - 1;

    return $this->eq($lastIndex);
}
430 | |
/**
 * Returns the sibling nodes of the first node of the current selection.
 *
 * The walk performed by sibling() starts at the first child of that node's parent.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function siblings()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstOfParent = $this->getNode(0)->parentNode->firstChild;
    $found = $this->sibling($firstOfParent);

    return $this->createSubCrawler($found);
}
446 | |
/**
 * Returns the siblings following the first node of the current selection.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nextAll()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $start = $this->getNode(0);
    $following = $this->sibling($start);

    return $this->createSubCrawler($following);
}
462 | |
/**
 * Returns the siblings preceding the first node of the current selection.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function previousAll()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $start = $this->getNode(0);
    $preceding = $this->sibling($start, 'previousSibling');

    return $this->createSubCrawler($preceding);
}
478 | |
/**
 * Returns the ancestor elements of the first node of the current selection.
 *
 * The parent chain is walked upwards, nearest ancestor first; only element
 * nodes are kept.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function parents()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $ancestors = array();

    for ($current = $this->getNode(0)->parentNode; $current; $current = $current->parentNode) {
        if (XML_ELEMENT_NODE === $current->nodeType) {
            $ancestors[] = $current;
        }
    }

    return $this->createSubCrawler($ancestors);
}
503 | |
/**
 * Returns the children of the first node of the current selection.
 *
 * A node without a first child yields an empty crawler.
 *
 * @return self
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function children()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstChild = $this->getNode(0)->firstChild;

    if ($firstChild) {
        $found = $this->sibling($firstChild);
    } else {
        $found = array();
    }

    return $this->createSubCrawler($found);
}
521 | |
/**
 * Reads an attribute from the first node of the list.
 *
 * @param string $attribute The attribute name
 *
 * @return string|null The attribute value or null if the attribute does not exist
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function attr($attribute)
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if (!$element->hasAttribute($attribute)) {
        return null;
    }

    return $element->getAttribute($attribute);
}
541 | |
/**
 * Reads the node name of the first node of the list.
 *
 * @return string The node name
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function nodeName()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    return $element->nodeName;
}
557 | |
/**
 * Reads the node value of the first node of the list.
 *
 * @return string The node value
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function text()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    return $element->nodeValue;
}
573 | |
/**
 * Serializes the content of the first node of the list as HTML.
 *
 * Each child node is rendered with its owner document's saveHTML() and the
 * pieces are concatenated in document order; the node itself is not included.
 *
 * @return string The node html
 *
 * @throws \InvalidArgumentException When current node is empty
 */
public function html()
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $fragments = array();

    foreach ($this->getNode(0)->childNodes as $childNode) {
        $fragments[] = $childNode->ownerDocument->saveHTML($childNode);
    }

    return implode('', $fragments);
}
594 | |
/**
 * Evaluates an XPath expression against every node of the list.
 *
 * An XPath expression may evaluate to a simple type or to a \DOMNodeList.
 * When the result for the first node is a \DOMNodeList, all results are
 * wrapped in a new Crawler; otherwise the raw results are returned as an array.
 *
 * @param string $xpath An XPath expression
 *
 * @return array|Crawler An array of evaluation results or a new Crawler instance
 *
 * @throws \LogicException When the crawler has no document attached yet
 */
public function evaluate($xpath)
{
    if (null === $this->document) {
        throw new \LogicException('Cannot evaluate the expression on an uninitialized crawler.');
    }

    $prefixes = $this->findNamespacePrefixes($xpath);
    $domxpath = $this->createDOMXPath($this->document, $prefixes);

    $results = array();
    foreach ($this->nodes as $contextNode) {
        $results[] = $domxpath->evaluate($xpath, $contextNode);
    }

    $yieldsNodeLists = isset($results[0]) && $results[0] instanceof \DOMNodeList;

    return $yieldsNodeLists ? $this->createSubCrawler($results) : $results;
}
624 | |
/**
 * Extracts information from the list of nodes.
 *
 * You can extract attributes or/and the node value (_text).
 *
 * Example:
 *
 *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
 *
 * When exactly one attribute is requested, each entry of the result is the
 * bare value; otherwise each entry is an array of values in the order the
 * attributes were given (an empty array when no attribute is requested).
 *
 * @param array $attributes An array of attributes
 *
 * @return array An array of extracted values, one entry per node
 */
public function extract($attributes)
{
    $attributes = (array) $attributes;
    $count = count($attributes);

    $data = array();
    foreach ($this->nodes as $node) {
        $elements = array();
        foreach ($attributes as $attribute) {
            if ('_text' === $attribute) {
                $elements[] = $node->nodeValue;
            } else {
                $elements[] = $node->getAttribute($attribute);
            }
        }

        // Unwrap only for a single attribute: with an empty attribute list
        // there is no $elements[0] to read.
        $data[] = 1 === $count ? $elements[0] : $elements;
    }

    return $data;
}
659 | |
/**
 * Filters the list of nodes with an XPath expression.
 *
 * The XPath expression is evaluated in the context of the crawler, which
 * is considered as a fake parent of the elements inside it.
 * This means that a child selector "div" or "./div" will match only
 * the div elements of the current crawler, not their children.
 *
 * @param string $xpath An XPath expression
 *
 * @return self
 */
public function filterXPath($xpath)
{
    $relativeXPath = $this->relativize($xpath);

    // Nothing left of the expression after preparing it: there can be no match
    if ('' !== $relativeXPath) {
        return $this->filterRelativeXPath($relativeXPath);
    }

    return $this->createSubCrawler(null);
}
683 | |
/**
 * Filters the list of nodes with a CSS selector.
 *
 * The selector is converted to XPath by the Symfony CssSelector component,
 * which therefore has to be installed.
 *
 * @param string $selector A CSS selector
 *
 * @return self
 *
 * @throws \RuntimeException if the CssSelector Component is not available
 */
public function filter($selector)
{
    $converterAvailable = class_exists('Symfony\\Component\\CssSelector\\CssSelectorConverter');

    if (!$converterAvailable) {
        throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector 2.8+ is not installed (you can use filterXPath instead).');
    }

    $converter = new CssSelectorConverter($this->isHtml);

    // The CssSelector already prefixes the selector with descendant-or-self::
    $xpath = $converter->toXPath($selector);

    return $this->filterRelativeXPath($xpath);
}
706 | |
/**
 * Selects links whose text, or the alt value of a contained image, contains the given value.
 *
 * The value is matched as a whole, space-delimited phrase against the
 * whitespace-normalized text.
 *
 * @param string $value The link text
 *
 * @return self
 */
public function selectLink($value)
{
    // Surround with spaces so only whole words/phrases match
    $paddedLiteral = static::xpathLiteral(' '.$value.' ');

    $byText = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', $paddedLiteral);
    $byImageAlt = sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', $paddedLiteral);

    return $this->filterRelativeXPath($byText.$byImageAlt);
}
721 | |
/**
 * Selects images whose whitespace-normalized alt value contains the given value.
 *
 * @param string $value The image alt
 *
 * @return self A new instance of Crawler with the filtered list of nodes
 */
public function selectImage($value)
{
    $altLiteral = static::xpathLiteral($value);
    $xpath = sprintf('descendant-or-self::img[contains(normalize-space(string(@alt)), %s)]', $altLiteral);

    return $this->filterRelativeXPath($xpath);
}
735 | |
/**
 * Selects buttons by label, alt value (image inputs), id or name.
 *
 * Matches <input> elements of type submit/button by value, of type image
 * by alt, and <button> elements by text; any of them also matches when its
 * id or name equals the given value. The type comparison is lower-cased.
 *
 * @param string $value The button text
 *
 * @return self
 */
public function selectButton($value)
{
    $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';

    // Padded literal for whole-phrase matching, exact literal for id/name
    $paddedLiteral = static::xpathLiteral(' '.$value.' ');
    $exactLiteral = static::xpathLiteral($value);

    $submitInputs = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, $paddedLiteral);
    $imageInputs = sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $translate, $paddedLiteral, $exactLiteral, $exactLiteral);
    $buttons = sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $paddedLiteral, $exactLiteral, $exactLiteral);

    return $this->filterRelativeXPath($submitInputs.$imageInputs.$buttons);
}
752 | |
/**
 * Builds a Link object from the first node in the list.
 *
 * The link is created with the crawler's base href.
 *
 * @param string $method The method for the link (get by default)
 *
 * @return Link A Link instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function link($method = 'get')
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if ($element instanceof \DOMElement) {
        return new Link($element, $this->baseHref, $method);
    }

    throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($element)));
}
776 | |
/**
 * Builds a Link object for every node in the list.
 *
 * Each link is created with the crawler's base href and the 'get' method.
 *
 * @return Link[] An array of Link instances
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function links()
{
    $collected = array();

    foreach ($this->nodes as $element) {
        if (!$element instanceof \DOMElement) {
            throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($element)));
        }

        $link = new Link($element, $this->baseHref, 'get');
        $collected[] = $link;
    }

    return $collected;
}
797 | |
/**
 * Builds an Image object from the first node in the list.
 *
 * The image is created with the crawler's base href.
 *
 * @return Image An Image instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function image()
{
    if (!count($this)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if ($element instanceof \DOMElement) {
        return new Image($element, $this->baseHref);
    }

    throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($element)));
}
819 | |
/**
 * Builds an Image object for every node in the list.
 *
 * Each image is created with the crawler's base href.
 *
 * @return Image[] An array of Image instances
 *
 * @throws \InvalidArgumentException If the current node list contains non-DOMElement instances
 */
public function images()
{
    $collected = array();

    foreach ($this as $element) {
        if (!$element instanceof \DOMElement) {
            throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($element)));
        }

        $image = new Image($element, $this->baseHref);
        $collected[] = $image;
    }

    return $collected;
}
838 | |
/**
 * Builds a Form object from the first node in the list.
 *
 * The form is created with the crawler's current URI and base href. When
 * values are given, they are applied to the form through setValues().
 *
 * @param array  $values An array of values for the form fields
 * @param string $method The method for the form
 *
 * @return Form A Form instance
 *
 * @throws \InvalidArgumentException If the current node list is empty or the selected node is not instance of DOMElement
 */
public function form(array $values = null, $method = null)
{
    if (!$this->nodes) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if (!$element instanceof \DOMElement) {
        throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($element)));
    }

    $form = new Form($element, $this->uri, $method, $this->baseHref);

    if (null === $values) {
        return $form;
    }

    $form->setValues($values);

    return $form;
}
869 | |
/**
 * Replaces the default namespace prefix to be used with XPath and CSS expressions.
 *
 * @param string $prefix The new default prefix
 */
public function setDefaultNamespacePrefix($prefix)
{
    $this->defaultNamespacePrefix = $prefix;
}
879 | |
/**
 * Records a namespace under the given prefix in the map of manually registered namespaces.
 *
 * Registering the same prefix again overwrites the earlier entry.
 *
 * @param string $prefix    The prefix to register
 * @param string $namespace The namespace stored for that prefix
 */
public function registerNamespace($prefix, $namespace)
{
    $this->namespaces[$prefix] = $namespace;
}
888 | |
/**
 * Turns a string into an XPath string literal.
 *
 * A string without apostrophes is wrapped in apostrophes; a string with
 * apostrophes but no double quotes is wrapped in double quotes; a string
 * containing both is split on its apostrophes and rebuilt with concat().
 *
 * Examples:
 * <code>
 *     echo Crawler::xpathLiteral('foo " bar');
 *     //prints 'foo " bar'
 *
 *     echo Crawler::xpathLiteral("foo ' bar");
 *     //prints "foo ' bar"
 *
 *     echo Crawler::xpathLiteral('a\'b"c');
 *     //prints concat('a', "'", 'b"c')
 * </code>
 *
 * @param string $s String to be escaped
 *
 * @return string Converted string
 */
public static function xpathLiteral($s)
{
    if (false === strpos($s, "'")) {
        return "'".$s."'";
    }

    if (false === strpos($s, '"')) {
        return '"'.$s.'"';
    }

    // Both quote kinds occur: quote each apostrophe-free segment with
    // apostrophes and join the segments with a double-quoted apostrophe.
    $segments = explode("'", $s);
    $lastIndex = count($segments) - 1;

    $pieces = array();
    foreach ($segments as $index => $segment) {
        $pieces[] = "'".$segment."'";
        if ($index < $lastIndex) {
            $pieces[] = "\"'\"";
        }
    }

    return 'concat('.implode(', ', $pieces).')';
}
935 | |
/**
 * Filters the list of nodes with an already-relative XPath expression.
 *
 * The expression is queried once per node, with that node as context, and
 * all results are gathered into a single new crawler. The expression must
 * already have been prepared for evaluation in the context of each node.
 *
 * @param string $xpath
 *
 * @return self
 */
private function filterRelativeXPath($xpath)
{
    $prefixes = $this->findNamespacePrefixes($xpath);

    $matches = $this->createSubCrawler(null);

    foreach ($this->nodes as $contextNode) {
        $domxpath = $this->createDOMXPath($contextNode->ownerDocument, $prefixes);
        $found = $domxpath->query($xpath, $contextNode);

        $matches->add($found);
    }

    return $matches;
}
958 | |
/**
 * Rewrites an XPath expression so that it can be evaluated with a node of the crawler as context.
 *
 * The expression is split on the "|" union operators found outside quoted literals and
 * outside "[...]" predicates, and the leading axis or path prefix of every operand is
 * rewritten. Operands that cannot match in this context are replaced by an expression
 * that never matches. When the expression cannot be scanned (unterminated quote,
 * unbalanced brackets) it is returned unchanged.
 *
 * @param string $xpath
 *
 * @return string
 */
private function relativize($xpath)
{
    // An expression which never matches. Operands which cannot match in the crawler are
    // replaced by it instead of being removed from the union.
    $neverMatches = 'a[name() = "b"]';
    $whitespace = " \t\n\r\0\x0B";

    // Leading prefixes that are swapped for another axis, tested in this order
    // ('.//' has to be tested before './').
    $axisRewrites = array(
        '//' => 'descendant-or-self::',
        './/' => 'descendant-or-self::',
        './' => 'self::',
        'child::' => 'self::',
        'descendant::' => 'descendant-or-self::',
    );
    // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
    $unreachableAxes = '/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/';

    $length = strlen($xpath);
    $depth = 0;
    $operands = array();
    $operandStart = strspn($xpath, $whitespace);

    for ($pos = $operandStart; $pos <= $length; ++$pos) {
        // Jump to the next quote, bracket or union operator, or to the end of the string.
        $pos += strcspn($xpath, '"\'[]|', $pos);

        if ($pos < $length) {
            $char = $xpath[$pos];

            if ('"' === $char || "'" === $char) {
                // Skip the quoted literal by moving onto its closing quote.
                $pos = strpos($xpath, $char, $pos + 1);
                if (false === $pos) {
                    return $xpath; // The XPath expression is invalid
                }

                continue;
            }
            if ('[' === $char) {
                ++$depth;

                continue;
            }
            if (']' === $char) {
                --$depth;

                continue;
            }
        }

        // While a predicate is open, neither a "|" nor the end of the string closes an operand.
        if ($depth) {
            continue;
        }

        // If the union is inside some braces, the opening braces are preserved and the
        // change is applied only to what follows them.
        $leadingParens = '';
        if ($operandStart < $length && '(' === $xpath[$operandStart]) {
            $parenLength = 1 + strspn($xpath, '('.$whitespace, $operandStart + 1);
            $leadingParens = substr($xpath, $operandStart, $parenLength);
            $operandStart += $parenLength;
        }

        $operand = rtrim(substr($xpath, $operandStart, $pos - $operandStart));

        if (0 === strpos($operand, 'self::*/')) {
            $operand = './'.substr($operand, 8);
        }

        // Look for a leading prefix that is rewritten by substituting another axis.
        $rewritten = null;
        if ('' !== $operand) {
            foreach ($axisRewrites as $prefix => $axis) {
                if (0 === strpos($operand, $prefix)) {
                    $rewritten = $axis.substr($operand, strlen($prefix));

                    break;
                }
            }
        }

        if (null !== $rewritten) {
            $operand = $rewritten;
        } elseif ('' === $operand || '/' === $operand[0] || '.' === $operand[0] || 0 === strpos($operand, 'self::') || preg_match($unreachableAxes, $operand)) {
            $operand = $neverMatches;
        } elseif (0 !== strpos($operand, 'descendant-or-self::')) {
            $operand = 'self::'.$operand;
        }

        $operands[] = $leadingParens.$operand;

        if ($pos === $length) {
            return implode(' | ', $operands);
        }

        // Step over the whitespace following the "|"; the next operand starts right after it.
        $pos += strspn($xpath, $whitespace, $pos + 1);
        $operandStart = $pos + 1;
    }

    return $xpath; // The XPath expression is invalid
}
1052 | |
/**
 * Returns the node stored at the given position of the crawler's node list.
 *
 * @param int $position
 *
 * @return \DOMElement|null The node, or null when nothing is set at that position
 */
public function getNode($position)
{
    return isset($this->nodes[$position]) ? $this->nodes[$position] : null;
}
1064 | |
/**
 * Returns how many nodes the crawler currently holds.
 *
 * @return int
 */
public function count()
{
    $total = \count($this->nodes);

    return $total;
}
1072 | |
/**
 * Returns an iterator over the crawler's nodes.
 *
 * @return \ArrayIterator
 */
public function getIterator()
{
    $iterator = new \ArrayIterator($this->nodes);

    return $iterator;
}
1080 | |
/**
 * Collects the element nodes found by walking from a node in one direction.
 *
 * Starting with $node itself, the chain of $siblingDir links is followed until it
 * ends. A visited node is kept when it is an element node and is not the first
 * node of this crawler.
 *
 * @param \DOMElement $node       Node the walk starts from
 * @param string      $siblingDir Name of the node property to follow (defaults to 'nextSibling')
 *
 * @return array
 */
protected function sibling($node, $siblingDir = 'nextSibling')
{
    $nodes = array();

    // The first node of the crawler does not change during the walk, so it is looked up once.
    $currentNode = $this->getNode(0);

    do {
        if ($node !== $currentNode && \XML_ELEMENT_NODE === $node->nodeType) {
            $nodes[] = $node;
        }
    } while ($node = $node->$siblingDir);

    return $nodes;
}
1099 | |
/**
 * Builds a \DOMXPath for a document and registers the namespaces of the given prefixes on it.
 *
 * Prefixes for which no namespace can be discovered are left unregistered.
 *
 * @param \DOMDocument $document
 * @param array        $prefixes Namespace prefixes to resolve and register
 *
 * @return \DOMXPath
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
{
    $xpathEngine = new \DOMXPath($document);

    foreach ($prefixes as $prefix) {
        $namespaceUri = $this->discoverNamespace($xpathEngine, $prefix);
        if (null === $namespaceUri) {
            continue;
        }

        $xpathEngine->registerNamespace($prefix, $namespaceUri);
    }

    return $xpathEngine;
}
1121 | |
/**
 * Resolves a namespace prefix to its namespace.
 *
 * Manually registered namespaces take precedence; otherwise the document behind
 * $domxpath is searched for a namespace declaration with that name.
 *
 * @param \DOMXPath $domxpath
 * @param string    $prefix
 *
 * @return string|null The namespace, or null when none was found
 */
private function discoverNamespace(\DOMXPath $domxpath, $prefix)
{
    if (isset($this->namespaces[$prefix])) {
        return $this->namespaces[$prefix];
    }

    // The crawler's default prefix is looked up as the namespace declaration with an empty name
    $name = $prefix === $this->defaultNamespacePrefix ? '' : $prefix;

    // ask for one namespace, otherwise we'd get a collection with an item for each node
    $query = sprintf('(//namespace::*[name()="%s"])[last()]', $name);
    $match = $domxpath->query($query)->item(0);

    return $match ? $match->nodeValue : null;
}
1143 | |
/**
 * Extracts the distinct namespace prefixes used in an XPath expression.
 *
 * A prefix is an identifier directly followed by a single colon which is itself
 * followed by a character other than a double quote, a slash or another colon.
 *
 * @param string $xpath
 *
 * @return array The prefixes, de-duplicated with their match positions kept as keys
 */
private function findNamespacePrefixes($xpath)
{
    $found = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches);

    if (!$found) {
        return array();
    }

    return array_unique($matches['prefix']);
}
1157 | |
/**
 * Creates a new crawler for a set of nodes, inheriting this crawler's context.
 *
 * The new instance receives this crawler's URI and base href through the
 * constructor, and its HTML flag, document and registered namespaces are copied over.
 *
 * @param \DOMElement|\DOMElement[]|\DOMNodeList|null $nodes
 *
 * @return static
 */
private function createSubCrawler($nodes)
{
    $subCrawler = new static($nodes, $this->uri, $this->baseHref);

    $subCrawler->namespaces = $this->namespaces;
    $subCrawler->document = $this->document;
    $subCrawler->isHtml = $this->isHtml;

    return $subCrawler;
}
1174 } |