Crawler.php 22 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795
  1. <?php
  2. /*
  3. * This file is part of the Symfony package.
  4. *
  5. * (c) Fabien Potencier <fabien@symfony.com>
  6. *
  7. * For the full copyright and license information, please view the LICENSE
  8. * file that was distributed with this source code.
  9. */
  10. namespace Symfony\Component\DomCrawler;
  11. use Symfony\Component\CssSelector\CssSelector;
  12. /**
  13. * Crawler eases navigation of a list of \DOMNode objects.
  14. *
  15. * @author Fabien Potencier <fabien@symfony.com>
  16. *
  17. * @api
  18. */
  19. class Crawler extends \SplObjectStorage
  20. {
  21. /**
  22. * @var string The current URI or the base href value
  23. */
  24. protected $uri;
  25. /**
  26. * Constructor.
  27. *
  28. * @param mixed $node A Node to use as the base for the crawling
  29. * @param string $uri The current URI or the base href value
  30. *
  31. * @api
  32. */
  33. public function __construct($node = null, $uri = null)
  34. {
  35. $this->uri = $uri;
  36. $this->add($node);
  37. }
  38. /**
  39. * Removes all the nodes.
  40. *
  41. * @api
  42. */
  43. public function clear()
  44. {
  45. $this->removeAll($this);
  46. }
  47. /**
  48. * Adds a node to the current list of nodes.
  49. *
  50. * This method uses the appropriate specialized add*() method based
  51. * on the type of the argument.
  52. *
  53. * @param \DOMNodeList|\DOMNode|array|string|null $node A node
  54. *
  55. * @throws \InvalidArgumentException When node is not the expected type.
  56. *
  57. * @api
  58. */
  59. public function add($node)
  60. {
  61. if ($node instanceof \DOMNodeList) {
  62. $this->addNodeList($node);
  63. } elseif ($node instanceof \DOMNode) {
  64. $this->addNode($node);
  65. } elseif (is_array($node)) {
  66. $this->addNodes($node);
  67. } elseif (is_string($node)) {
  68. $this->addContent($node);
  69. } elseif (null !== $node) {
  70. throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', is_object($node) ? get_class($node) : gettype($node)));
  71. }
  72. }
  73. /**
  74. * Adds HTML/XML content.
  75. *
  76. * If the charset is not set via the content type, it is assumed
  77. * to be ISO-8859-1, which is the default charset defined by the
  78. * HTTP 1.1 specification.
  79. *
  80. * @param string $content A string to parse as HTML/XML
  81. * @param null|string $type The content type of the string
  82. */
  83. public function addContent($content, $type = null)
  84. {
  85. if (empty($type)) {
  86. $type = 'text/html';
  87. }
  88. // DOM only for HTML/XML content
  89. if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
  90. return null;
  91. }
  92. $charset = null;
  93. if (false !== $pos = strpos($type, 'charset=')) {
  94. $charset = substr($type, $pos + 8);
  95. if (false !== $pos = strpos($charset, ';')) {
  96. $charset = substr($charset, 0, $pos);
  97. }
  98. }
  99. if (null === $charset &&
  100. preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9]+)/i', $content, $matches)) {
  101. $charset = $matches[1];
  102. }
  103. if (null === $charset) {
  104. $charset = 'ISO-8859-1';
  105. }
  106. if ('x' === $xmlMatches[1]) {
  107. $this->addXmlContent($content, $charset);
  108. } else {
  109. $this->addHtmlContent($content, $charset);
  110. }
  111. }
  112. /**
  113. * Adds an HTML content to the list of nodes.
  114. *
  115. * The libxml errors are disabled when the content is parsed.
  116. *
  117. * If you want to get parsing errors, be sure to enable
  118. * internal errors via libxml_use_internal_errors(true)
  119. * and then, get the errors via libxml_get_errors(). Be
  120. * sure to clear errors with libxml_clear_errors() afterward.
  121. *
  122. * @param string $content The HTML content
  123. * @param string $charset The charset
  124. *
  125. * @api
  126. */
  127. public function addHtmlContent($content, $charset = 'UTF-8')
  128. {
  129. $current = libxml_use_internal_errors(true);
  130. $disableEntities = libxml_disable_entity_loader(true);
  131. $dom = new \DOMDocument('1.0', $charset);
  132. $dom->validateOnParse = true;
  133. if (function_exists('mb_convert_encoding') && in_array(strtolower($charset), array_map('strtolower', mb_list_encodings()))) {
  134. $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
  135. }
  136. @$dom->loadHTML($content);
  137. libxml_use_internal_errors($current);
  138. libxml_disable_entity_loader($disableEntities);
  139. $this->addDocument($dom);
  140. $base = $this->filterXPath('descendant-or-self::base')->extract(array('href'));
  141. $baseHref = current($base);
  142. if (count($base) && !empty($baseHref)) {
  143. if ($this->uri) {
  144. $linkNode = $dom->createElement('a');
  145. $linkNode->setAttribute('href', $baseHref);
  146. $link = new Link($linkNode, $this->uri);
  147. $this->uri = $link->getUri();
  148. } else {
  149. $this->uri = $baseHref;
  150. }
  151. }
  152. }
  153. /**
  154. * Adds an XML content to the list of nodes.
  155. *
  156. * The libxml errors are disabled when the content is parsed.
  157. *
  158. * If you want to get parsing errors, be sure to enable
  159. * internal errors via libxml_use_internal_errors(true)
  160. * and then, get the errors via libxml_get_errors(). Be
  161. * sure to clear errors with libxml_clear_errors() afterward.
  162. *
  163. * @param string $content The XML content
  164. * @param string $charset The charset
  165. *
  166. * @api
  167. */
  168. public function addXmlContent($content, $charset = 'UTF-8')
  169. {
  170. $current = libxml_use_internal_errors(true);
  171. $disableEntities = libxml_disable_entity_loader(true);
  172. $dom = new \DOMDocument('1.0', $charset);
  173. $dom->validateOnParse = true;
  174. // remove the default namespace to make XPath expressions simpler
  175. @$dom->loadXML(str_replace('xmlns', 'ns', $content), LIBXML_NONET);
  176. libxml_use_internal_errors($current);
  177. libxml_disable_entity_loader($disableEntities);
  178. $this->addDocument($dom);
  179. }
  180. /**
  181. * Adds a \DOMDocument to the list of nodes.
  182. *
  183. * @param \DOMDocument $dom A \DOMDocument instance
  184. *
  185. * @api
  186. */
  187. public function addDocument(\DOMDocument $dom)
  188. {
  189. if ($dom->documentElement) {
  190. $this->addNode($dom->documentElement);
  191. }
  192. }
  193. /**
  194. * Adds a \DOMNodeList to the list of nodes.
  195. *
  196. * @param \DOMNodeList $nodes A \DOMNodeList instance
  197. *
  198. * @api
  199. */
  200. public function addNodeList(\DOMNodeList $nodes)
  201. {
  202. foreach ($nodes as $node) {
  203. $this->addNode($node);
  204. }
  205. }
  206. /**
  207. * Adds an array of \DOMNode instances to the list of nodes.
  208. *
  209. * @param \DOMNode[] $nodes An array of \DOMNode instances
  210. *
  211. * @api
  212. */
  213. public function addNodes(array $nodes)
  214. {
  215. foreach ($nodes as $node) {
  216. $this->add($node);
  217. }
  218. }
  219. /**
  220. * Adds a \DOMNode instance to the list of nodes.
  221. *
  222. * @param \DOMNode $node A \DOMNode instance
  223. *
  224. * @api
  225. */
  226. public function addNode(\DOMNode $node)
  227. {
  228. if ($node instanceof \DOMDocument) {
  229. $this->attach($node->documentElement);
  230. } else {
  231. $this->attach($node);
  232. }
  233. }
  234. /**
  235. * Returns a node given its position in the node list.
  236. *
  237. * @param integer $position The position
  238. *
  239. * @return Crawler A new instance of the Crawler with the selected node, or an empty Crawler if it does not exist.
  240. *
  241. * @api
  242. */
  243. public function eq($position)
  244. {
  245. foreach ($this as $i => $node) {
  246. if ($i == $position) {
  247. return new static($node, $this->uri);
  248. }
  249. }
  250. return new static(null, $this->uri);
  251. }
  252. /**
  253. * Calls an anonymous function on each node of the list.
  254. *
  255. * The anonymous function receives the position and the node wrapped
  256. * in a Crawler instance as arguments.
  257. *
  258. * Example:
  259. *
  260. * $crawler->filter('h1')->each(function ($node, $i) {
  261. * return $node->text();
  262. * });
  263. *
  264. * @param \Closure $closure An anonymous function
  265. *
  266. * @return array An array of values returned by the anonymous function
  267. *
  268. * @api
  269. */
  270. public function each(\Closure $closure)
  271. {
  272. $data = array();
  273. foreach ($this as $i => $node) {
  274. $data[] = $closure(new static($node, $this->uri), $i);
  275. }
  276. return $data;
  277. }
  278. /**
  279. * Reduces the list of nodes by calling an anonymous function.
  280. *
  281. * To remove a node from the list, the anonymous function must return false.
  282. *
  283. * @param \Closure $closure An anonymous function
  284. *
  285. * @return Crawler A Crawler instance with the selected nodes.
  286. *
  287. * @api
  288. */
  289. public function reduce(\Closure $closure)
  290. {
  291. $nodes = array();
  292. foreach ($this as $i => $node) {
  293. if (false !== $closure(new static($node, $this->uri), $i)) {
  294. $nodes[] = $node;
  295. }
  296. }
  297. return new static($nodes, $this->uri);
  298. }
  299. /**
  300. * Returns the first node of the current selection
  301. *
  302. * @return Crawler A Crawler instance with the first selected node
  303. *
  304. * @api
  305. */
  306. public function first()
  307. {
  308. return $this->eq(0);
  309. }
  310. /**
  311. * Returns the last node of the current selection
  312. *
  313. * @return Crawler A Crawler instance with the last selected node
  314. *
  315. * @api
  316. */
  317. public function last()
  318. {
  319. return $this->eq(count($this) - 1);
  320. }
  321. /**
  322. * Returns the siblings nodes of the current selection
  323. *
  324. * @return Crawler A Crawler instance with the sibling nodes
  325. *
  326. * @throws \InvalidArgumentException When current node is empty
  327. *
  328. * @api
  329. */
  330. public function siblings()
  331. {
  332. if (!count($this)) {
  333. throw new \InvalidArgumentException('The current node list is empty.');
  334. }
  335. return new static($this->sibling($this->getNode(0)->parentNode->firstChild), $this->uri);
  336. }
  337. /**
  338. * Returns the next siblings nodes of the current selection
  339. *
  340. * @return Crawler A Crawler instance with the next sibling nodes
  341. *
  342. * @throws \InvalidArgumentException When current node is empty
  343. *
  344. * @api
  345. */
  346. public function nextAll()
  347. {
  348. if (!count($this)) {
  349. throw new \InvalidArgumentException('The current node list is empty.');
  350. }
  351. return new static($this->sibling($this->getNode(0)), $this->uri);
  352. }
  353. /**
  354. * Returns the previous sibling nodes of the current selection
  355. *
  356. * @return Crawler A Crawler instance with the previous sibling nodes
  357. *
  358. * @throws \InvalidArgumentException
  359. *
  360. * @api
  361. */
  362. public function previousAll()
  363. {
  364. if (!count($this)) {
  365. throw new \InvalidArgumentException('The current node list is empty.');
  366. }
  367. return new static($this->sibling($this->getNode(0), 'previousSibling'), $this->uri);
  368. }
  369. /**
  370. * Returns the parents nodes of the current selection
  371. *
  372. * @return Crawler A Crawler instance with the parents nodes of the current selection
  373. *
  374. * @throws \InvalidArgumentException When current node is empty
  375. *
  376. * @api
  377. */
  378. public function parents()
  379. {
  380. if (!count($this)) {
  381. throw new \InvalidArgumentException('The current node list is empty.');
  382. }
  383. $node = $this->getNode(0);
  384. $nodes = array();
  385. while ($node = $node->parentNode) {
  386. if (1 === $node->nodeType && '_root' !== $node->nodeName) {
  387. $nodes[] = $node;
  388. }
  389. }
  390. return new static($nodes, $this->uri);
  391. }
  392. /**
  393. * Returns the children nodes of the current selection
  394. *
  395. * @return Crawler A Crawler instance with the children nodes
  396. *
  397. * @throws \InvalidArgumentException When current node is empty
  398. *
  399. * @api
  400. */
  401. public function children()
  402. {
  403. if (!count($this)) {
  404. throw new \InvalidArgumentException('The current node list is empty.');
  405. }
  406. $node = $this->getNode(0)->firstChild;
  407. return new static($node ? $this->sibling($node) : array(), $this->uri);
  408. }
  409. /**
  410. * Returns the attribute value of the first node of the list.
  411. *
  412. * @param string $attribute The attribute name
  413. *
  414. * @return string The attribute value
  415. *
  416. * @throws \InvalidArgumentException When current node is empty
  417. *
  418. * @api
  419. */
  420. public function attr($attribute)
  421. {
  422. if (!count($this)) {
  423. throw new \InvalidArgumentException('The current node list is empty.');
  424. }
  425. return $this->getNode(0)->getAttribute($attribute);
  426. }
  427. /**
  428. * Returns the node value of the first node of the list.
  429. *
  430. * @return string The node value
  431. *
  432. * @throws \InvalidArgumentException When current node is empty
  433. *
  434. * @api
  435. */
  436. public function text()
  437. {
  438. if (!count($this)) {
  439. throw new \InvalidArgumentException('The current node list is empty.');
  440. }
  441. return $this->getNode(0)->nodeValue;
  442. }
  443. /**
  444. * Returns the first node of the list as HTML.
  445. *
  446. * @return string The node html
  447. *
  448. * @throws \InvalidArgumentException When current node is empty
  449. */
  450. public function html()
  451. {
  452. if (!count($this)) {
  453. throw new \InvalidArgumentException('The current node list is empty.');
  454. }
  455. $html = '';
  456. foreach ($this->getNode(0)->childNodes as $child) {
  457. if (version_compare(PHP_VERSION, '5.3.6', '>=')) {
  458. // node parameter was added to the saveHTML() method in PHP 5.3.6
  459. // @see http://php.net/manual/en/domdocument.savehtml.php
  460. $html .= $child->ownerDocument->saveHTML($child);
  461. } else {
  462. $document = new \DOMDocument('1.0', 'UTF-8');
  463. $document->appendChild($document->importNode($child, true));
  464. $html .= rtrim($document->saveHTML());
  465. }
  466. }
  467. return $html;
  468. }
  469. /**
  470. * Extracts information from the list of nodes.
  471. *
  472. * You can extract attributes or/and the node value (_text).
  473. *
  474. * Example:
  475. *
  476. * $crawler->filter('h1 a')->extract(array('_text', 'href'));
  477. *
  478. * @param array $attributes An array of attributes
  479. *
  480. * @return array An array of extracted values
  481. *
  482. * @api
  483. */
  484. public function extract($attributes)
  485. {
  486. $attributes = (array) $attributes;
  487. $data = array();
  488. foreach ($this as $node) {
  489. $elements = array();
  490. foreach ($attributes as $attribute) {
  491. if ('_text' === $attribute) {
  492. $elements[] = $node->nodeValue;
  493. } else {
  494. $elements[] = $node->getAttribute($attribute);
  495. }
  496. }
  497. $data[] = count($attributes) > 1 ? $elements : $elements[0];
  498. }
  499. return $data;
  500. }
  501. /**
  502. * Filters the list of nodes with an XPath expression.
  503. *
  504. * @param string $xpath An XPath expression
  505. *
  506. * @return Crawler A new instance of Crawler with the filtered list of nodes
  507. *
  508. * @api
  509. */
  510. public function filterXPath($xpath)
  511. {
  512. $document = new \DOMDocument('1.0', 'UTF-8');
  513. $root = $document->appendChild($document->createElement('_root'));
  514. foreach ($this as $node) {
  515. $root->appendChild($document->importNode($node, true));
  516. }
  517. $domxpath = new \DOMXPath($document);
  518. return new static($domxpath->query($xpath), $this->uri);
  519. }
  520. /**
  521. * Filters the list of nodes with a CSS selector.
  522. *
  523. * This method only works if you have installed the CssSelector Symfony Component.
  524. *
  525. * @param string $selector A CSS selector
  526. *
  527. * @return Crawler A new instance of Crawler with the filtered list of nodes
  528. *
  529. * @throws \RuntimeException if the CssSelector Component is not available
  530. *
  531. * @api
  532. */
  533. public function filter($selector)
  534. {
  535. if (!class_exists('Symfony\\Component\\CssSelector\\CssSelector')) {
  536. // @codeCoverageIgnoreStart
  537. throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector is not installed (you can use filterXPath instead).');
  538. // @codeCoverageIgnoreEnd
  539. }
  540. return $this->filterXPath(CssSelector::toXPath($selector));
  541. }
  542. /**
  543. * Selects links by name or alt value for clickable images.
  544. *
  545. * @param string $value The link text
  546. *
  547. * @return Crawler A new instance of Crawler with the filtered list of nodes
  548. *
  549. * @api
  550. */
  551. public function selectLink($value)
  552. {
  553. $xpath = sprintf('//a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s)] ', static::xpathLiteral(' '.$value.' ')).
  554. sprintf('| //a/img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]/ancestor::a', static::xpathLiteral(' '.$value.' '));
  555. return $this->filterXPath($xpath);
  556. }
  557. /**
  558. * Selects a button by name or alt value for images.
  559. *
  560. * @param string $value The button text
  561. *
  562. * @return Crawler A new instance of Crawler with the filtered list of nodes
  563. *
  564. * @api
  565. */
  566. public function selectButton($value)
  567. {
  568. $xpath = sprintf('//input[((@type="submit" or @type="button") and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', static::xpathLiteral(' '.$value.' ')).
  569. sprintf('or (@type="image" and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id="%s" or @name="%s"] ', static::xpathLiteral(' '.$value.' '), $value, $value).
  570. sprintf('| //button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id="%s" or @name="%s"]', static::xpathLiteral(' '.$value.' '), $value, $value);
  571. return $this->filterXPath($xpath);
  572. }
  573. /**
  574. * Returns a Link object for the first node in the list.
  575. *
  576. * @param string $method The method for the link (get by default)
  577. *
  578. * @return Link A Link instance
  579. *
  580. * @throws \InvalidArgumentException If the current node list is empty
  581. *
  582. * @api
  583. */
  584. public function link($method = 'get')
  585. {
  586. if (!count($this)) {
  587. throw new \InvalidArgumentException('The current node list is empty.');
  588. }
  589. $node = $this->getNode(0);
  590. return new Link($node, $this->uri, $method);
  591. }
  592. /**
  593. * Returns an array of Link objects for the nodes in the list.
  594. *
  595. * @return Link[] An array of Link instances
  596. *
  597. * @api
  598. */
  599. public function links()
  600. {
  601. $links = array();
  602. foreach ($this as $node) {
  603. $links[] = new Link($node, $this->uri, 'get');
  604. }
  605. return $links;
  606. }
  607. /**
  608. * Returns a Form object for the first node in the list.
  609. *
  610. * @param array $values An array of values for the form fields
  611. * @param string $method The method for the form
  612. *
  613. * @return Form A Form instance
  614. *
  615. * @throws \InvalidArgumentException If the current node list is empty
  616. *
  617. * @api
  618. */
  619. public function form(array $values = null, $method = null)
  620. {
  621. if (!count($this)) {
  622. throw new \InvalidArgumentException('The current node list is empty.');
  623. }
  624. $form = new Form($this->getNode(0), $this->uri, $method);
  625. if (null !== $values) {
  626. $form->setValues($values);
  627. }
  628. return $form;
  629. }
  630. /**
  631. * Converts string for XPath expressions.
  632. *
  633. * Escaped characters are: quotes (") and apostrophe (').
  634. *
  635. * Examples:
  636. * <code>
  637. * echo Crawler::xpathLiteral('foo " bar');
  638. * //prints 'foo " bar'
  639. *
  640. * echo Crawler::xpathLiteral("foo ' bar");
  641. * //prints "foo ' bar"
  642. *
  643. * echo Crawler::xpathLiteral('a\'b"c');
  644. * //prints concat('a', "'", 'b"c')
  645. * </code>
  646. *
  647. * @param string $s String to be escaped
  648. *
  649. * @return string Converted string
  650. */
  651. public static function xpathLiteral($s)
  652. {
  653. if (false === strpos($s, "'")) {
  654. return sprintf("'%s'", $s);
  655. }
  656. if (false === strpos($s, '"')) {
  657. return sprintf('"%s"', $s);
  658. }
  659. $string = $s;
  660. $parts = array();
  661. while (true) {
  662. if (false !== $pos = strpos($string, "'")) {
  663. $parts[] = sprintf("'%s'", substr($string, 0, $pos));
  664. $parts[] = "\"'\"";
  665. $string = substr($string, $pos + 1);
  666. } else {
  667. $parts[] = "'$string'";
  668. break;
  669. }
  670. }
  671. return sprintf("concat(%s)", implode($parts, ', '));
  672. }
  673. /**
  674. * @param integer $position
  675. *
  676. * @return \DOMElement|null
  677. */
  678. protected function getNode($position)
  679. {
  680. foreach ($this as $i => $node) {
  681. if ($i == $position) {
  682. return $node;
  683. }
  684. // @codeCoverageIgnoreStart
  685. }
  686. return null;
  687. // @codeCoverageIgnoreEnd
  688. }
  689. /**
  690. * @param \DOMElement $node
  691. * @param string $siblingDir
  692. *
  693. * @return array
  694. */
  695. protected function sibling($node, $siblingDir = 'nextSibling')
  696. {
  697. $nodes = array();
  698. do {
  699. if ($node !== $this->getNode(0) && $node->nodeType === 1) {
  700. $nodes[] = $node;
  701. }
  702. } while ($node = $node->$siblingDir);
  703. return $nodes;
  704. }
  705. }