|
11 | 11 |
|
12 | 12 | namespace Symfony\Component\DomCrawler;
|
13 | 13 |
|
| 14 | +use Masterminds\HTML5; |
14 | 15 | use Symfony\Component\CssSelector\CssSelectorConverter;
|
15 | 16 |
|
16 | 17 | /**
|
@@ -55,15 +56,29 @@ class Crawler implements \Countable, \IteratorAggregate
|
55 | 56 | private $isHtml = true;
|
56 | 57 |
|
57 | 58 | /**
|
58 |
| - * @param mixed $node A Node to use as the base for the crawling |
59 |
| - * @param string $uri The current URI |
60 |
| - * @param string $baseHref The base href value |
| 59 | + * @var HTML5|null |
61 | 60 | */
|
62 |
| - public function __construct($node = null, string $uri = null, string $baseHref = null) |
| 61 | + private $html5Parser; |
| 62 | + |
/**
 * Stores the URI context, selects the HTML parser backend and passes $node to add().
 *
 * @param mixed     $node           A Node to use as the base for the crawling
 * @param string    $uri            The current URI
 * @param string    $baseHref       The base href value (falls back to $uri when empty)
 * @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser or the native DOM parser;
 *                                  null selects the HTML5 parser only when the html5-php library is available
 *
 * @throws \LogicException When the HTML5 parser is explicitly requested but html5-php is not installed
 */
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null)
{
    $this->uri = $uri;
    $this->baseHref = $baseHref ?: $uri;

    if (null === $useHtml5Parser) {
        // No explicit choice was made: opt in automatically when the library can be loaded.
        $useHtml5Parser = class_exists(HTML5::class);
    } elseif ($useHtml5Parser && !class_exists(HTML5::class)) {
        // An explicit request cannot be honoured without the library.
        throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
    }

    if ($useHtml5Parser) {
        $this->html5Parser = new HTML5(['disable_html_ns' => true]);
    }

    $this->add($node);
}
|
69 | 84 |
|
@@ -183,29 +198,7 @@ public function addContent($content, $type = null)
|
183 | 198 | */
|
184 | 199 | public function addHtmlContent($content, $charset = 'UTF-8')
|
185 | 200 | {
|
186 |
| - $internalErrors = libxml_use_internal_errors(true); |
187 |
| - $disableEntities = libxml_disable_entity_loader(true); |
188 |
| - |
189 |
| - $dom = new \DOMDocument('1.0', $charset); |
190 |
| - $dom->validateOnParse = true; |
191 |
| - |
192 |
| - set_error_handler(function () { throw new \Exception(); }); |
193 |
| - |
194 |
| - try { |
195 |
| - // Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML() |
196 |
| - $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset); |
197 |
| - } catch (\Exception $e) { |
198 |
| - } |
199 |
| - |
200 |
| - restore_error_handler(); |
201 |
| - |
202 |
| - if ('' !== trim($content)) { |
203 |
| - @$dom->loadHTML($content); |
204 |
| - } |
205 |
| - |
206 |
| - libxml_use_internal_errors($internalErrors); |
207 |
| - libxml_disable_entity_loader($disableEntities); |
208 |
| - |
| 201 | + $dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset); |
209 | 202 | $this->addDocument($dom);
|
210 | 203 |
|
211 | 204 | $base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
|
@@ -608,6 +601,15 @@ public function html(/* $default = null */)
|
608 | 601 | throw new \InvalidArgumentException('The current node list is empty.');
|
609 | 602 | }
|
610 | 603 |
|
| 604 | + if (null !== $this->html5Parser) { |
| 605 | + $html = ''; |
| 606 | + foreach ($this->getNode(0)->childNodes as $child) { |
| 607 | + $html .= $this->html5Parser->saveHTML($child); |
| 608 | + } |
| 609 | + |
| 610 | + return $html; |
| 611 | + } |
| 612 | + |
611 | 613 | $html = '';
|
612 | 614 | foreach ($this->getNode(0)->childNodes as $child) {
|
613 | 615 | $html .= $child->ownerDocument->saveHTML($child);
|
@@ -1112,6 +1114,53 @@ protected function sibling($node, $siblingDir = 'nextSibling')
|
1112 | 1114 | return $nodes;
|
1113 | 1115 | }
|
1114 | 1116 |
|
/**
 * Parses HTML markup with the parser stored in $this->html5Parser.
 *
 * The markup is first run through convertToHtmlEntities() using the given charset.
 */
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $entityEncoded = $this->convertToHtmlEntities($htmlContent, $charset);

    return $this->html5Parser->parse($entityEncoded, [], $charset);
}
| 1121 | + |
/**
 * Parses HTML markup with the native DOMDocument::loadHTML() parser.
 *
 * The markup is first run through convertToHtmlEntities(). Content that is empty
 * after trimming is not parsed, so an empty document is returned for it. The libxml
 * error-handling and entity-loader settings changed here are restored before
 * returning, even if parsing throws.
 */
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
{
    $htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);

    // libxml_disable_entity_loader() is deprecated as of PHP 8.0 and is only needed
    // with libxml < 2.9.0; newer versions do not load external entities by default.
    $toggleEntityLoader = \LIBXML_VERSION < 20900;

    $internalErrors = libxml_use_internal_errors(true);
    $disableEntities = $toggleEntityLoader ? libxml_disable_entity_loader(true) : null;

    try {
        $dom = new \DOMDocument('1.0', $charset);
        $dom->validateOnParse = true;

        if ('' !== trim($htmlContent)) {
            @$dom->loadHTML($htmlContent);
        }

        return $dom;
    } finally {
        // Put the process-wide libxml settings back exactly as they were found.
        libxml_use_internal_errors($internalErrors);
        if ($toggleEntityLoader) {
            libxml_disable_entity_loader($disableEntities);
        }
    }
}
| 1141 | + |
/**
 * Replaces every character outside the ASCII range with a numeric HTML entity
 * so the parsers receive charset-independent markup.
 *
 * Numeric entities are produced with mb_encode_numericentity() because converting
 * to the "HTML-ENTITIES" mbstring encoding is deprecated as of PHP 8.2; with the
 * throwing error handler installed below, that deprecation would abort the conversion.
 *
 * This is a best-effort conversion: if mbstring cannot handle $charset, the content is
 * transcoded to UTF-8 with iconv first; if that fails as well, the content is returned
 * as far as it could be converted.
 */
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
{
    // Conversion map [first, last, offset, mask]: encode U+0080 through U+10FFFF.
    $convmap = [0x80, 0x10FFFF, 0, 0x1FFFFF];

    // Turn warnings/notices raised by mbstring and iconv into catchable exceptions.
    set_error_handler(function () { throw new \Exception(); });

    try {
        try {
            return mb_encode_numericentity($htmlContent, $convmap, $charset);
        } catch (\Throwable $e) {
            // Covers both the handler's \Exception and the \ValueError PHP 8 throws
            // for a charset unknown to mbstring; fall through to the iconv route.
        }

        try {
            $htmlContent = iconv($charset, 'UTF-8', $htmlContent);
            $htmlContent = mb_encode_numericentity($htmlContent, $convmap, 'UTF-8');
        } catch (\Throwable $e) {
            // Deliberately ignored: keep whatever has been converted so far.
        }

        return $htmlContent;
    } finally {
        restore_error_handler();
    }
}
| 1163 | + |
1115 | 1164 | /**
|
1116 | 1165 | * @throws \InvalidArgumentException
|
1117 | 1166 | */
|
|
0 commit comments