Skip to content

Commit 78de2ce

Browse files
tgalopin authored and fabpot committed
[DomCrawler] Optionally use html5-php to parse HTML
1 parent 7bf22ea commit 78de2ce

File tree

6 files changed

+237
-130
lines changed

6 files changed

+237
-130
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ CHANGELOG
66

77
* Added return of element name (`_name`) in `extract()` method.
88
* Added ability to return a default value in `text()` and `html()` instead of throwing an exception when node is empty.
9+
* When available, the [html5-php library](https://github.com/Masterminds/html5-php) is used to
10+
parse HTML added to a Crawler for better support of HTML5 tags.
911

1012
4.2.0
1113
-----

Crawler.php

Lines changed: 76 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
namespace Symfony\Component\DomCrawler;
1313

14+
use Masterminds\HTML5;
1415
use Symfony\Component\CssSelector\CssSelectorConverter;
1516

1617
/**
@@ -55,15 +56,29 @@ class Crawler implements \Countable, \IteratorAggregate
5556
private $isHtml = true;
5657

5758
/**
58-
* @param mixed $node A Node to use as the base for the crawling
59-
* @param string $uri The current URI
60-
* @param string $baseHref The base href value
59+
* @var HTML5|null
6160
*/
62-
public function __construct($node = null, string $uri = null, string $baseHref = null)
61+
private $html5Parser;
62+
63+
/**
* Creates the crawler, selects the HTML parsing backend and adds the initial node.
*
* When $baseHref is falsy (e.g. null or an empty string), $uri is used as the base href.
*
64+
* @param mixed $node A Node to use as the base for the crawling
65+
* @param string $uri The current URI
66+
* @param string $baseHref The base href value
67+
* @param bool|null $useHtml5Parser Whether the Crawler should use the HTML5 parser (true) or the native DOM parser (false); null uses the HTML5 parser only when the Masterminds\HTML5 class is available
68+
*
* @throws \LogicException When $useHtml5Parser is true but the Masterminds\HTML5 class is not available
*/
69+
public function __construct($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = null)
6370
{
6471
$this->uri = $uri;
6572
$this->baseHref = $baseHref ?: $uri;
6673

74+
// An explicit request for the HTML5 parser must fail loudly when the library is missing.
if ($useHtml5Parser && !class_exists(HTML5::class)) {
75+
throw new \LogicException('Using the DomCrawler HTML5 parser requires the html5-php library. Try running "composer require masterminds/html5".');
76+
}
77+
78+
// null means "auto-detect": fall back to checking whether the HTML5 class exists.
// An explicit false skips this branch, leaving $this->html5Parser unset.
if ($useHtml5Parser ?? class_exists(HTML5::class)) {
79+
// NOTE(review): 'disable_html_ns' presumably keeps parsed elements out of the XHTML namespace - confirm against html5-php docs.
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
80+
}
81+
6782
$this->add($node);
6883
}
6984

@@ -183,29 +198,7 @@ public function addContent($content, $type = null)
183198
*/
184199
public function addHtmlContent($content, $charset = 'UTF-8')
185200
{
186-
$internalErrors = libxml_use_internal_errors(true);
187-
$disableEntities = libxml_disable_entity_loader(true);
188-
189-
$dom = new \DOMDocument('1.0', $charset);
190-
$dom->validateOnParse = true;
191-
192-
set_error_handler(function () { throw new \Exception(); });
193-
194-
try {
195-
// Convert charset to HTML-entities to work around bugs in DOMDocument::loadHTML()
196-
$content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
197-
} catch (\Exception $e) {
198-
}
199-
200-
restore_error_handler();
201-
202-
if ('' !== trim($content)) {
203-
@$dom->loadHTML($content);
204-
}
205-
206-
libxml_use_internal_errors($internalErrors);
207-
libxml_disable_entity_loader($disableEntities);
208-
201+
$dom = null !== $this->html5Parser ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
209202
$this->addDocument($dom);
210203

211204
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@@ -608,6 +601,15 @@ public function html(/* $default = null */)
608601
throw new \InvalidArgumentException('The current node list is empty.');
609602
}
610603

604+
if (null !== $this->html5Parser) {
605+
$html = '';
606+
foreach ($this->getNode(0)->childNodes as $child) {
607+
$html .= $this->html5Parser->saveHTML($child);
608+
}
609+
610+
return $html;
611+
}
612+
611613
$html = '';
612614
foreach ($this->getNode(0)->childNodes as $child) {
613615
$html .= $child->ownerDocument->saveHTML($child);
@@ -1112,6 +1114,53 @@ protected function sibling($node, $siblingDir = 'nextSibling')
11121114
return $nodes;
11131115
}
11141116

1117+
/**
* Parses HTML content with the html5-php parser held in $this->html5Parser.
*
* The content is first passed through convertToHtmlEntities() using $charset.
* This method dereferences $this->html5Parser directly, so it must only be
* called when that property is not null.
*
* NOTE(review): parse() is called with an empty options array and $charset as a
* third argument; confirm that the html5-php parse() signature actually accepts
* and uses a third parameter.
*/
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
1118+
{
1119+
return $this->html5Parser->parse($this->convertToHtmlEntities($htmlContent, $charset), [], $charset);
1120+
}
1121+
1122+
/**
* Parses HTML content with the native \DOMDocument::loadHTML() parser.
*
* The content is first passed through convertToHtmlEntities() using $charset.
* Content that is empty after trim() is not loaded, so an empty document is
* returned in that case. The libxml internal-errors and entity-loader settings
* are switched on for the duration of the parse and restored afterwards.
*/
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
1123+
{
1124+
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
1125+
1126+
// Remember the previous libxml settings so they can be restored below.
$internalErrors = libxml_use_internal_errors(true);
1127+
$disableEntities = libxml_disable_entity_loader(true);
1128+
1129+
$dom = new \DOMDocument('1.0', $charset);
1130+
$dom->validateOnParse = true;
1131+
1132+
if ('' !== trim($htmlContent)) {
1133+
// The @ operator silences warnings raised by loadHTML().
@$dom->loadHTML($htmlContent);
1134+
}
1135+
1136+
// Restore the libxml settings captured before parsing.
libxml_use_internal_errors($internalErrors);
1137+
libxml_disable_entity_loader($disableEntities);
1138+
1139+
return $dom;
1140+
}
1141+
1142+
/**
1143+
* Converts content from the given charset to HTML-entities to ensure valid parsing.
*
* The conversion is best-effort: mb_convert_encoding() is tried first with
* $charset; if that raises an error, the content is transcoded to UTF-8 with
* iconv() and converted from there. If the fallback also fails, the content is
* returned as it stands at that point (unchanged when iconv() itself failed).
* A temporary error handler is installed for the duration of the call and is
* always restored before returning.
1144+
*/
1145+
private function convertToHtmlEntities(string $htmlContent, string $charset = 'UTF-8'): string
1146+
{
1147+
// Turn any PHP error raised by the conversion functions into an exception so it can be caught below.
set_error_handler(function () { throw new \Exception(); });
1148+
1149+
try {
1150+
return mb_convert_encoding($htmlContent, 'HTML-ENTITIES', $charset);
1151+
} catch (\Exception $e) {
1152+
// Fallback: go through UTF-8 via iconv() before converting to HTML-entities.
try {
1153+
$htmlContent = iconv($charset, 'UTF-8', $htmlContent);
1154+
$htmlContent = mb_convert_encoding($htmlContent, 'HTML-ENTITIES', 'UTF-8');
1155+
} catch (\Exception $e) {
// Deliberately ignored: conversion is best-effort, whatever $htmlContent holds now is returned.
1156+
}
1157+
1158+
return $htmlContent;
1159+
} finally {
1160+
// Runs on every exit path, including the early return in the try block.
restore_error_handler();
1161+
}
1162+
}
1163+
11151164
/**
11161165
* @throws \InvalidArgumentException
11171166
*/

0 commit comments

Comments
 (0)