Skip to content

Commit 000634e

Browse files
ausi authored and fabpot committed
[DomCrawler] Encode html entities only if necessary
1 parent e3b4806 commit 000634e

File tree

2 files changed

+24
-2
lines changed

2 files changed

+24
-2
lines changed

Crawler.php

Lines changed: 20 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1151,12 +1151,30 @@ protected function sibling(\DOMNode $node, string $siblingDir = 'nextSibling')
11511151

11521152
private function parseHtml5(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
11531153
{
1154-
return $this->html5Parser->parse($this->convertToHtmlEntities($htmlContent, $charset));
1154+
if (!$this->supportsEncoding($charset)) {
1155+
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
1156+
$charset = 'UTF-8';
1157+
}
1158+
1159+
return $this->html5Parser->parse($htmlContent, ['encoding' => $charset]);
1160+
}
1161+
1162+
private function supportsEncoding(string $encoding): bool
1163+
{
1164+
try {
1165+
return '' === @mb_convert_encoding('', $encoding, 'UTF-8');
1166+
} catch (\Throwable $e) {
1167+
return false;
1168+
}
11551169
}
11561170

11571171
private function parseXhtml(string $htmlContent, string $charset = 'UTF-8'): \DOMDocument
11581172
{
1159-
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
1173+
if ('UTF-8' === $charset && preg_match('//u', $htmlContent)) {
1174+
$htmlContent = '<?xml encoding="UTF-8">'.$htmlContent;
1175+
} else {
1176+
$htmlContent = $this->convertToHtmlEntities($htmlContent, $charset);
1177+
}
11601178

11611179
$internalErrors = libxml_use_internal_errors(true);
11621180
if (\LIBXML_VERSION < 20900) {

Tests/AbstractCrawlerTestCase.php

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -194,6 +194,10 @@ public function testAddContent()
194194
$crawler = $this->createCrawler();
195195
$crawler->addContent($this->getDoctype().'<html><meta http-equiv="Content-Type" content="text/html; charset=unicode" /><div class="foo"></html></html>');
196196
$this->assertEquals('foo', $crawler->filterXPath('//div')->attr('class'), '->addContent() ignores bad charset');
197+
198+
$crawler = $this->createCrawler();
199+
$crawler->addContent($this->getDoctype().'<html><script>var foo = "bär";</script></html>', 'text/html; charset=UTF-8');
200+
$this->assertEquals('var foo = "bär";', $crawler->filterXPath('//script')->text(), '->addContent() does not interfere with script content');
197201
}
198202

199203
/**

0 commit comments

Comments
 (0)