Skip to content

Commit 68800eb

Browse files
victor-prdh authored and nicolas-grekas committed
[DomCrawler] Give choice of used parser
1 parent d58c22d commit 68800eb

File tree

4 files changed

+27
-7
lines changed

4 files changed

+27
-7
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ CHANGELOG
44
6.3
55
---
66

7+
* Add `$useHtml5Parser` argument to `Crawler`
78
* Add `CrawlerSelectorCount` test constraint
89
* Add argument `$normalizeWhitespace` to `Crawler::innerText()`
910
* Make `Crawler::innerText()` return the first non-empty text

Crawler.php

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -58,16 +58,17 @@ class Crawler implements \Countable, \IteratorAggregate
5858
*/
5959
private bool $isHtml = true;
6060

61-
private HTML5 $html5Parser;
61+
62+
private ?HTML5 $html5Parser = null;
6263

6364
/**
6465
* @param \DOMNodeList|\DOMNode|\DOMNode[]|string|null $node A Node to use as the base for the crawling
6566
*/
66-
public function __construct(\DOMNodeList|\DOMNode|array|string $node = null, string $uri = null, string $baseHref = null)
67+
public function __construct(\DOMNodeList|\DOMNode|array|string $node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = true)
6768
{
6869
$this->uri = $uri;
6970
$this->baseHref = $baseHref ?: $uri;
70-
$this->html5Parser = new HTML5(['disable_html_ns' => true]);
71+
$this->html5Parser = $useHtml5Parser ? new HTML5(['disable_html_ns' => true]) : null;
7172
$this->cachedNamespaces = new \ArrayObject();
7273

7374
$this->add($node);
@@ -621,7 +622,7 @@ public function html(string $default = null): string
621622
$node = $this->getNode(0);
622623
$owner = $node->ownerDocument;
623624

624-
if ('<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
625+
if ($this->html5Parser && '<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
625626
$owner = $this->html5Parser;
626627
}
627628

@@ -642,7 +643,7 @@ public function outerHtml(): string
642643
$node = $this->getNode(0);
643644
$owner = $node->ownerDocument;
644645

645-
if ('<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
646+
if ($this->html5Parser && '<!DOCTYPE html>' === $owner->saveXML($owner->childNodes[0])) {
646647
$owner = $this->html5Parser;
647648
}
648649

@@ -1215,6 +1216,10 @@ private function parseHtmlString(string $content, string $charset): \DOMDocument
12151216

12161217
private function canParseHtml5String(string $content): bool
12171218
{
1219+
if (!$this->html5Parser) {
1220+
return false;
1221+
}
1222+
12181223
if (false === ($pos = stripos($content, '<!doctype html>'))) {
12191224
return false;
12201225
}

Tests/AbstractCrawlerTestCase.php

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,9 @@ abstract class AbstractCrawlerTestCase extends TestCase
2121
{
2222
abstract public static function getDoctype(): string;
2323

24-
protected function createCrawler($node = null, string $uri = null, string $baseHref = null)
24+
protected function createCrawler($node = null, string $uri = null, string $baseHref = null, bool $useHtml5Parser = true)
2525
{
26-
return new Crawler($node, $uri, $baseHref);
26+
return new Crawler($node, $uri, $baseHref, $useHtml5Parser);
2727
}
2828

2929
public function testConstructor()

Tests/Html5ParserCrawlerTest.php

Lines changed: 14 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,20 @@ public function testHtml5ParserWithInvalidHeadedContent(string $content)
4646
self::assertEmpty($crawler->filterXPath('//h1')->text(), '->addHtmlContent failed as expected');
4747
}
4848

49+
public function testHtml5ParserNotSameAsNativeParserForSpecificHtml()
50+
{
51+
// HTML that triggers a bug specific to the DOM extension (see https://github.com/symfony/symfony/issues/28596)
52+
$html = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
53+
54+
$html5Crawler = $this->createCrawler(null, null, null, true);
55+
$html5Crawler->add($html);
56+
57+
$nativeCrawler = $this->createCrawler(null, null, null, false);
58+
$nativeCrawler->add($html);
59+
60+
$this->assertNotEquals($nativeCrawler->filterXPath('//h1')->text(), $html5Crawler->filterXPath('//h1')->text(), 'Native parser and Html5 parser must be different');
61+
}
62+
4963
public static function validHtml5Provider(): iterable
5064
{
5165
$html = self::getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';

0 commit comments

Comments
 (0)