Skip to content

Commit 3ac31ff

Browse files
committed
Merge branch '4.4' into 5.1
* 4.4:
  - Fix for issue #37681
  - Revert changes to Table->fillCells()
  - [Console] Table: support cells with newlines after a cell with colspan >= 2
  - Fix redis connect with empty password
  - [Validator] Add BC layer for notInRangeMessage when min and max are set
2 parents 8aedc51 + 6dd1e7a commit 3ac31ff

File tree

2 files changed

+84
-2
lines changed

2 files changed

+84
-2
lines changed

Crawler.php

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -182,8 +182,7 @@ public function addContent(string $content, string $type = null)
182182
*/
183183
public function addHtmlContent(string $content, string $charset = 'UTF-8')
184184
{
185-
// Use HTML5 parser if the content is HTML5 and the library is available
186-
$dom = null !== $this->html5Parser && strspn($content, " \t\r\n") === stripos($content, '<!doctype html>') ? $this->parseHtml5($content, $charset) : $this->parseXhtml($content, $charset);
185+
$dom = $this->parseHtmlString($content, $charset);
187186
$this->addDocument($dom);
188187

189188
$base = $this->filterRelativeXPath('descendant-or-self::base')->extract(['href']);
@@ -1234,4 +1233,35 @@ private function createCssSelectorConverter(): CssSelectorConverter
12341233

12351234
return new CssSelectorConverter($this->isHtml);
12361235
}
1236+
1237+
/**
 * Turns an HTML string into a DOMDocument.
 *
 * Delegates to parseHtml5() when canParseHtml5String() accepts the content,
 * and to parseXhtml() in every other case.
 *
 * @param string $content The markup to parse
 * @param string $charset The charset handed over to the selected parser
 */
private function parseHtmlString(string $content, string $charset): \DOMDocument
{
    return $this->canParseHtml5String($content)
        ? $this->parseHtml5($content, $charset)
        : $this->parseXhtml($content, $charset);
}
1249+
1250+
/**
 * Decides whether the given content should go through the HTML5 parser.
 *
 * That is the case only when an HTML5 parser is set, the content contains
 * "<!doctype html>" (matched case-insensitively), and whatever precedes that
 * doctype is either nothing at all or accepted by isValidHtml5Heading().
 */
private function canParseHtml5String(string $content): bool
{
    if (null === $this->html5Parser) {
        return false;
    }

    $doctypeOffset = stripos($content, '<!doctype html>');
    if (false === $doctypeOffset) {
        return false;
    }

    // Doctype at the very beginning: there is no leading text to validate.
    if (0 === $doctypeOffset) {
        return true;
    }

    return $this->isValidHtml5Heading(substr($content, 0, $doctypeOffset));
}
1262+
1263+
/**
 * Tells whether the text found in front of the doctype is made only of things
 * that may precede it: an optional BOM (U+FEFF), whitespace, and any number of
 * comments separated by whitespace.
 *
 * A comment runs from "<!--" up to the first "-->", so its text may contain ">"
 * (e.g. "<!-- a > b -->"). Comment text starting with ">" or "->" is rejected.
 *
 * @param string $heading The part of the content located before the doctype
 *
 * @return bool True when the heading matches; false otherwise, including when
 *              preg_match() itself fails and returns false (for instance on a
 *              heading that is not valid UTF-8, which the "u" modifier rejects)
 */
private function isValidHtml5Heading(string $heading): bool
{
    // "s" lets "." span line breaks inside multi-line comments; the tempered,
    // possessive "(?:(?!-->).)*+" consumes comment text up to the real terminator
    // without ever swallowing a "-->" (so text between two comments cannot hide).
    return 1 === preg_match('/^\x{FEFF}?\s*(?:<!--(?!-?>)(?:(?!-->).)*+-->\s*)*$/su', $heading);
}
12371267
}

Tests/Html5ParserCrawlerTest.php

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,4 +25,56 @@ public function testAddHtml5()
2525
$crawler->add($this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>');
2626
$this->assertEquals('Foo', $crawler->filterXPath('//h1')->text(), '->add() adds nodes from a string');
2727
}
28+
29+
/**
 * Content from validHtml5Provider() must yield an <h1> whose text is "Foo"
 * once added through addHtmlContent().
 *
 * @dataProvider validHtml5Provider
 */
public function testHtml5ParserParseContentStartingWithValidHeading(string $content): void
{
    $this->skipTestIfHTML5LibraryNotAvailable();

    $crawler = $this->createCrawler();
    $crawler->addHtmlContent($content);

    $headingText = $crawler->filterXPath('//h1')->text();

    self::assertEquals('Foo', $headingText, '->addHtmlContent() parses valid HTML with comment before doctype');
}
42+
43+
/**
 * Content from invalidHtml5Provider() must leave the <h1> text empty
 * once added through addHtmlContent().
 *
 * @dataProvider invalidHtml5Provider
 */
public function testHtml5ParserWithInvalidHeadedContent(string $content): void
{
    $this->skipTestIfHTML5LibraryNotAvailable();

    $crawler = $this->createCrawler();
    $crawler->addHtmlContent($content);

    $headingText = $crawler->filterXPath('//h1')->text();

    self::assertEmpty($headingText, '->addHtmlContent failed as expected');
}
52+
53+
/**
 * Supplies documents whose doctype is preceded only by a BOM, whitespace
 * and/or comments.
 */
public function validHtml5Provider(): iterable
{
    $document = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';
    // UTF-8 byte order mark (bytes EF BB BF).
    $bom = "\xEF\xBB\xBF";

    yield from [
        'BOM first' => [$bom.$document],
        'Single comment' => ['<!-- comment -->'.$document],
        'Multiline comment' => ["<!-- \n multiline comment \n -->".$document],
        'Several comments' => ['<!--c--> <!--cc-->'.$document],
        'Whitespaces' => [' '.$document],
        'All together' => [$bom.' '.'<!--c-->'.$document],
    ];
}
65+
66+
/**
 * Supplies documents that have plain text in front of the doctype.
 */
public function invalidHtml5Provider(): iterable
{
    $document = $this->getDoctype().'<html><body><h1><p>Foo</p></h1></body></html>';

    yield from [
        'Text' => ['hello world'.$document],
        'Text between comments' => ['<!--c--> test <!--cc-->'.$document],
    ];
}
73+
74+
/**
 * Marks the running test as skipped when the \Masterminds\HTML5 class cannot be found.
 */
private function skipTestIfHTML5LibraryNotAvailable(): void
{
    if (class_exists(\Masterminds\HTML5::class)) {
        return;
    }

    self::markTestSkipped('HTML5 library is not available');
}
2880
}

0 commit comments

Comments
 (0)