From 4d14e8c001a53f6574a9d46b906df569ca1d1d8a Mon Sep 17 00:00:00 2001 From: Kieran Brahney Date: Thu, 21 May 2020 18:36:33 +0100 Subject: [PATCH 1/4] Fixed #166 --- src/HTML5/Parser/DOMTreeBuilder.php | 25 +++++++++------- test/HTML5/Html5Test.php | 46 +++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 10 deletions(-) diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 293d83e..b04356e 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -322,6 +322,12 @@ public function startTag($name, $attributes = array(), $selfClosing = false) break; } + // Case when no exists, note section on 'Anything else' below. + // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode + if ($this->insertMode === static::IM_AFTER_HEAD && 'head' !== $name && 'body' !== $name) { + $this->startTag('body'); + } + // Special case handling for SVG. if ($this->insertMode === static::IM_IN_SVG) { $lname = Elements::normalizeSvgElement($lname); @@ -556,21 +562,20 @@ public function comment($cdata) public function text($data) { - // XXX: Hmmm.... should we really be this strict? + // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode if ($this->insertMode < static::IM_IN_HEAD) { // Per '8.2.5.4.3 The "before head" insertion mode' the characters - // " \t\n\r\f" should be ignored but no mention of a parse error. This is - // practical as most documents contain these characters. Other text is not - // expected here so recording a parse error is necessary. + // " \t\n\r\f" should be ignored . $dataTmp = trim($data, " \t\n\r\f"); - if (!empty($dataTmp)) { - // fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode); - $this->parseError('Unexpected text. Ignoring: ' . $dataTmp); + if (! empty($dataTmp)) { + $this->startTag('head'); + $this->endTag('head'); + $this->startTag('body'); + } else { + return; } - - return; } - // fprintf(STDOUT, "Appending text %s.", $data); + $node = $this->doc->createTextNode($data); $this->current->appendChild($node); } diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php index 1887a8d..86ccff5 100644 --- a/test/HTML5/Html5Test.php +++ b/test/HTML5/Html5Test.php @@ -492,4 +492,50 @@ public function testAnchorTargetQueryParam() $res ); } + + /** + * Test for issue #166. + * + * @param $input + * @param $expected + * + * @dataProvider tagOmissionProvider + */ + public function testTagOmission($input, $expected) + { + $doc = $this->html5->loadHTML($input); + + $out = $this->html5->saveHTML($doc); + + $this->assertRegExp("|" . preg_quote($expected, "|") . "|", $out); + } + + /** + * Tag omission test cases. + * + * @return \string[][] + */ + public function tagOmissionProvider() + { + return $provider = array( + array( + 'Hello, This is a test.
Does it work this time?', + 'Hello, This is a test.
Does it work this time?', + ), + // test whitespace (\n) + array( + ' + + + +
+ +', + ' + +
+' + ), + ); + } } From 585278e14a108432f648b0d48144f31d09d166e8 Mon Sep 17 00:00:00 2001 From: Asmir Mustafic Date: Wed, 11 Jan 2023 21:04:34 +0100 Subject: [PATCH 2/4] split test --- src/HTML5/Parser/DOMTreeBuilder.php | 2 +- test/HTML5/Html5Test.php | 4 ++-- test/HTML5/Parser/DOMTreeBuilderTest.php | 9 ++++++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index b04356e..8cd4831 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -567,7 +567,7 @@ public function text($data) // Per '8.2.5.4.3 The "before head" insertion mode' the characters // " \t\n\r\f" should be ignored . $dataTmp = trim($data, " \t\n\r\f"); - if (! empty($dataTmp)) { + if (!empty($dataTmp)) { $this->startTag('head'); $this->endTag('head'); $this->startTag('body'); diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php index 86ccff5..6311941 100644 --- a/test/HTML5/Html5Test.php +++ b/test/HTML5/Html5Test.php @@ -507,7 +507,7 @@ public function testTagOmission($input, $expected) $out = $this->html5->saveHTML($doc); - $this->assertRegExp("|" . preg_quote($expected, "|") . "|", $out); + $this->assertRegExp('|' . preg_quote($expected, '|') . '|', $out); } /** @@ -534,7 +534,7 @@ public function tagOmissionProvider() '
-' +', ), ); } diff --git a/test/HTML5/Parser/DOMTreeBuilderTest.php b/test/HTML5/Parser/DOMTreeBuilderTest.php index 659378c..0183aed 100644 --- a/test/HTML5/Parser/DOMTreeBuilderTest.php +++ b/test/HTML5/Parser/DOMTreeBuilderTest.php @@ -457,14 +457,17 @@ public function testText() $data = $wrapper->childNodes->item(0); $this->assertEquals(XML_TEXT_NODE, $data->nodeType); $this->assertEquals('test', $data->data); + } + public function testTextBeforeHeadNotAllowed() + { // The DomTreeBuilder has special handling for text when in before head mode. - $html = ' - Foo'; + $html = 'Foo'; $doc = $this->parse($html); - $this->assertEquals('Line 0, Col 0: Unexpected text. Ignoring: Foo', $this->errors[0]); + $this->assertEquals('Line 0, Col 0: Unexpected head tag outside of head context.', $this->errors[0]); $headElement = $doc->documentElement->firstChild; $this->assertEquals('head', $headElement->tagName); + $this->assertXmlStringEqualsXmlString($doc, 'Foo'); } public function testParseErrors() From a758ac092649e0cb4e7cdd6351f4a0ac06bd2952 Mon Sep 17 00:00:00 2001 From: Asmir Mustafic Date: Wed, 18 Jan 2023 07:36:45 +0100 Subject: [PATCH 3/4] invalid start tags --- src/HTML5/Parser/DOMTreeBuilder.php | 30 +++++++++++++++++++++--- test/HTML5/Html5Test.php | 13 ++++------ test/HTML5/Parser/DOMTreeBuilderTest.php | 29 ++++++++++++++++++----- 3 files changed, 55 insertions(+), 17 deletions(-) diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index 8cd4831..b17a5aa 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -302,12 +302,28 @@ public function startTag($name, $attributes = array(), $selfClosing = false) case 'head': if ($this->insertMode > static::IM_BEFORE_HEAD) { $this->parseError('Unexpected head tag outside of head context.'); + // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody + // A start tag whose tag name is one of: "caption", "col", "colgroup", "frame", "head", "tbody", + // "td", "tfoot", "th", "thead", "tr" + // Parse error. Ignore the token. + return 0; } else { $this->insertMode = static::IM_IN_HEAD; } break; case 'body': - $this->insertMode = static::IM_IN_BODY; + if ($this->insertMode >= static::IM_IN_BODY) { + $this->parseError('Unexpected body tag outside of body context.'); + // https://html.spec.whatwg.org/multipage/parsing.html#parsing-main-inbody + // A start tag whose tag name is "body" + // Parse error. + // If the second element on the stack of open elements is not a body element, if the stack of open elements has only one node on it, or if there is a template element on the stack of open elements, then ignore the token. (fragment case) + // Otherwise, set the frameset-ok flag to "not ok"; then, for each attribute on the token, check to see if the attribute is already present on the body element (the second element) on the stack of open elements, and if it is not, add the attribute and its corresponding value to that element. + return 0; + } else { + $this->insertMode = static::IM_IN_BODY; + } + break; case 'svg': $this->insertMode = static::IM_IN_SVG; @@ -541,10 +557,18 @@ public function endTag($name) switch ($lname) { case 'head': - $this->insertMode = static::IM_AFTER_HEAD; + if ($this->insertMode <= static::IM_AFTER_HEAD) { + $this->insertMode = static::IM_AFTER_HEAD; + } else { + $this->parseError('Closing head tag encountered but not in head context.'); + } break; case 'body': - $this->insertMode = static::IM_AFTER_BODY; + if ($this->insertMode <= static::IM_AFTER_BODY || $this->insertMode >= static::IM_IN_SVG) { + $this->insertMode = static::IM_AFTER_BODY; + } else { + $this->parseError('Closing body tag encountered but not in body context.'); + } break; case 'svg': case 'mathml': diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php index 6311941..c78a4b9 100644 --- a/test/HTML5/Html5Test.php +++ b/test/HTML5/Html5Test.php @@ -98,9 +98,9 @@ public function testEncodingUtf8() public function testEncodingWindows1252() { - $dom = $this->html5->load(__DIR__ . '/Fixtures/encoding/windows-1252.html', array( + $dom = $this->html5->load(__DIR__ . '/Fixtures/encoding/windows-1252.html', [ 'encoding' => 'Windows-1252', - )); + ]); $this->assertInstanceOf('\DOMDocument', $dom); $this->assertEmpty($this->html5->getErrors()); @@ -496,17 +496,14 @@ public function testAnchorTargetQueryParam() /** * Test for issue #166. * - * @param $input - * @param $expected - * * @dataProvider tagOmissionProvider */ public function testTagOmission($input, $expected) { $doc = $this->html5->loadHTML($input); + $this->assertCount(0, $this->html5->getErrors()); $out = $this->html5->saveHTML($doc); - $this->assertRegExp('|' . preg_quote($expected, '|') . '|', $out); } @@ -517,9 +514,9 @@ public function testTagOmission($input, $expected) */ public function tagOmissionProvider() { - return $provider = array( + return array( array( - 'Hello, This is a test.
Does it work this time?', + 'Hello, This is a test.
Does it work this time?', 'Hello, This is a test.
Does it work this time?', ), // test whitespace (\n) diff --git a/test/HTML5/Parser/DOMTreeBuilderTest.php b/test/HTML5/Parser/DOMTreeBuilderTest.php index 0183aed..9e9c003 100644 --- a/test/HTML5/Parser/DOMTreeBuilderTest.php +++ b/test/HTML5/Parser/DOMTreeBuilderTest.php @@ -6,9 +6,9 @@ namespace Masterminds\HTML5\Tests\Parser; +use Masterminds\HTML5\Parser\DOMTreeBuilder; use Masterminds\HTML5\Parser\Scanner; use Masterminds\HTML5\Parser\Tokenizer; -use Masterminds\HTML5\Parser\DOMTreeBuilder; /** * These tests are functional, not necessarily unit tests. @@ -462,12 +462,29 @@ public function testText() public function testTextBeforeHeadNotAllowed() { // The DomTreeBuilder has special handling for text when in before head mode. - $html = 'Foo'; + $html = 'Footest'; + $doc = $this->parse($html); + + $this->assertContains('Line 0, Col 0: Unexpected body tag outside of body context.', $this->errors); + $this->assertXmlStringEqualsXmlString($doc, 'Footest'); + } + + public function testHeadInBodyTriggersParseError() + { + $html = 'test'; $doc = $this->parse($html); - $this->assertEquals('Line 0, Col 0: Unexpected head tag outside of head context.', $this->errors[0]); - $headElement = $doc->documentElement->firstChild; - $this->assertEquals('head', $headElement->tagName); - $this->assertXmlStringEqualsXmlString($doc, 'Foo'); + + $this->assertContains('Line 0, Col 0: Unexpected head tag outside of head context.', $this->errors); + $this->assertXmlStringEqualsXmlString($doc, 'test'); + } + + public function testBodyInBodyTriggersParseError() + { + $html = 'testba
z'; + $doc = $this->parse($html); + + $this->assertContains('Line 0, Col 0: Unexpected body tag outside of body context.', $this->errors); + $this->assertXmlStringEqualsXmlString($doc, 'testba
z'); } public function testParseErrors() From 084c7ff238d470c61ad94f9eca227d8442d050cc Mon Sep 17 00:00:00 2001 From: Asmir Mustafic Date: Wed, 18 Jan 2023 07:38:08 +0100 Subject: [PATCH 4/4] cs --- .github/workflows/cs.yaml | 2 +- .php_cs.dist | 1 + src/HTML5.php | 1 - src/HTML5/Parser/DOMTreeBuilder.php | 2 -- src/HTML5/Parser/Tokenizer.php | 1 + src/HTML5/Parser/TreeBuildingRules.php | 1 - test/HTML5/Html5Test.php | 4 ++-- test/HTML5/Parser/ScannerTest.php | 2 +- test/HTML5/Parser/TokenizerTest.php | 2 +- test/HTML5/Parser/TreeBuildingRulesTest.php | 6 +++--- test/HTML5/Serializer/OutputRulesTest.php | 2 +- 11 files changed, 11 insertions(+), 13 deletions(-) diff --git a/.github/workflows/cs.yaml b/.github/workflows/cs.yaml index 4b7375d..bc6575c 100644 --- a/.github/workflows/cs.yaml +++ b/.github/workflows/cs.yaml @@ -28,5 +28,5 @@ jobs: - name: cs fix run: | - wget -q https://github.com/FriendsOfPHP/PHP-CS-Fixer/releases/download/v2.13.1/php-cs-fixer.phar + wget -q https://github.com/FriendsOfPHP/PHP-CS-Fixer/releases/download/v2.19.0/php-cs-fixer.phar php php-cs-fixer.phar fix --dry-run --diff diff --git a/.php_cs.dist b/.php_cs.dist index d5e4918..4c16946 100644 --- a/.php_cs.dist +++ b/.php_cs.dist @@ -9,6 +9,7 @@ return PhpCsFixer\Config::create() '@Symfony' => true, 'concat_space' => array('spacing' => 'one'), 'phpdoc_annotation_without_dot' => false, + 'array_syntax' => ['syntax' => 'long'], )) ->setFinder($finder) ; diff --git a/src/HTML5.php b/src/HTML5.php index c857145..49a90da 100644 --- a/src/HTML5.php +++ b/src/HTML5.php @@ -146,7 +146,6 @@ public function hasErrors() * Parse an input string. * * @param string $input - * @param array $options * * @return \DOMDocument */ diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php index b17a5aa..a718fe2 100644 --- a/src/HTML5/Parser/DOMTreeBuilder.php +++ b/src/HTML5/Parser/DOMTreeBuilder.php @@ -231,8 +231,6 @@ public function fragment() * * This is used for handling Processor Instructions as they are * inserted. If omitted, PI's are inserted directly into the DOM tree. - * - * @param InstructionProcessor $proc */ public function setInstructionProcessor(InstructionProcessor $proc) { diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php index 016919a..db3799d 100644 --- a/src/HTML5/Parser/Tokenizer.php +++ b/src/HTML5/Parser/Tokenizer.php @@ -726,6 +726,7 @@ protected function isCommentEnd() // Test for '!>' if ('!' == $this->scanner->current() && '>' == $this->scanner->peek()) { $this->scanner->consume(); // Consume the last '>' + return true; } // Unread '-' and one of '!' or '>'; diff --git a/src/HTML5/Parser/TreeBuildingRules.php b/src/HTML5/Parser/TreeBuildingRules.php index 00d3951..4c6983b 100644 --- a/src/HTML5/Parser/TreeBuildingRules.php +++ b/src/HTML5/Parser/TreeBuildingRules.php @@ -80,7 +80,6 @@ public function evaluate($new, $current) case 'thead': case 'tfoot': case 'table': // Spec isn't explicit about this, but it's necessary. - return $this->closeIfCurrentMatches($new, $current, array( 'thead', 'tfoot', diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php index c78a4b9..ccb7bf8 100644 --- a/test/HTML5/Html5Test.php +++ b/test/HTML5/Html5Test.php @@ -98,9 +98,9 @@ public function testEncodingUtf8() public function testEncodingWindows1252() { - $dom = $this->html5->load(__DIR__ . '/Fixtures/encoding/windows-1252.html', [ + $dom = $this->html5->load(__DIR__ . '/Fixtures/encoding/windows-1252.html', array( 'encoding' => 'Windows-1252', - ]); + )); $this->assertInstanceOf('\DOMDocument', $dom); $this->assertEmpty($this->html5->getErrors()); diff --git a/test/HTML5/Parser/ScannerTest.php b/test/HTML5/Parser/ScannerTest.php index 9f75c4d..a181c1b 100644 --- a/test/HTML5/Parser/ScannerTest.php +++ b/test/HTML5/Parser/ScannerTest.php @@ -6,8 +6,8 @@ namespace Masterminds\HTML5\Tests\Parser; -use Masterminds\HTML5\Parser\StringInputStream; use Masterminds\HTML5\Parser\Scanner; +use Masterminds\HTML5\Parser\StringInputStream; class ScannerTest extends \Masterminds\HTML5\Tests\TestCase { diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php index c9643c5..374896f 100644 --- a/test/HTML5/Parser/TokenizerTest.php +++ b/test/HTML5/Parser/TokenizerTest.php @@ -2,9 +2,9 @@ namespace Masterminds\HTML5\Tests\Parser; -use Masterminds\HTML5\Parser\UTF8Utils; use Masterminds\HTML5\Parser\Scanner; use Masterminds\HTML5\Parser\Tokenizer; +use Masterminds\HTML5\Parser\UTF8Utils; class TokenizerTest extends \Masterminds\HTML5\Tests\TestCase { diff --git a/test/HTML5/Parser/TreeBuildingRulesTest.php b/test/HTML5/Parser/TreeBuildingRulesTest.php index 45c68bc..ae17a34 100644 --- a/test/HTML5/Parser/TreeBuildingRulesTest.php +++ b/test/HTML5/Parser/TreeBuildingRulesTest.php @@ -6,10 +6,10 @@ namespace Masterminds\HTML5\Tests\Parser; -use Masterminds\HTML5\Parser\TreeBuildingRules; -use Masterminds\HTML5\Parser\Tokenizer; -use Masterminds\HTML5\Parser\Scanner; use Masterminds\HTML5\Parser\DOMTreeBuilder; +use Masterminds\HTML5\Parser\Scanner; +use Masterminds\HTML5\Parser\Tokenizer; +use Masterminds\HTML5\Parser\TreeBuildingRules; /** * These tests are functional, not necessarily unit tests. diff --git a/test/HTML5/Serializer/OutputRulesTest.php b/test/HTML5/Serializer/OutputRulesTest.php index 5741579..bb09853 100644 --- a/test/HTML5/Serializer/OutputRulesTest.php +++ b/test/HTML5/Serializer/OutputRulesTest.php @@ -2,9 +2,9 @@ namespace Masterminds\HTML5\Tests\Serializer; +use Masterminds\HTML5; use Masterminds\HTML5\Serializer\OutputRules; use Masterminds\HTML5\Serializer\Traverser; -use Masterminds\HTML5; class OutputRulesTest extends \Masterminds\HTML5\Tests\TestCase {