From 4d14e8c001a53f6574a9d46b906df569ca1d1d8a Mon Sep 17 00:00:00 2001
From: Kieran Brahney
Date: Thu, 21 May 2020 18:36:33 +0100
Subject: [PATCH 1/4] Fixed #166
---
src/HTML5/Parser/DOMTreeBuilder.php | 25 +++++++++-------
test/HTML5/Html5Test.php | 46 +++++++++++++++++++++++++++++
2 files changed, 61 insertions(+), 10 deletions(-)
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index 293d83e..b04356e 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -322,6 +322,12 @@ public function startTag($name, $attributes = array(), $selfClosing = false)
break;
}
+ // Case when no exists, note section on 'Anything else' below.
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-after-head-insertion-mode
+ if ($this->insertMode === static::IM_AFTER_HEAD && 'head' !== $name && 'body' !== $name) {
+ $this->startTag('body');
+ }
+
// Special case handling for SVG.
if ($this->insertMode === static::IM_IN_SVG) {
$lname = Elements::normalizeSvgElement($lname);
@@ -556,21 +562,20 @@ public function comment($cdata)
public function text($data)
{
- // XXX: Hmmm.... should we really be this strict?
+ // https://html.spec.whatwg.org/multipage/parsing.html#the-before-head-insertion-mode
if ($this->insertMode < static::IM_IN_HEAD) {
// Per '8.2.5.4.3 The "before head" insertion mode' the characters
- // " \t\n\r\f" should be ignored but no mention of a parse error. This is
- // practical as most documents contain these characters. Other text is not
- // expected here so recording a parse error is necessary.
+ // " \t\n\r\f" should be ignored .
$dataTmp = trim($data, " \t\n\r\f");
- if (!empty($dataTmp)) {
- // fprintf(STDOUT, "Unexpected insert mode: %d", $this->insertMode);
- $this->parseError('Unexpected text. Ignoring: ' . $dataTmp);
+ if (! empty($dataTmp)) {
+ $this->startTag('head');
+ $this->endTag('head');
+ $this->startTag('body');
+ } else {
+ return;
}
-
- return;
}
- // fprintf(STDOUT, "Appending text %s.", $data);
+
$node = $this->doc->createTextNode($data);
$this->current->appendChild($node);
}
diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php
index 1887a8d..86ccff5 100644
--- a/test/HTML5/Html5Test.php
+++ b/test/HTML5/Html5Test.php
@@ -492,4 +492,50 @@ public function testAnchorTargetQueryParam()
$res
);
}
+
+ /**
+ * Test for issue #166.
+ *
+ * @param $input
+ * @param $expected
+ *
+ * @dataProvider tagOmissionProvider
+ */
+ public function testTagOmission($input, $expected)
+ {
+ $doc = $this->html5->loadHTML($input);
+
+ $out = $this->html5->saveHTML($doc);
+
+ $this->assertRegExp("|" . preg_quote($expected, "|") . "|", $out);
+ }
+
+ /**
+ * Tag omission test cases.
+ *
+ * @return \string[][]
+ */
+ public function tagOmissionProvider()
+ {
+ return $provider = array(
+ array(
+ 'Hello, This is a test.
Does it work this time?',
+ 'Hello, This is a test.
Does it work this time?',
+ ),
+ // test whitespace (\n)
+ array(
+ '
+
+
+
+
+
+',
+ '
+
+
+'
+ ),
+ );
+ }
}
From 585278e14a108432f648b0d48144f31d09d166e8 Mon Sep 17 00:00:00 2001
From: Asmir Mustafic
Date: Wed, 11 Jan 2023 21:04:34 +0100
Subject: [PATCH 2/4] split test
---
src/HTML5/Parser/DOMTreeBuilder.php | 2 +-
test/HTML5/Html5Test.php | 4 ++--
test/HTML5/Parser/DOMTreeBuilderTest.php | 9 ++++++---
3 files changed, 9 insertions(+), 6 deletions(-)
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index b04356e..8cd4831 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -567,7 +567,7 @@ public function text($data)
// Per '8.2.5.4.3 The "before head" insertion mode' the characters
// " \t\n\r\f" should be ignored .
$dataTmp = trim($data, " \t\n\r\f");
- if (! empty($dataTmp)) {
+ if (!empty($dataTmp)) {
$this->startTag('head');
$this->endTag('head');
$this->startTag('body');
diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php
index 86ccff5..6311941 100644
--- a/test/HTML5/Html5Test.php
+++ b/test/HTML5/Html5Test.php
@@ -507,7 +507,7 @@ public function testTagOmission($input, $expected)
$out = $this->html5->saveHTML($doc);
- $this->assertRegExp("|" . preg_quote($expected, "|") . "|", $out);
+ $this->assertRegExp('|' . preg_quote($expected, '|') . '|', $out);
}
/**
@@ -534,7 +534,7 @@ public function tagOmissionProvider()
'
-'
+',
),
);
}
diff --git a/test/HTML5/Parser/DOMTreeBuilderTest.php b/test/HTML5/Parser/DOMTreeBuilderTest.php
index 659378c..0183aed 100644
--- a/test/HTML5/Parser/DOMTreeBuilderTest.php
+++ b/test/HTML5/Parser/DOMTreeBuilderTest.php
@@ -457,14 +457,17 @@ public function testText()
$data = $wrapper->childNodes->item(0);
$this->assertEquals(XML_TEXT_NODE, $data->nodeType);
$this->assertEquals('test', $data->data);
+ }
+ public function testTextBeforeHeadNotAllowed()
+ {
// The DomTreeBuilder has special handling for text when in before head mode.
- $html = '
- Foo';
+ $html = 'Foo';
$doc = $this->parse($html);
- $this->assertEquals('Line 0, Col 0: Unexpected text. Ignoring: Foo', $this->errors[0]);
+ $this->assertEquals('Line 0, Col 0: Unexpected head tag outside of head context.', $this->errors[0]);
$headElement = $doc->documentElement->firstChild;
$this->assertEquals('head', $headElement->tagName);
+ $this->assertXmlStringEqualsXmlString($doc, 'Foo
test');
+ }
+
+ public function testBodyInBodyTriggersParseError()
+ {
+ $html = 'testba
z';
+ $doc = $this->parse($html);
+
+ $this->assertContains('Line 0, Col 0: Unexpected body tag outside of body context.', $this->errors);
+ $this->assertXmlStringEqualsXmlString($doc, 'testba
z');
}
public function testParseErrors()
From 084c7ff238d470c61ad94f9eca227d8442d050cc Mon Sep 17 00:00:00 2001
From: Asmir Mustafic
Date: Wed, 18 Jan 2023 07:38:08 +0100
Subject: [PATCH 4/4] cs
---
.github/workflows/cs.yaml | 2 +-
.php_cs.dist | 1 +
src/HTML5.php | 1 -
src/HTML5/Parser/DOMTreeBuilder.php | 2 --
src/HTML5/Parser/Tokenizer.php | 1 +
src/HTML5/Parser/TreeBuildingRules.php | 1 -
test/HTML5/Html5Test.php | 4 ++--
test/HTML5/Parser/ScannerTest.php | 2 +-
test/HTML5/Parser/TokenizerTest.php | 2 +-
test/HTML5/Parser/TreeBuildingRulesTest.php | 6 +++---
test/HTML5/Serializer/OutputRulesTest.php | 2 +-
11 files changed, 11 insertions(+), 13 deletions(-)
diff --git a/.github/workflows/cs.yaml b/.github/workflows/cs.yaml
index 4b7375d..bc6575c 100644
--- a/.github/workflows/cs.yaml
+++ b/.github/workflows/cs.yaml
@@ -28,5 +28,5 @@ jobs:
- name: cs fix
run: |
- wget -q https://github.com/FriendsOfPHP/PHP-CS-Fixer/releases/download/v2.13.1/php-cs-fixer.phar
+ wget -q https://github.com/FriendsOfPHP/PHP-CS-Fixer/releases/download/v2.19.0/php-cs-fixer.phar
php php-cs-fixer.phar fix --dry-run --diff
diff --git a/.php_cs.dist b/.php_cs.dist
index d5e4918..4c16946 100644
--- a/.php_cs.dist
+++ b/.php_cs.dist
@@ -9,6 +9,7 @@ return PhpCsFixer\Config::create()
'@Symfony' => true,
'concat_space' => array('spacing' => 'one'),
'phpdoc_annotation_without_dot' => false,
+ 'array_syntax' => ['syntax' => 'long'],
))
->setFinder($finder)
;
diff --git a/src/HTML5.php b/src/HTML5.php
index c857145..49a90da 100644
--- a/src/HTML5.php
+++ b/src/HTML5.php
@@ -146,7 +146,6 @@ public function hasErrors()
* Parse an input string.
*
* @param string $input
- * @param array $options
*
* @return \DOMDocument
*/
diff --git a/src/HTML5/Parser/DOMTreeBuilder.php b/src/HTML5/Parser/DOMTreeBuilder.php
index b17a5aa..a718fe2 100644
--- a/src/HTML5/Parser/DOMTreeBuilder.php
+++ b/src/HTML5/Parser/DOMTreeBuilder.php
@@ -231,8 +231,6 @@ public function fragment()
*
* This is used for handling Processor Instructions as they are
* inserted. If omitted, PI's are inserted directly into the DOM tree.
- *
- * @param InstructionProcessor $proc
*/
public function setInstructionProcessor(InstructionProcessor $proc)
{
diff --git a/src/HTML5/Parser/Tokenizer.php b/src/HTML5/Parser/Tokenizer.php
index 016919a..db3799d 100644
--- a/src/HTML5/Parser/Tokenizer.php
+++ b/src/HTML5/Parser/Tokenizer.php
@@ -726,6 +726,7 @@ protected function isCommentEnd()
// Test for '!>'
if ('!' == $this->scanner->current() && '>' == $this->scanner->peek()) {
$this->scanner->consume(); // Consume the last '>'
+
return true;
}
// Unread '-' and one of '!' or '>';
diff --git a/src/HTML5/Parser/TreeBuildingRules.php b/src/HTML5/Parser/TreeBuildingRules.php
index 00d3951..4c6983b 100644
--- a/src/HTML5/Parser/TreeBuildingRules.php
+++ b/src/HTML5/Parser/TreeBuildingRules.php
@@ -80,7 +80,6 @@ public function evaluate($new, $current)
case 'thead':
case 'tfoot':
case 'table': // Spec isn't explicit about this, but it's necessary.
-
return $this->closeIfCurrentMatches($new, $current, array(
'thead',
'tfoot',
diff --git a/test/HTML5/Html5Test.php b/test/HTML5/Html5Test.php
index c78a4b9..ccb7bf8 100644
--- a/test/HTML5/Html5Test.php
+++ b/test/HTML5/Html5Test.php
@@ -98,9 +98,9 @@ public function testEncodingUtf8()
public function testEncodingWindows1252()
{
- $dom = $this->html5->load(__DIR__ . '/Fixtures/encoding/windows-1252.html', [
+ $dom = $this->html5->load(__DIR__ . '/Fixtures/encoding/windows-1252.html', array(
'encoding' => 'Windows-1252',
- ]);
+ ));
$this->assertInstanceOf('\DOMDocument', $dom);
$this->assertEmpty($this->html5->getErrors());
diff --git a/test/HTML5/Parser/ScannerTest.php b/test/HTML5/Parser/ScannerTest.php
index 9f75c4d..a181c1b 100644
--- a/test/HTML5/Parser/ScannerTest.php
+++ b/test/HTML5/Parser/ScannerTest.php
@@ -6,8 +6,8 @@
namespace Masterminds\HTML5\Tests\Parser;
-use Masterminds\HTML5\Parser\StringInputStream;
use Masterminds\HTML5\Parser\Scanner;
+use Masterminds\HTML5\Parser\StringInputStream;
class ScannerTest extends \Masterminds\HTML5\Tests\TestCase
{
diff --git a/test/HTML5/Parser/TokenizerTest.php b/test/HTML5/Parser/TokenizerTest.php
index c9643c5..374896f 100644
--- a/test/HTML5/Parser/TokenizerTest.php
+++ b/test/HTML5/Parser/TokenizerTest.php
@@ -2,9 +2,9 @@
namespace Masterminds\HTML5\Tests\Parser;
-use Masterminds\HTML5\Parser\UTF8Utils;
use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\Tokenizer;
+use Masterminds\HTML5\Parser\UTF8Utils;
class TokenizerTest extends \Masterminds\HTML5\Tests\TestCase
{
diff --git a/test/HTML5/Parser/TreeBuildingRulesTest.php b/test/HTML5/Parser/TreeBuildingRulesTest.php
index 45c68bc..ae17a34 100644
--- a/test/HTML5/Parser/TreeBuildingRulesTest.php
+++ b/test/HTML5/Parser/TreeBuildingRulesTest.php
@@ -6,10 +6,10 @@
namespace Masterminds\HTML5\Tests\Parser;
-use Masterminds\HTML5\Parser\TreeBuildingRules;
-use Masterminds\HTML5\Parser\Tokenizer;
-use Masterminds\HTML5\Parser\Scanner;
use Masterminds\HTML5\Parser\DOMTreeBuilder;
+use Masterminds\HTML5\Parser\Scanner;
+use Masterminds\HTML5\Parser\Tokenizer;
+use Masterminds\HTML5\Parser\TreeBuildingRules;
/**
* These tests are functional, not necessarily unit tests.
diff --git a/test/HTML5/Serializer/OutputRulesTest.php b/test/HTML5/Serializer/OutputRulesTest.php
index 5741579..bb09853 100644
--- a/test/HTML5/Serializer/OutputRulesTest.php
+++ b/test/HTML5/Serializer/OutputRulesTest.php
@@ -2,9 +2,9 @@
namespace Masterminds\HTML5\Tests\Serializer;
+use Masterminds\HTML5;
use Masterminds\HTML5\Serializer\OutputRules;
use Masterminds\HTML5\Serializer\Traverser;
-use Masterminds\HTML5;
class OutputRulesTest extends \Masterminds\HTML5\Tests\TestCase
{