Skip to content

Commit 5ba2a82

Browse files
mateuszdebinskiMateusz Dębiński
andauthored
Merge commit from fork
Co-authored-by: Mateusz Dębiński <mateusz.debinski@ibexa.co>
1 parent acb0fa0 commit 5ba2a82

File tree

8 files changed

+419
-8
lines changed

8 files changed

+419
-8
lines changed

src/lib/RichText/XMLSanitizer.php

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
<?php
2+
3+
/**
4+
* @copyright Copyright (C) Ibexa AS. All rights reserved.
5+
* @license For full copyright and license information view LICENSE file distributed with this source code.
6+
*/
7+
declare(strict_types=1);
8+
9+
namespace Ibexa\FieldTypeRichText\RichText;
10+
11+
use DOMDocument;
12+
use DOMText;
13+
use DOMXPath;
14+
use RuntimeException;
15+
16+
/**
 * Sanitizes RichText XML both as a raw string and as a parsed DOMDocument.
 *
 * String-level sanitization ({@see XMLSanitizer::sanitizeXMLString()}) decodes XML entities,
 * strips comments and script-like elements, and rewrites the DOCTYPE internal subset so that
 * only plain internal entities which do not reference other entities remain.
 * DOM-level sanitization ({@see XMLSanitizer::convertCDATAToText()}) turns CDATA sections
 * into regular text nodes.
 *
 * @internal
 */
final class XMLSanitizer
{
    /**
     * Runs all string-level sanitization steps on the given XML string.
     *
     * Order matters: entities are decoded first so that encoded comments, tags and DOCTYPE
     * declarations are visible to the subsequent regular-expression based steps.
     *
     * @throws \RuntimeException if any of the underlying PCRE calls fails
     */
    public function sanitizeXMLString(string $xmlString): string
    {
        $xmlString = $this->decodeHTMLEntities($xmlString);
        $xmlString = $this->removeComments($xmlString);
        $xmlString = $this->removeDangerousTags($xmlString);
        $xmlString = $this->sanitizeDocType($xmlString);

        return $this->removeEmptyDocType($xmlString);
    }

    /**
     * Replaces every CDATA section of the document with a plain text node holding the same content.
     *
     * The document is modified in place and returned for convenience.
     */
    public function convertCDATAToText(DOMDocument $document): DOMDocument
    {
        $xpath = new DOMXPath($document);
        // text() also selects CDATA sections; they are told apart by node type below.
        $cdataNodes = $xpath->query('//text()[ancestor-or-self::node()]');
        if ($cdataNodes === false) {
            return $document;
        }

        foreach ($cdataNodes as $cdataNode) {
            if ($cdataNode->nodeType === XML_CDATA_SECTION_NODE && $cdataNode->parentNode !== null) {
                $cdataNode->parentNode->replaceChild(new DOMText($cdataNode->textContent), $cdataNode);
            }
        }

        return $document;
    }

    /**
     * Decodes XML 1.0 predefined and numeric entities found in the string.
     */
    private function decodeHTMLEntities(string $xmlString): string
    {
        return html_entity_decode($xmlString, ENT_XML1, 'UTF-8');
    }

    /**
     * Removes all XML comments, including multi-line ones.
     *
     * @throws \RuntimeException if the PCRE call fails
     */
    private function removeComments(string $xmlString): string
    {
        $xmlString = preg_replace('/<!--\s?.*?\s?-->/s', '', $xmlString);

        if ($xmlString === null) {
            $this->throwRuntimeException(__METHOD__);
        }

        return $xmlString;
    }

    /**
     * Removes paired script, iframe, object, embed and style elements together with their content.
     *
     * @throws \RuntimeException if the PCRE call fails
     */
    private function removeDangerousTags(string $xmlString): string
    {
        $xmlString = preg_replace('/<\s*(script|iframe|object|embed|style)[^>]*>.*?<\s*\/\s*\1\s*>/is', '', $xmlString);

        if ($xmlString === null) {
            $this->throwRuntimeException(__METHOD__);
        }

        return $xmlString;
    }

    /**
     * Rewrites the DOCTYPE declaration so that its internal subset contains only safe entities
     * and strips references to the removed entities from the whole string.
     *
     * @throws \RuntimeException if any of the PCRE calls fails
     */
    private function sanitizeDocType(string $xmlString): string
    {
        // NOTE(review): a DOCTYPE carrying an external identifier (SYSTEM/PUBLIC after the name)
        // does not match this pattern and is therefore returned untouched - confirm this is intended.
        $pattern = '/<\s*!DOCTYPE\s+(?<name>[^\s>]+)\s*(\[(?<entities>.*?)\]\s*)?>/is';

        if (!preg_match($pattern, $xmlString, $matches)) {
            return $xmlString;
        }

        $docTypeName = $matches['name'];
        // The internal subset is optional, so the named group may be absent.
        $entitiesBlock = $matches['entities'] ?? '';
        [$safeEntities, $removedEntities] = $this->filterEntitiesFromDocType($entitiesBlock);

        foreach ($removedEntities as $entity) {
            $xmlString = preg_replace('/&' . preg_quote($entity, '/') . ';/i', '', $xmlString);

            if ($xmlString === null) {
                $this->throwRuntimeException(__METHOD__);
            }
        }

        $safeDocType = sprintf('<!DOCTYPE %s [ %s ]>', $docTypeName, implode("\n", $safeEntities));

        // The rebuilt DOCTYPE contains user-supplied entity values. Passing it as a preg_replace()
        // replacement string would interpret sequences such as "$1" or "\1" as backreferences and
        // corrupt those values, so it is inserted literally through a callback instead.
        $xmlString = preg_replace_callback(
            $pattern,
            static function (array $match) use ($safeDocType): string {
                return $safeDocType;
            },
            $xmlString
        );

        if ($xmlString === null) {
            $this->throwRuntimeException(__METHOD__);
        }

        return $xmlString;
    }

    /**
     * Removes DOCTYPE declarations whose internal subset is empty (e.g. after sanitization).
     *
     * @throws \RuntimeException if the PCRE call fails
     */
    private function removeEmptyDocType(string $xmlString): string
    {
        $xmlString = preg_replace('/<\s*!DOCTYPE\s+[^\[\]>]*\[\s*\]>/is', '', $xmlString);

        if ($xmlString === null) {
            $this->throwRuntimeException(__METHOD__);
        }

        return $xmlString;
    }

    /**
     * Splits the DOCTYPE internal subset into entity declarations to keep and entity names to remove.
     *
     * The subset is processed line by line:
     *  - external entities (SYSTEM/PUBLIC) are marked for removal,
     *  - internal entities whose value references another entity are marked for removal,
     *  - lines which are not a double-quoted internal entity declaration are silently dropped,
     *  - remaining internal entity lines are kept verbatim.
     *
     * @return array<int, array<int, string>> a two-element list: [lines to keep, entity names to remove]
     */
    private function filterEntitiesFromDocType(string $entitiesBlock): array
    {
        $lines = explode("\n", $entitiesBlock);
        $safeEntities = [];
        $entitiesToRemove = [];
        $entityDefinitions = [];

        foreach ($lines as $line) {
            $line = trim($line);

            if (preg_match('/<!ENTITY\s+(\S+)\s+(SYSTEM|PUBLIC)\s+/i', $line, $matches)) {
                $entitiesToRemove[] = $matches[1];
                continue;
            }

            if (!preg_match('/<!ENTITY\s+(\S+)\s+"([^"]+)"/', $line, $matches)) {
                continue;
            }

            $entityName = $matches[1];
            $entityValue = $matches[2];
            $entityDefinitions[$entityName] = $entityValue;

            // Entities built from other entities are rejected (nested/recursive expansion).
            if (preg_match('/&\S+;/', $entityValue)) {
                $entitiesToRemove[] = $entityName;
                continue;
            }

            // NOTE(review): the whole source line is kept, so any other declaration sharing the line
            // with this entity is kept as well - confirm this is acceptable for untrusted input.
            $safeEntities[] = $line;
        }

        $entitiesToRemove = $this->resolveRecursiveEntities($entityDefinitions, $entitiesToRemove);
        $safeEntities = array_filter($safeEntities, function ($line) use ($entitiesToRemove) {
            return !$this->containsUnsafeEntity($line, $entitiesToRemove);
        });

        // array_filter() and array_unique() preserve keys; re-index so both results are real lists.
        return [array_values($safeEntities), array_values($entitiesToRemove)];
    }

    /**
     * Extends the removal list with entities whose value references an entity already marked for removal.
     *
     * @param array<int, string> $entitiesToRemove
     * @param array<string, string> $entityDefinitions entity value indexed by entity name
     *
     * @return array<int, string>
     */
    private function resolveRecursiveEntities(array $entityDefinitions, array $entitiesToRemove): array
    {
        foreach ($entityDefinitions as $name => $value) {
            foreach ($entitiesToRemove as $toRemove) {
                if (strpos($value, "&$toRemove;") !== false && !in_array($name, $entitiesToRemove, true)) {
                    $entitiesToRemove[] = $name;
                }
            }
        }

        return array_unique($entitiesToRemove);
    }

    /**
     * Tells whether the given declaration line contains the name of any entity marked for removal.
     *
     * This is a plain substring check against the whole line, not only against entity references.
     *
     * @param array<int, string> $entitiesToRemove
     */
    private function containsUnsafeEntity(string $line, array $entitiesToRemove): bool
    {
        foreach ($entitiesToRemove as $toRemove) {
            if (strpos($line, $toRemove) !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Reports a failed PCRE call made by the given method.
     *
     * @return never
     *
     * @throws \RuntimeException always
     */
    private function throwRuntimeException(string $functionName): void
    {
        throw new RuntimeException(
            sprintf('%s returned null for "$xmlString", error: %s', $functionName, preg_last_error_msg())
        );
    }
}

src/lib/eZ/RichText/DOMDocumentFactory.php

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,18 @@
1010

1111
use DOMDocument;
1212
use EzSystems\EzPlatformRichText\eZ\RichText\Exception\InvalidXmlException;
13+
use Ibexa\FieldTypeRichText\RichText\XMLSanitizer;
1314

1415
final class DOMDocumentFactory
1516
{
17+
/** @var \Ibexa\FieldTypeRichText\RichText\XMLSanitizer */
18+
private $xmlSanitizer;
19+
20+
public function __construct(XMLSanitizer $xmlSanitizer)
21+
{
22+
$this->xmlSanitizer = $xmlSanitizer;
23+
}
24+
1625
/**
1726
* Creates \DOMDocument from given $xmlString.
1827
*
@@ -33,11 +42,11 @@ public function loadXMLString(string $xmlString): DOMDocument
3342
// - substitute entities
3443
// - disable network access
3544
// - relax parser limits for document size/complexity
36-
$success = $document->loadXML($xmlString, LIBXML_NOENT | LIBXML_NONET | LIBXML_PARSEHUGE);
45+
$success = $document->loadXML($this->xmlSanitizer->sanitizeXMLString($xmlString), LIBXML_NOENT | LIBXML_DTDLOAD | LIBXML_NONET | LIBXML_PARSEHUGE);
3746
if (!$success) {
3847
throw new InvalidXmlException('$xmlString', libxml_get_errors());
3948
}
4049

41-
return $document;
50+
return $this->xmlSanitizer->convertCDATAToText($document);
4251
}
4352
}

src/lib/eZ/settings/fieldtype_services.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ services:
5252

5353
EzSystems\EzPlatformRichText\eZ\RichText\DOMDocumentFactory:
5454
public: false
55+
56+
Ibexa\FieldTypeRichText\RichText\XMLSanitizer:
57+
public: false
5558

5659
EzSystems\EzPlatformRichText\eZ\RichText\InputHandler:
5760
arguments:

tests/integration/eZ/SPI/RichTextFieldTypeIntegrationTest.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
use EzSystems\EzPlatformRichText\eZ\RichText;
1818
use EzSystems\EzPlatformRichText\eZ\FieldType\RichText\RichTextStorage\Gateway\DoctrineStorage;
1919
use EzSystems\EzPlatformRichText\eZ\Persistence\Legacy\RichTextFieldValueConverter;
20+
use Ibexa\FieldTypeRichText\RichText\XMLSanitizer;
2021

2122
/**
2223
* Integration test for legacy storage field types.
@@ -58,7 +59,7 @@ public function getTypeName()
5859
public function getCustomHandler()
5960
{
6061
$inputHandler = new RichText\InputHandler(
61-
new RichText\DOMDocumentFactory(),
62+
new RichText\DOMDocumentFactory(new XMLSanitizer()),
6263
new RichText\ConverterDispatcher([]),
6364
new RichText\Normalizer\Aggregate(),
6465
new RichText\Validator\ValidatorDispatcher([

tests/lib/Form/DataTransformer/RichTextTransformerTest.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
use EzSystems\EzPlatformRichText\eZ\RichText\Converter;
1616
use EzSystems\EzPlatformRichText\eZ\RichText\DOMDocumentFactory;
1717
use EzSystems\EzPlatformRichText\eZ\RichText\InputHandlerInterface;
18+
use Ibexa\FieldTypeRichText\RichText\XMLSanitizer;
1819
use EzSystems\EzPlatformRichText\Form\DataTransformer\RichTextTransformer;
1920
use PHPUnit\Framework\TestCase;
2021
use Symfony\Component\Form\Exception\TransformationFailedException;
@@ -37,7 +38,7 @@ protected function setUp(): void
3738

3839
$this->richTextTransformer = new RichTextTransformer(
3940
// DOMDocumentFactory is final
40-
new DOMDocumentFactory(),
41+
new DOMDocumentFactory(new XMLSanitizer()),
4142
$this->inputHandler,
4243
$this->docbook2xhtml5editConverter
4344
);

tests/lib/eZ/FieldType/RichTextTest.php

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
use EzSystems\EzPlatformRichText\eZ\RichText\RelationProcessor;
2727
use EzSystems\EzPlatformRichText\eZ\RichText\Validator\Validator;
2828
use EzSystems\EzPlatformRichText\eZ\RichText\Validator\ValidatorDispatcher;
29+
use Ibexa\FieldTypeRichText\RichText\XMLSanitizer;
2930
use PHPUnit\Framework\TestCase;
3031
use RuntimeException;
3132

@@ -41,7 +42,7 @@ class RichTextTest extends TestCase
4142
protected function getFieldType()
4243
{
4344
$inputHandler = new InputHandler(
44-
new DOMDocumentFactory(),
45+
new DOMDocumentFactory(new XMLSanitizer()),
4546
new ConverterDispatcher([
4647
'http://docbook.org/ns/docbook' => null,
4748
]),

0 commit comments

Comments
 (0)