Skip to content

Commit 791c8cf

Browse files
authored
Merge pull request #4019 from oleibman/issue3995
Html Reader Non-UTF8 Charsets
2 parents d38b7cb + bdcbc04 commit 791c8cf

14 files changed

+251
-14
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ and this project adheres to [Semantic Versioning](https://semver.org).
4646
- Default Value for Conditional::$text [PR #3946](https://github.com/PHPOffice/PhpSpreadsheet/pull/3946)
4747
- Table Filter Buttons [Issue #3988](https://github.com/PHPOffice/PhpSpreadsheet/issues/3988) [PR #3992](https://github.com/PHPOffice/PhpSpreadsheet/pull/3992)
4848
- Improvements to Xml Reader [Issue #3999](https://github.com/PHPOffice/PhpSpreadsheet/issues/3999) [Issue #4000](https://github.com/PHPOffice/PhpSpreadsheet/issues/4000) [Issue #4001](https://github.com/PHPOffice/PhpSpreadsheet/issues/4001) [Issue #4002](https://github.com/PHPOffice/PhpSpreadsheet/issues/4002) [PR #4003](https://github.com/PHPOffice/PhpSpreadsheet/pull/4003) [PR #4007](https://github.com/PHPOffice/PhpSpreadsheet/pull/4007)
49+
- Html Reader non-UTF8 [Issue #3995](https://github.com/PHPOffice/PhpSpreadsheet/issues/3995) [Issue #866](https://github.com/PHPOffice/PhpSpreadsheet/issues/866) [Issue #1681](https://github.com/PHPOffice/PhpSpreadsheet/issues/1681) [PR #4019](https://github.com/PHPOffice/PhpSpreadsheet/pull/4019)
4950

5051
## 2.0.0 - 2024-01-04
5152

docs/topics/reading-and-writing-to-file.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ versions of Microsoft Excel.
298298
**Excel 2003 XML limitations** Please note that Excel 2003 XML format
299299
has some limits regarding to styling cells and handling large
300300
spreadsheets via PHP.
301-
Also, only files using charset UTF-8 are supported.
301+
Also, only files using charset UTF-8 or ISO-8859-* are supported.
302302

303303
### \PhpOffice\PhpSpreadsheet\Reader\Xml
304304

@@ -718,7 +718,7 @@ extension.
718718

719719
**HTML limitations** Please note that HTML file format has some limits
720720
regarding to styling cells, number formatting, ...
721-
Also, only files using charset UTF-8 are supported.
721+
Declared charsets compatible with ASCII in range 00-7F, and UTF-8/16 with BOM are supported.
722722

723723
### \PhpOffice\PhpSpreadsheet\Reader\Html
724724

src/PhpSpreadsheet/Reader/Html.php

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ class Html extends BaseReader
3030
*/
3131
const TEST_SAMPLE_SIZE = 2048;
3232

33+
private const STARTS_WITH_BOM = '/^(?:\xfe\xff|\xff\xfe|\xEF\xBB\xBF)/';
34+
35+
private const DECLARES_CHARSET = '/ charset=/i';
36+
3337
/**
3438
* Input encoding.
3539
*/
@@ -144,6 +148,9 @@ public function canRead(string $filename): bool
144148
}
145149

146150
$beginning = $this->readBeginning();
151+
if (preg_match(self::STARTS_WITH_BOM, $beginning)) {
152+
return true;
153+
}
147154
$startWithTag = self::startsWithTag($beginning);
148155
$containsTags = self::containsTags($beginning);
149156
$endsWithTag = self::endsWithTag($this->readEnding());
@@ -638,12 +645,7 @@ public function loadIntoExisting(string $filename, Spreadsheet $spreadsheet): Sp
638645
// Reload the HTML file into the DOM object
639646
try {
640647
$convert = $this->getSecurityScannerOrThrow()->scanFile($filename);
641-
$lowend = "\u{80}";
642-
$highend = "\u{10ffff}";
643-
$regexp = "/[$lowend-$highend]/u";
644-
/** @var callable $callback */
645-
$callback = [self::class, 'replaceNonAscii'];
646-
$convert = preg_replace_callback($regexp, $callback, $convert);
648+
$convert = self::replaceNonAsciiIfNeeded($convert);
647649
$loaded = ($convert === null) ? false : $dom->loadHTML($convert);
648650
} catch (Throwable $e) {
649651
$loaded = false;
@@ -736,6 +738,20 @@ private static function replaceNonAscii(array $matches): string
736738
return '&#' . mb_ord($matches[0], 'UTF-8') . ';';
737739
}
738740

741+
private static function replaceNonAsciiIfNeeded(string $convert): ?string
742+
{
743+
if (preg_match(self::STARTS_WITH_BOM, $convert) !== 1 && preg_match(self::DECLARES_CHARSET, $convert) !== 1) {
744+
$lowend = "\u{80}";
745+
$highend = "\u{10ffff}";
746+
$regexp = "/[$lowend-$highend]/u";
747+
/** @var callable $callback */
748+
$callback = [self::class, 'replaceNonAscii'];
749+
$convert = preg_replace_callback($regexp, $callback, $convert);
750+
}
751+
752+
return $convert;
753+
}
754+
739755
/**
740756
* Spreadsheet from content.
741757
*/
@@ -747,12 +763,7 @@ public function loadFromString(string $content, ?Spreadsheet $spreadsheet = null
747763
// Reload the HTML file into the DOM object
748764
try {
749765
$convert = $this->getSecurityScannerOrThrow()->scan($content);
750-
$lowend = "\u{80}";
751-
$highend = "\u{10ffff}";
752-
$regexp = "/[$lowend-$highend]/u";
753-
/** @var callable $callback */
754-
$callback = [self::class, 'replaceNonAscii'];
755-
$convert = preg_replace_callback($regexp, $callback, $convert);
766+
$convert = self::replaceNonAsciiIfNeeded($convert);
756767
$loaded = ($convert === null) ? false : $dom->loadHTML($convert);
757768
} catch (Throwable $e) {
758769
$loaded = false;
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpOffice\PhpSpreadsheetTests\Reader\Html;
6+
7+
use PhpOffice\PhpSpreadsheet\Reader\Exception as ReaderException;
8+
use PhpOffice\PhpSpreadsheet\Reader\Html;
9+
use PHPUnit\Framework\TestCase;
10+
11+
class HtmlCharsetTest extends TestCase
12+
{
13+
/**
14+
* @dataProvider providerCharset
15+
*/
16+
public function testCharset(string $filename, string $expectedResult): void
17+
{
18+
if ($expectedResult === 'exception') {
19+
$this->expectException(ReaderException::class);
20+
$this->expectExceptionMessage('Failed to load');
21+
}
22+
$directory = 'tests/data/Reader/HTML';
23+
$reader = new Html();
24+
$spreadsheet = $reader->load("$directory/$filename");
25+
$sheet = $spreadsheet->getActiveSheet();
26+
self::assertSame($expectedResult, $sheet->getCell('A1')->getValue());
27+
$spreadsheet->disconnectWorksheets();
28+
}
29+
30+
public static function providerCharset(): array
31+
{
32+
return [
33+
['charset.ISO-8859-1.html', 'À1'],
34+
['charset.ISO-8859-1.html4.html', 'À1'],
35+
['charset.ISO-8859-2.html', 'Ŕ1'],
36+
['charset.nocharset.html', 'À1'],
37+
['charset.UTF-8.html', 'À1'],
38+
['charset.UTF-8.bom.html', 'À1'],
39+
['charset.UTF-16.bebom.html', 'À1'],
40+
['charset.UTF-16.lebom.html', 'À1'],
41+
['charset.gb18030.html', '电视机'],
42+
['charset.unknown.html', 'exception'],
43+
];
44+
}
45+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!DOCTYPE html>
2+
<html lang='en'>
3+
<head>
4+
<meta charset='ISO-8859-1'>
5+
<title>ISO-8859-1</title>
6+
</head>
7+
<body>
8+
<table>
9+
<tbody>
10+
<tr>
11+
<td>À1</td>
12+
<td>B1</td>
13+
<td>ç1</td>
14+
<td>D1</td>
15+
</tr>
16+
<tr>
17+
<td>Ã2</td>
18+
<td>B2</td>
19+
<td>C2</td>
20+
<td>Ð2</td>
21+
</tr>
22+
</tbody>
23+
</table>
24+
</body>
25+
</html>
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
2+
<html lang='en'>
3+
<head>
4+
<meta http-equiv="Content-Type" content="text/html; CHARSET=ISO-8859-1">
5+
<title>ISO-8859-1 Html4 Doctype and Meta</title>
6+
</head>
7+
<body>
8+
<table>
9+
<tbody>
10+
<tr>
11+
<td>À1</td>
12+
<td>B1</td>
13+
<td>ç1</td>
14+
<td>D1</td>
15+
</tr>
16+
<tr>
17+
<td>Ã2</td>
18+
<td>B2</td>
19+
<td>C2</td>
20+
<td>Ð2</td>
21+
</tr>
22+
</tbody>
23+
</table>
24+
</body>
25+
</html>
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
<!DOCTYPE html>
2+
<html lang='en'>
3+
<head>
4+
<meta charset='ISO-8859-2'>
5+
<title>ISO-8859-2</title>
6+
</head>
7+
<body>
8+
<table>
9+
<tbody>
10+
<tr>
11+
<td>À1</td>
12+
<td>B1</td>
13+
<td>ç1</td>
14+
<td>D1</td>
15+
</tr>
16+
<tr>
17+
<td>Ã2</td>
18+
<td>B2</td>
19+
<td>C2</td>
20+
<td>Ð2</td>
21+
</tr>
22+
</tbody>
23+
</table>
24+
</body>
25+
</html>
860 Bytes
Binary file not shown.
860 Bytes
Binary file not shown.
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
 <!DOCTYPE html>
2+
<html lang='en'>
3+
<head>
4+
<title>UTF-8</title>
5+
</head>
6+
<body>
7+
<table>
8+
<tbody>
9+
<tr>
10+
<td>À1</td>
11+
<td>B1</td>
12+
<td>ç1</td>
13+
<td>D1</td>
14+
</tr>
15+
<tr>
16+
<td>Ã2</td>
17+
<td>B2</td>
18+
<td>C2</td>
19+
<td>Ð2</td>
20+
</tr>
21+
</tbody>
22+
</table>
23+
</body>
24+
</html>

0 commit comments

Comments
 (0)