Skip to content

Commit 2833ddc

Browse files
committed
Csv Reader Allow Use of mimetype=text/html Files Without Extension
Fix #4036. The issue was originally reported as #564 (and #811) and was fixed for the most part, but this is a variation that the original fix did not cover. Cells containing HTML fragments can cause `mime_content_type` to identify the file as `text/html`. The original fix was to ignore the result of `mime_content_type` when the file extension is 'csv' or 'tsv'. However, if the file does not have one of those extensions, Csv Reader will reject it as having an invalid mimetype. This PR adds `text/html` to the list of valid mimetypes. I imagine that this type of problem might occur for other mimetypes. If any of those are reported in the future, it might be better to just add a "suppress mimetype check" option, rather than extending the list forever. Html is unusual in that its rules are so lax, which is why it seems appropriate to add it here. Note that IOFactory may still identify a file as Html even when it is intended as Csv. The sample associated with this issue does not fall into this category, but one of the unit tests on this ticket does. Such a file will still be read correctly by Csv Reader, but an IOFactory load may use Html Reader for it instead.
1 parent 2ed696f commit 2833ddc

File tree

4 files changed

+74
-1
lines changed

4 files changed

+74
-1
lines changed

src/PhpSpreadsheet/Reader/Csv.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -566,6 +566,7 @@ public function canRead(string $filename): bool
566566
'text/csv',
567567
'text/plain',
568568
'inode/x-empty',
569+
'text/html',
569570
];
570571

571572
return in_array($type, $supportedTypes, true);

src/PhpSpreadsheet/Writer/ZipStream3.php

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22

33
namespace PhpOffice\PhpSpreadsheet\Writer;
44

5-
use ZipStream\Option\Archive;
65
use ZipStream\ZipStream;
76

87
class ZipStream3

tests/PhpSpreadsheetTests/IOFactoryTest.php

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ public static function providerIdentify(): array
103103
//['samples/templates/Excel2003XMLTest.xml', 'Xml', Reader\Xml::class],
104104
['samples/templates/46readHtml.html', 'Html', Reader\Html::class],
105105
['tests/data/Reader/CSV/encoding.utf8bom.csv', 'Csv', Reader\Csv::class],
106+
['tests/data/Reader/HTML/charset.UTF-16.lebom.html', 'Html', Reader\Html::class],
107+
['tests/data/Reader/HTML/charset.UTF-8.bom.html', 'Html', Reader\Html::class],
106108
];
107109
}
108110

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
<?php
2+
3+
declare(strict_types=1);
4+
5+
namespace PhpOffice\PhpSpreadsheetTests\Reader\Csv;
6+
7+
use PhpOffice\PhpSpreadsheet\IOFactory;
8+
use PhpOffice\PhpSpreadsheet\Reader\Csv as CsvReader;
9+
use PhpOffice\PhpSpreadsheet\Shared\File;
10+
use PHPUnit\Framework\TestCase;
11+
12+
/**
 * Exercises Csv Reader against extension-less temporary files whose content
 * makes mime_content_type() report text/html.
 */
class NotHtmlTest extends TestCase
{
    private string $tempFile = '';

    /**
     * Removes the temporary file written by the test that just ran, if any.
     */
    protected function tearDown(): void
    {
        if ($this->tempFile === '') {
            return;
        }

        unlink($this->tempFile);
        $this->tempFile = '';
    }

    public function testHtmlCantRead(): void
    {
        // IOFactory identifies this file as Csv, so it can be
        // read through either Csv Reader or IOFactory.
        $cells = [
            ['1', '<a href="http://example.com">example</a>', '3'],
            ['4', '5', '6'],
        ];
        $path = $this->writeCells($cells);

        self::assertSame('text/html', mime_content_type($path));
        self::assertSame('Csv', IOFactory::identify($path));
        $this->assertCsvReaderYields($cells, $path);
    }

    public function testHtmlCanRead(): void
    {
        // IOFactory identifies this file as Html, so it has to be
        // read through Csv Reader directly rather than IOFactory.
        $cells = [
            ['<a href="http://example.com">example</a>', '<div>hello', '3'],
            ['4', '5', '</div>'],
        ];
        $path = $this->writeCells($cells);

        self::assertSame('text/html', mime_content_type($path));
        self::assertSame('Html', IOFactory::identify($path));
        $this->assertCsvReaderYields($cells, $path);
    }

    /**
     * Writes each row as one comma-joined, newline-terminated line (no quoting)
     * to a fresh temporary file and records that file for tearDown().
     *
     * @param string[][] $cells rows of three cell values each
     *
     * @return string path of the file that was written
     */
    private function writeCells(array $cells): string
    {
        // Record the name before opening so tearDown() cleans up even on failure.
        $path = File::temporaryFilename();
        $this->tempFile = $path;

        $stream = fopen($path, 'wb');
        self::assertNotFalse($stream);
        foreach ($cells as $cellRow) {
            fwrite($stream, implode(',', $cellRow) . "\n");
        }
        fclose($stream);

        return $path;
    }

    /**
     * Loads the file with Csv Reader and asserts that the active sheet's
     * toArray() output is identical to the expected cells.
     *
     * @param string[][] $expected
     */
    private function assertCsvReaderYields(array $expected, string $path): void
    {
        $csvReader = new CsvReader();
        $workbook = $csvReader->load($path);

        self::assertSame($expected, $workbook->getActiveSheet()->toArray());
        $workbook->disconnectWorksheets();
    }
}

0 commit comments

Comments
 (0)