Skip to content

Commit 9420492

Browse files
feat: allow mime_type to be guessed for ByteStream (#9573)
* feat(bytestream): add guess_mime_type parameter
* refactor(FileTypeRouter): refactor guess mimetype
* feat(bytestream): add guess_mime_type to util
* style(ruff): add trailing whitespace
* fix: fix type annotation
* test(file_type_router): add test for additional_mimetypes param
* fix(file_type_router): non-existent file behavior
* feat(file_type_router): add release notes
* fix(file_type_router): remove unused logger
* style: fix ruff formatting magic values
* test(bytestream): handle windows/unix mimetype differences

---------

Co-authored-by: Julian Risch <julian.risch@deepset.ai>
1 parent b9fa706 commit 9420492

File tree

7 files changed

+177
-27
lines changed

7 files changed

+177
-27
lines changed

haystack/components/converters/utils.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,20 +8,22 @@
88
from haystack.dataclasses import ByteStream
99

1010

11-
def get_bytestream_from_source(source: Union[str, Path, ByteStream]) -> ByteStream:
11+
def get_bytestream_from_source(source: Union[str, Path, ByteStream], guess_mime_type: bool = False) -> ByteStream:
1212
"""
1313
Creates a ByteStream object from a source.
1414
1515
:param source:
1616
A source to convert to a ByteStream. Can be a string (path to a file), a Path object, or a ByteStream.
17+
:param guess_mime_type:
18+
Whether to guess the mime type from the file path (its extension) when the source is a string or Path.
1719
:return:
1820
A ByteStream object.
1921
"""
2022

2123
if isinstance(source, ByteStream):
2224
return source
2325
if isinstance(source, (str, Path)):
24-
bs = ByteStream.from_file_path(Path(source))
26+
bs = ByteStream.from_file_path(Path(source), guess_mime_type=guess_mime_type)
2527
bs.meta["file_path"] = str(source)
2628
return bs
2729
raise ValueError(f"Unsupported source type {type(source)}")

haystack/components/routers/file_type_router.py

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,6 @@
1212
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
1313
from haystack.dataclasses import ByteStream
1414

15-
CUSTOM_MIMETYPES = {
16-
# we add markdown because it is not added by the mimetypes module
17-
# see https://github.com/python/cpython/pull/17995
18-
".md": "text/markdown",
19-
".markdown": "text/markdown",
20-
# we add msg because it is not added by the mimetypes module
21-
".msg": "application/vnd.ms-outlook",
22-
}
23-
2415

2516
@component
2617
class FileTypeRouter:
@@ -149,7 +140,7 @@ def run(
149140
source = Path(source)
150141

151142
if isinstance(source, Path):
152-
mime_type = self._get_mime_type(source)
143+
mime_type = ByteStream._guess_mime_type(source)
153144
elif isinstance(source, ByteStream):
154145
mime_type = source.mime_type
155146
else:
@@ -171,16 +162,3 @@ def run(
171162
mime_types["unclassified"].append(source)
172163

173164
return dict(mime_types)
174-
175-
def _get_mime_type(self, path: Path) -> Optional[str]:
176-
"""
177-
Get the MIME type of the provided file path.
178-
179-
:param path: The file path to get the MIME type for.
180-
181-
:returns: The MIME type of the provided file path, or `None` if the MIME type cannot be determined.
182-
"""
183-
extension = path.suffix.lower()
184-
mime_type = mimetypes.guess_type(path.as_posix())[0]
185-
# lookup custom mappings if the mime type is not found
186-
return CUSTOM_MIMETYPES.get(extension, mime_type)

haystack/dataclasses/byte_stream.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
import mimetypes
56
from dataclasses import dataclass, field
67
from pathlib import Path
78
from typing import Any, Dict, Optional
@@ -32,15 +33,22 @@ def to_file(self, destination_path: Path) -> None:
3233

3334
@classmethod
3435
def from_file_path(
35-
cls, filepath: Path, mime_type: Optional[str] = None, meta: Optional[Dict[str, Any]] = None
36+
cls,
37+
filepath: Path,
38+
mime_type: Optional[str] = None,
39+
meta: Optional[Dict[str, Any]] = None,
40+
guess_mime_type: bool = False,
3641
) -> "ByteStream":
3742
"""
3843
Create a ByteStream from the contents read from a file.
3944
4045
:param filepath: A valid path to a file.
4146
:param mime_type: The mime type of the file.
4247
:param meta: Additional metadata to be stored with the ByteStream.
48+
:param guess_mime_type: Whether to guess the mime type from the file path (its extension) when `mime_type` is not provided.
4349
"""
50+
if not mime_type and guess_mime_type:
51+
mime_type = cls._guess_mime_type(filepath)
4452
with open(filepath, "rb") as fd:
4553
return cls(data=fd.read(), mime_type=mime_type, meta=meta or {})
4654

@@ -100,3 +108,26 @@ def from_dict(cls, data: Dict[str, Any]) -> "ByteStream":
100108
:returns: A ByteStream instance.
101109
"""
102110
return ByteStream(data=bytes(data["data"]), meta=data.get("meta", {}), mime_type=data.get("mime_type"))
111+
112+
@staticmethod
113+
def _guess_mime_type(path: Path) -> Optional[str]:
114+
"""
115+
Guess the MIME type of the provided file path.
116+
117+
:param path: The file path to get the MIME type for.
118+
119+
:returns: The MIME type of the provided file path, or `None` if the MIME type cannot be determined.
120+
"""
121+
custom_mimetypes = {
122+
# we add markdown because it is not added by the mimetypes module
123+
# see https://github.com/python/cpython/pull/17995
124+
".md": "text/markdown",
125+
".markdown": "text/markdown",
126+
# we add msg because it is not added by the mimetypes module
127+
".msg": "application/vnd.ms-outlook",
128+
}
129+
130+
extension = path.suffix.lower()
131+
mime_type = mimetypes.guess_type(path.as_posix())[0]
132+
# custom mappings take precedence; otherwise fall back to the type guessed by the mimetypes module
133+
return custom_mimetypes.get(extension, mime_type)
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
enhancements:
3+
- |
4+
Add `guess_mime_type` parameter to `ByteStream.from_file_path()`

test/components/converters/test_utils.py

Lines changed: 39 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
import pytest
66

7-
from haystack.components.converters.utils import normalize_metadata
7+
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
8+
from haystack.dataclasses import ByteStream
89

910

1011
def test_normalize_metadata_None():
@@ -32,3 +33,40 @@ def test_normalize_metadata_list_of_wrong_size():
3233
def test_normalize_metadata_other_type():
3334
with pytest.raises(ValueError, match="meta must be either None, a dictionary or a list of dictionaries."):
3435
normalize_metadata(({"a": 1},), sources_count=1)
36+
37+
38+
def test_get_bytestream_from_path_object(tmp_path):
39+
bytes_ = b"hello world"
40+
source = tmp_path / "test.txt"
41+
source.write_bytes(bytes_)
42+
43+
bs = get_bytestream_from_source(source, guess_mime_type=True)
44+
45+
assert isinstance(bs, ByteStream)
46+
assert bs.data == bytes_
47+
assert bs.mime_type == "text/plain"
48+
assert bs.meta["file_path"].endswith("test.txt")
49+
50+
51+
def test_get_bytestream_from_string_path(tmp_path):
52+
bytes_ = b"hello world"
53+
source = tmp_path / "test.txt"
54+
source.write_bytes(bytes_)
55+
56+
bs = get_bytestream_from_source(str(source), guess_mime_type=True)
57+
58+
assert isinstance(bs, ByteStream)
59+
assert bs.data == bytes_
60+
assert bs.mime_type == "text/plain"
61+
assert bs.meta["file_path"].endswith("test.txt")
62+
63+
64+
def test_get_bytestream_from_source_invalid_type():
65+
with pytest.raises(ValueError, match="Unsupported source type"):
66+
get_bytestream_from_source(123)
67+
68+
69+
def test_get_bytestream_from_source_bytestream_passthrough():
70+
bs = ByteStream(data=b"spam", mime_type="text/custom", meta={"spam": "eggs"})
71+
result = get_bytestream_from_source(bs)
72+
assert result is bs

test/components/routers/test_file_router.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,15 @@
33
# SPDX-License-Identifier: Apache-2.0
44

55
import io
6+
import mimetypes
67
import sys
8+
from pathlib import PosixPath
79
from unittest.mock import mock_open, patch
810

911
import pytest
12+
from packaging import version
1013

14+
import haystack
1115
from haystack import Pipeline
1216
from haystack.components.converters import PyPDFToDocument, TextFileToDocument
1317
from haystack.components.routers.file_type_router import FileTypeRouter
@@ -395,3 +399,50 @@ def test_pipeline_with_converters(self, test_files_path):
395399

396400
assert output["text_file_converter"]["documents"][0].meta["meta_field_1"] == "meta_value_1"
397401
assert output["pypdf_converter"]["documents"][0].meta["meta_field_2"] == "meta_value_2"
402+
403+
def test_additional_mimetypes_integration(self, tmp_path):
404+
"""
405+
Test that the component correctly routes files when additional mimetypes are provided.
406+
"""
407+
custom_mime_type = "application/x-spam"
408+
custom_extension = ".spam"
409+
test_file = tmp_path / f"test.{custom_extension}"
410+
test_file.touch()
411+
412+
# confirm that mimetypes module doesn't know about this extension by default
413+
assert custom_mime_type not in mimetypes.types_map.values()
414+
415+
# make haystack aware of the custom mime type
416+
router = FileTypeRouter(
417+
mime_types=[custom_mime_type], additional_mimetypes={custom_mime_type: custom_extension}
418+
)
419+
mappings = router.run(sources=[test_file])
420+
421+
# assert the file was classified under the custom mime type
422+
assert custom_mime_type in mappings
423+
assert test_file in mappings[custom_mime_type]
424+
425+
@pytest.mark.skipif(
426+
version.parse(haystack.__version__) >= version.parse("2.17.0"),
427+
reason="https://github.com/deepset-ai/haystack/pull/9573#issuecomment-3045237341",
428+
)
429+
def test_non_existent_file(self):
430+
"""
431+
Test conditional FileNotFoundError behavior in FileTypeRouter.
432+
433+
In Haystack versions prior to 2.17.0, `FileTypeRouter` does not raise an error
434+
when a non-existent file is passed without `meta`. However, it raises a
435+
FileNotFoundError when the same file is passed with `meta` supplied.
436+
437+
This inconsistent behavior is slated to change in 2.17.0.
438+
See: https://github.com/deepset-ai/haystack/pull/9573#issuecomment-3045237341
439+
"""
440+
router = FileTypeRouter(mime_types=[r"text/plain"])
441+
442+
# No meta - does not raise error
443+
result = router.run(sources=["non_existent.txt"])
444+
assert result == {"text/plain": [PosixPath("non_existent.txt")]}
445+
446+
# With meta - raises FileNotFoundError
447+
with pytest.raises(FileNotFoundError):
448+
router.run(sources=["non_existent.txt"], meta={"spam": "eggs"})

test/dataclasses/test_byte_stream.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,52 @@ def test_from_file_path(tmp_path, request):
2626
assert b.meta == {"foo": "bar"}
2727

2828

29+
@pytest.mark.parametrize(
30+
"file_path, expected_mime_types",
31+
[
32+
("spam.jpeg", {"image/jpeg"}),
33+
("spam.jpg", {"image/jpeg"}),
34+
("spam.png", {"image/png"}),
35+
("spam.gif", {"image/gif"}),
36+
("spam.svg", {"image/svg+xml"}),
37+
("spam.js", {"text/javascript", "application/javascript"}),
38+
("spam.txt", {"text/plain"}),
39+
("spam.html", {"text/html"}),
40+
("spam.htm", {"text/html"}),
41+
("spam.css", {"text/css"}),
42+
("spam.csv", {"text/csv"}),
43+
("spam.md", {"text/markdown"}), # custom mapping
44+
("spam.markdown", {"text/markdown"}), # custom mapping
45+
("spam.msg", {"application/vnd.ms-outlook"}), # custom mapping
46+
("spam.pdf", {"application/pdf"}),
47+
("spam.xml", {"application/xml", "text/xml"}),
48+
("spam.json", {"application/json"}),
49+
("spam.doc", {"application/msword"}),
50+
("spam.docx", {"application/vnd.openxmlformats-officedocument.wordprocessingml.document"}),
51+
("spam.xls", {"application/vnd.ms-excel"}),
52+
("spam.xlsx", {"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"}),
53+
("spam.ppt", {"application/vnd.ms-powerpoint"}),
54+
("spam.pptx", {"application/vnd.openxmlformats-officedocument.presentationml.presentation"}),
55+
],
56+
)
57+
def test_from_file_path_guess_mime_type(file_path, expected_mime_types, tmp_path):
58+
test_file = tmp_path / file_path
59+
test_file.touch()
60+
61+
b = ByteStream.from_file_path(test_file, guess_mime_type=True)
62+
assert b.mime_type in expected_mime_types
63+
64+
65+
def test_explicit_mime_type_is_not_overwritten_by_guessing(tmp_path):
66+
# create empty file with correct extension
67+
test_file = tmp_path / "sample.md"
68+
test_file.touch()
69+
70+
explicit_mime_type = "text/x-rst"
71+
b = ByteStream.from_file_path(test_file, mime_type=explicit_mime_type, guess_mime_type=True)
72+
assert b.mime_type == explicit_mime_type
73+
74+
2975
def test_from_string():
3076
test_string = "Hello, world!"
3177
b = ByteStream.from_string(test_string)

0 commit comments

Comments
 (0)