Skip to content

Commit 65ecf6c

Browse files
Merge pull request #284 from Dana-Farber-AIOS/infer_backend_from_path
Infer backend from path
2 parents ea9439e + 94ffa23 commit 65ecf6c

File tree

3 files changed: 33 additions and 48 deletions

3 files changed

+33
-48
lines changed

pathml/core/slide_data.py

Lines changed: 23 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -15,33 +15,29 @@
1515
import pathml.core
1616
import pathml.preprocessing.pipeline
1717
from pathml.core.slide_types import SlideType
18-
from torch.utils.data import Dataset
1918

2019

21-
def get_file_ext(path):
20+
def infer_backend(path):
2221
"""
23-
Return the file extension of an input path.
24-
If zipped with 'gz' or 'bz2' extension, will instead return the second to last extension.
25-
If multiple extensions, will return the last two.
22+
Checks file extensions to try to infer correct backend to use.
23+
Uses the file extensions from the sets contained in this file (pathml/core/slide_data.py)
24+
For file formats which are supported by both openslide and bioformats, will return "bioformats".
2625
2726
Args:
2827
path: path to file
2928
3029
Returns:
31-
str: file extension
30+
str: one of "bioformats", "openslide", "dicom", "h5path"
3231
"""
33-
p = Path(path)
34-
ext = p.suffixes
35-
if not ext:
36-
raise Exception(f"invalid path has no file extension: {path}")
37-
elif len(ext) == 1:
38-
ext = ext[0]
39-
elif len(ext) >= 2:
40-
if ext[-1] in {".gz", ".bz2"}:
41-
ext = ext[-2]
42-
else:
43-
ext = "".join(ext[-2:])
44-
return ext
32+
path = str(path)
33+
for extension_set, name in zip(
34+
[pathmlext, bioformatsext, openslideext, dicomext],
35+
["h5path", "bioformats", "openslide", "dicom"],
36+
):
37+
for ext in extension_set:
38+
if path[-len(ext) :] == ext:
39+
return name
40+
raise ValueError(f"input path {path} doesn't match any supported file extensions")
4541

4642

4743
class SlideData:
@@ -55,8 +51,11 @@ class SlideData:
5551
tiles (pathml.core.Tiles, optional): object containing {coordinates, tile} pairs
5652
labels (collections.OrderedDict, optional): dictionary containing {key, label} pairs
5753
backend (str, optional): backend to use for interfacing with slide on disk.
58-
Must be one of {"OpenSlide", "BioFormats", "DICOM"} (case-insensitive).
54+
Must be one of {"OpenSlide", "BioFormats", "DICOM", "h5path"} (case-insensitive).
55+
Note that for supported image formats, OpenSlide performance can be significantly better than BioFormats.
56+
Consider specifying ``backend = "openslide"`` when possible.
5957
If ``None``, and a ``filepath`` is provided, tries to infer the correct backend from the file extension.
58+
Defaults to ``None``.
6059
slide_type (pathml.core.SlideType, optional): slide type specification. Must be a
6160
:class:`~pathml.core.SlideType` object. Alternatively, slide type can be specified by using the
6261
parameters ``stain``, ``tma``, ``rgb``, ``volumetric``, and ``time_series``.
@@ -121,8 +120,8 @@ def __init__(
121120
), f"slide_type is of type {type(slide_type)} but must be of type pathml.core.types.SlideType"
122121
assert backend is None or (
123122
isinstance(backend, str)
124-
and backend.lower() in {"openslide", "bioformats", "dicom"}
125-
), f"backend {backend} must be one of ['OpenSlide', 'BioFormats', 'DICOM'] (case-insensitive)."
123+
and backend.lower() in {"openslide", "bioformats", "dicom", "h5path"}
124+
), f"backend {backend} must be one of ['OpenSlide', 'BioFormats', 'DICOM', 'h5path'] (case-insensitive)."
126125
assert counts is None or isinstance(
127126
counts, anndata.AnnData
128127
), f"counts is if type {type(counts)} but must be of type anndata.AnnData"
@@ -146,7 +145,7 @@ def __init__(
146145

147146
# get name from filepath if no name is provided
148147
if name is None and filepath is not None:
149-
name = Path(filepath).stem
148+
name = Path(filepath).name
150149

151150
_load_from_h5path = False
152151

@@ -155,21 +154,9 @@ def __init__(
155154
backend = backend.lower()
156155
else:
157156
# try to infer the correct backend
158-
ext = get_file_ext(filepath)
159-
if ext in openslideext:
160-
backend = "openslide"
161-
elif ext in bioformatsext:
162-
backend = "bioformats"
163-
elif ext in dicomext:
164-
backend = "dicom"
165-
elif ext in pathmlext:
166-
backend = "h5path"
167-
# load SlideData from h5 or h5path
157+
backend = infer_backend(filepath)
158+
if backend == "h5path":
168159
_load_from_h5path = True
169-
else:
170-
raise ValueError(
171-
f"Backend not specified, but cannot infer correct backend from input path {filepath}"
172-
)
173160

174161
if backend.lower() == "openslide":
175162
backend_obj = pathml.core.OpenSlideBackend(filepath)

tests/core_tests/test_slide_data.py

Lines changed: 9 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@
1818
BioFormatsBackend,
1919
Tile,
2020
)
21-
from pathml.core.slide_data import get_file_ext
21+
from pathml.core.slide_data import infer_backend
2222
from pathml.preprocessing import Pipeline, BoxBlur
2323

2424

@@ -29,18 +29,16 @@ def test_repr(slide):
2929

3030

3131
@pytest.mark.parametrize(
32-
"path,ext",
32+
"path,backend",
3333
[
34-
("/test/testing/test.txt", ".txt"),
35-
("/test/testing/test.txt.gz", ".txt"),
36-
("/test/testing/test.txt.bz2", ".txt"),
37-
("/test/testing/test.qptiff", ".qptiff"),
38-
("/test/testing/test.ext1.ext2", ".ext1.ext2"),
34+
("/test/testing/test.qptiff", "bioformats"),
35+
("/test/dot.dot/space space space/File with.spaces and.dots.h5path", "h5path"),
36+
("test.dcm", "dicom"),
37+
("test.file.multiple.exts.jpg.qptiff.tiff.ome.tiff", "bioformats"),
3938
],
4039
)
41-
def test_get_file_ext(path, ext):
42-
result = get_file_ext(path)
43-
assert result == ext
40+
def test_infer_backend(path, backend):
41+
assert infer_backend(path) == backend
4442

4543

4644
def test_write_with_array_labels(tmp_path, example_slide_data):
@@ -125,7 +123,7 @@ def test_generate_tiles_padding(he_slide, pad):
125123

126124
def test_read_write_heslide(tmp_path, example_slide_data_with_tiles):
127125
slidedata = example_slide_data_with_tiles
128-
path = tmp_path / "testhe.h5"
126+
path = tmp_path / "testhe.test.test.dots space dots.h5"
129127
slidedata.write(path)
130128
readslidedata = SlideData(path)
131129
repr(readslidedata)

tests/ml_tests/test_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ def test_dataset(tmp_path, im_path):
5454
else:
5555
assert v == labs[k]
5656

57-
if wsi.name == "small_vectra":
57+
if wsi.name == "small_vectra.qptiff":
5858
# 5-dim images (XYZCT converted to TCZXY for batching)
5959
assert np.array_equal(im, wsi.tiles[0].image.transpose(4, 3, 2, 1, 0))
6060
else:

0 commit comments

Comments
 (0)