Skip to content

Commit c0a30d6

Browse files
Merge pull request #290 from Dana-Farber-AIOS/release-2.0.4
Release 2.0.4
2 parents 1010b3a + 2aa4f62 commit c0a30d6

File tree

14 files changed

+149
-103
lines changed

14 files changed

+149
-103
lines changed

.github/workflows/tests-conda.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ jobs:
4040
# uses: conda-incubator/setup-miniconda@f4c00b0ec69bdc87b1ab4972613558dd9f4f36f3
4141
uses: conda-incubator/setup-miniconda@v2.0.0
4242
with:
43+
add_pip_as_python_dependency: false
4344
environment-file: environment.yml
4445
activate-environment: pathml
4546
python-version: ${{ matrix.python-version }}

CONTRIBUTING.rst

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,16 @@ Request a new feature by filing an issue on GitHub. Make sure to include the fol
3737
For developers
3838
==============
3939

40+
Coordinate system conventions
41+
-----------------------------
42+
43+
With multiple tools for interacting with matrices/images, conflicting coordinate systems have been a common source of
44+
bugs. This is typically caused by mixing up (X, Y) coordinate systems and (i, j) coordinate systems. **To avoid these
45+
issues, we have adopted the (i, j) coordinate convention throughout PathML.** This follows the convention used by
46+
NumPy and many others, where ``A[i, j]`` refers to the element of matrix A in the ith row, jth column.
47+
Developers should be careful about coordinate systems and make the necessary adjustments when using third-party tools
48+
so that users of PathML can rely on a consistent coordinate system when using our tools.
49+
4050
Setting up a local development environment
4151
-------------------------------------------
4252

@@ -94,12 +104,15 @@ How to contribute code, documentation, etc.
94104
6. Push your changes and open a pull request on GitHub referencing the corresponding issue
95105
7. Respond to discussion/feedback about the pull request, make changes as necessary
96106

97-
Versioning
98-
----------
107+
Versioning and Distributing
108+
---------------------------
99109

100110
We use `semantic versioning`_. The version is tracked in ``pathml/_version.py`` and should be updated there as required.
101-
When new code is merged to the master branch on GitHub, the version should be incremented and the commit should
102-
be tagged in version format (e.g., "v1.0.0" for version 1.0.0).
111+
When new code is merged to the master branch on GitHub, the version should be incremented and a new release should be
112+
pushed. Releases can be created using the GitHub website interface, and should be tagged in version format
113+
(e.g., "v1.0.0" for version 1.0.0) and include release notes indicating what has changed.
114+
Once a new release is created, GitHub Actions workflows will automatically build and publish the updated package on
115+
PyPI and TestPyPI, as well as build and publish the Docker image to Docker Hub.
103116

104117
Code Quality
105118
------------

environment.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ channels:
55
- pytorch
66

77
dependencies:
8-
- pip==21.2.2
8+
- pip==21.3.1
99
- python==3.8
1010
- numpy==1.19.5
1111
- scipy==1.7.3

pathml/_version.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,4 @@
33
License: GNU GPL 2.0
44
"""
55

6-
__version__ = "2.0.3"
6+
__version__ = "2.0.4"

pathml/core/slide_backends.py

Lines changed: 31 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,10 @@ class BioFormatsBackend(SlideBackend):
252252
filename (str): path to image file on disk
253253
dtype (numpy.dtype): data type of image. If ``None``, will use BioFormats to infer the data type from the
254254
image's OME metadata. Defaults to ``None``.
255+
256+
Note:
257+
While the Bio-Formats convention uses XYZCT dimension order, we use YXZCT for compatibility with the rest of
258+
PathML, which is based on the (i, j) coordinate system.
255259
"""
256260

257261
def __init__(self, filename, dtype=None):
@@ -281,7 +285,8 @@ def __init__(self, filename, dtype=None):
281285
reader.getSizeC(),
282286
reader.getSizeT(),
283287
)
284-
sizeSeries.append((sizex, sizey, sizez, sizec, sizet))
288+
# use yxzct for compatibility with the rest of PathML which uses i,j coords (not x, y)
289+
sizeSeries.append((sizey, sizex, sizez, sizec, sizet))
285290
s = [s[0] * s[1] for s in sizeSeries]
286291

287292
self.level_count = seriesCount # count of levels
@@ -332,7 +337,7 @@ def get_image_shape(self, level=None):
332337
Defaults to ``None``.
333338
334339
Returns:
335-
Tuple[int, int]: Shape of image (H, W)
340+
Tuple[int, int]: Shape of image (i, j) at target level
336341
"""
337342
if level is None:
338343
return self.shape[:2]
@@ -343,25 +348,29 @@ def get_image_shape(self, level=None):
343348
), f"input level {level} invalid for slide with {self.level_count} levels total"
344349
return self.shape_list[level][:2]
345350

346-
def extract_region(self, location, size, level=0, series_as_channels=False):
351+
def extract_region(
352+
self, location, size, level=0, series_as_channels=False, normalize=True
353+
):
347354
"""
348355
Extract a region of the image. All bioformats images have 5 dimensions representing
349-
(x, y, z, channel, time). Even if an image does not have multiple z-series or time-series,
350-
those dimensions will still be kept. For example, a standard RGB image will be of shape (x, y, 1, 3, 1).
356+
(i, j, z, channel, time). Even if an image does not have multiple z-series or time-series,
357+
those dimensions will still be kept. For example, a standard RGB image will be of shape (i, j, 1, 3, 1).
351358
If a tuple with len < 5 is passed, missing dimensions will be
352359
retrieved in full.
353360
354361
Args:
355-
location (Tuple[int, int]): (X,Y) location of corner of extracted region closest to the origin.
356-
size (Tuple[int, int, ...]): (X,Y) size of each region. If an integer is passed, will convert to a
357-
tuple of (H, W) and extract a square region. If a tuple with len < 5 is passed, missing
362+
location (Tuple[int, int]): (i, j) location of corner of extracted region closest to the origin.
363+
size (Tuple[int, int, ...]): (i, j) size of each region. If an integer is passed, will convert to a
364+
tuple of (i, j) and extract a square region. If a tuple with len < 5 is passed, missing
358365
dimensions will be retrieved in full.
359366
level (int): level from which to extract chunks. Level 0 is highest resolution. Defaults to 0.
360367
series_as_channels (bool): Whether to treat image series as channels. If ``True``, multi-level images
361368
are not supported. Defaults to ``False``.
369+
normalize (bool, optional): Whether to normalize the image to uint8 before returning. Defaults to True.
370+
If False, image will be returned as-is immediately after reading, typically in float64.
362371
363372
Returns:
364-
np.ndarray: image at the specified region. 5-D array of (x, y, z, c, t)
373+
np.ndarray: image at the specified region. 5-D array of (i, j, z, c, t)
365374
"""
366375
if level is None:
367376
level = 0
@@ -412,7 +421,7 @@ def extract_region(self, location, size, level=0, series_as_channels=False):
412421
t=0,
413422
series=level,
414423
rescale=False,
415-
XYWH=(location[0], location[1], 2, 2),
424+
XYWH=(location[1], location[0], 2, 2),
416425
)
417426

418427
# need this part because some facilities output images where the channels are incorrectly stored as series
@@ -426,11 +435,10 @@ def extract_region(self, location, size, level=0, series_as_channels=False):
426435
t=t,
427436
series=c,
428437
rescale=False,
429-
XYWH=(location[0], location[1], size[0], size[1]),
438+
XYWH=(location[1], location[0], size[1], size[0]),
430439
)
431440
slicearray = np.asarray(slicearray)
432441
# some file formats read x, y out of order, transpose
433-
slicearray = np.transpose(slicearray)
434442
array[:, :, z, c, t] = slicearray
435443

436444
# in this case, channels are correctly stored as channels, and we can support multi-level images as series
@@ -442,26 +450,23 @@ def extract_region(self, location, size, level=0, series_as_channels=False):
442450
t=t,
443451
series=level,
444452
rescale=False,
445-
XYWH=(location[0], location[1], size[0], size[1]),
453+
XYWH=(location[1], location[0], size[1], size[0]),
446454
)
447455
slicearray = np.asarray(slicearray)
448-
# some file formats read x, y out of order, transpose
449-
if slicearray.shape[:2] != array.shape[:2]:
450-
slicearray = np.transpose(slicearray)
451-
# in 2d undoes transpose
452-
if len(sample.shape) == 3:
453-
slicearray = np.moveaxis(slicearray, 0, -1)
454456
if len(sample.shape) == 3:
455457
array[:, :, z, :, t] = slicearray
456458
else:
457459
array[:, :, z, level, t] = slicearray
458460

459-
# scale array before converting: https://github.com/Dana-Farber-AIOS/pathml/issues/271
460-
# first scale to [0-1]
461-
array_scaled = array / (2 ** (8 * self.pixel_dtype.itemsize))
462-
# then scale to [0-255] and convert to 8 bit
463-
array_scaled = array_scaled * 2 ** 8
464-
return array_scaled.astype(np.uint8)
461+
if not normalize:
462+
return array
463+
else:
464+
# scale array before converting: https://github.com/Dana-Farber-AIOS/pathml/issues/271
465+
# first scale to [0-1]
466+
array_scaled = array / (2 ** (8 * self.pixel_dtype.itemsize))
467+
# then scale to [0-255] and convert to 8 bit
468+
array_scaled = array_scaled * 2 ** 8
469+
return array_scaled.astype(np.uint8)
465470

466471
def get_thumbnail(self, size=None):
467472
"""
@@ -515,6 +520,7 @@ def generate_tiles(self, shape=3000, stride=None, pad=False, level=0, **kwargs):
515520
pad (bool): How to handle tiles on the edges. If ``True``, these edge tiles will be zero-padded
516521
and yielded with the other chunks. If ``False``, incomplete edge chunks will be ignored.
517522
Defaults to ``False``.
523+
**kwargs: Other arguments passed through to ``extract_region()`` method.
518524
519525
Yields:
520526
pathml.core.tile.Tile: Extracted Tile object

pathml/core/slide_data.py

Lines changed: 28 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -15,33 +15,29 @@
1515
import pathml.core
1616
import pathml.preprocessing.pipeline
1717
from pathml.core.slide_types import SlideType
18-
from torch.utils.data import Dataset
1918

2019

21-
def get_file_ext(path):
20+
def infer_backend(path):
2221
"""
23-
Return the file extension of an input path.
24-
If zipped with 'gz' or 'bz2' extension, will instead return the second to last extension.
25-
If multiple extensions, will return the last two.
22+
Checks file extensions to try to infer correct backend to use.
23+
Uses the file extensions from the sets contained in this file (pathml/core/slide_data.py)
24+
For file formats which are supported by both openslide and bioformats, will return "bioformats".
2625
2726
Args:
2827
path: path to file
2928
3029
Returns:
31-
str: file extension
30+
str: one of "bioformats", "openslide", "dicom", "h5path"
3231
"""
33-
p = Path(path)
34-
ext = p.suffixes
35-
if not ext:
36-
raise Exception(f"invalid path has no file extension: {path}")
37-
elif len(ext) == 1:
38-
ext = ext[0]
39-
elif len(ext) >= 2:
40-
if ext[-1] in {".gz", ".bz2"}:
41-
ext = ext[-2]
42-
else:
43-
ext = "".join(ext[-2:])
44-
return ext
32+
path = str(path)
33+
for extension_set, name in zip(
34+
[pathmlext, bioformatsext, openslideext, dicomext],
35+
["h5path", "bioformats", "openslide", "dicom"],
36+
):
37+
for ext in extension_set:
38+
if path[-len(ext) :] == ext:
39+
return name
40+
raise ValueError(f"input path {path} doesn't match any supported file extensions")
4541

4642

4743
class SlideData:
@@ -55,8 +51,11 @@ class SlideData:
5551
tiles (pathml.core.Tiles, optional): object containing {coordinates, tile} pairs
5652
labels (collections.OrderedDict, optional): dictionary containing {key, label} pairs
5753
backend (str, optional): backend to use for interfacing with slide on disk.
58-
Must be one of {"OpenSlide", "BioFormats", "DICOM"} (case-insensitive).
54+
Must be one of {"OpenSlide", "BioFormats", "DICOM", "h5path"} (case-insensitive).
55+
Note that for supported image formats, OpenSlide performance can be significantly better than BioFormats.
56+
Consider specifying ``backend = "openslide"`` when possible.
5957
If ``None``, and a ``filepath`` is provided, tries to infer the correct backend from the file extension.
58+
Defaults to ``None``.
6059
slide_type (pathml.core.SlideType, optional): slide type specification. Must be a
6160
:class:`~pathml.core.SlideType` object. Alternatively, slide type can be specified by using the
6261
parameters ``stain``, ``tma``, ``rgb``, ``volumetric``, and ``time_series``.
@@ -91,6 +90,7 @@ def __init__(
9190
volumetric=None,
9291
time_series=None,
9392
counts=None,
93+
dtype=None,
9494
):
9595
# check inputs
9696
assert masks is None or isinstance(
@@ -120,8 +120,8 @@ def __init__(
120120
), f"slide_type is of type {type(slide_type)} but must be of type pathml.core.types.SlideType"
121121
assert backend is None or (
122122
isinstance(backend, str)
123-
and backend.lower() in {"openslide", "bioformats", "dicom"}
124-
), f"backend {backend} must be one of ['OpenSlide', 'BioFormats', 'DICOM'] (case-insensitive)."
123+
and backend.lower() in {"openslide", "bioformats", "dicom", "h5path"}
124+
), f"backend {backend} must be one of ['OpenSlide', 'BioFormats', 'DICOM', 'h5path'] (case-insensitive)."
125125
assert counts is None or isinstance(
126126
counts, anndata.AnnData
127127
), f"counts is if type {type(counts)} but must be of type anndata.AnnData"
@@ -145,7 +145,7 @@ def __init__(
145145

146146
# get name from filepath if no name is provided
147147
if name is None and filepath is not None:
148-
name = Path(filepath).stem
148+
name = Path(filepath).name
149149

150150
_load_from_h5path = False
151151

@@ -154,26 +154,14 @@ def __init__(
154154
backend = backend.lower()
155155
else:
156156
# try to infer the correct backend
157-
ext = get_file_ext(filepath)
158-
if ext in openslideext:
159-
backend = "openslide"
160-
elif ext in bioformatsext:
161-
backend = "bioformats"
162-
elif ext in dicomext:
163-
backend = "dicom"
164-
elif ext in pathmlext:
165-
backend = "h5path"
166-
# load SlideData from h5 or h5path
157+
backend = infer_backend(filepath)
158+
if backend == "h5path":
167159
_load_from_h5path = True
168-
else:
169-
raise ValueError(
170-
f"Backend not specified, but cannot infer correct backend from input path {filepath}"
171-
)
172160

173161
if backend.lower() == "openslide":
174162
backend_obj = pathml.core.OpenSlideBackend(filepath)
175163
elif backend.lower() == "bioformats":
176-
backend_obj = pathml.core.BioFormatsBackend(filepath)
164+
backend_obj = pathml.core.BioFormatsBackend(filepath, dtype)
177165
elif backend.lower() == "dicom":
178166
backend_obj = pathml.core.DICOMBackend(filepath)
179167
elif backend.lower() == "h5path":
@@ -279,6 +267,7 @@ def run(
279267
write_dir (str): Path to directory to write the processed slide to. The processed SlideData object
280268
will be written to the directory immediately after the pipeline has completed running.
281269
The filepath will default to "<write_dir>/<slide.name>.h5path". Defaults to ``None``.
270+
**kwargs: Other arguments passed through to ``generate_tiles()`` method of the backend.
282271
"""
283272
assert isinstance(
284273
pipeline, pathml.preprocessing.pipeline.Pipeline
@@ -381,8 +370,8 @@ def extract_region(self, location, size, *args, **kwargs):
381370
location (Tuple[int, int]): Location of top-left corner of tile (i, j)
382371
size (Union[int, Tuple[int, int]]): Size of each tile. May be a tuple of (height, width) or a
383372
single integer, in which case square tiles of that size are generated.
384-
*args: positional arguments passed through
385-
**kwargs: keyword arguments passed through
373+
*args: positional arguments passed through to ``extract_region()`` method of the backend.
374+
**kwargs: keyword arguments passed through to ``extract_region()`` method of the backend.
386375
387376
Returns:
388377
np.ndarray: image at the specified region

pathml/core/tile.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ def __init__(
108108
# remove any Nones
109109
stain_type_dict = {key: val for key, val in stain_type_dict.items() if val}
110110
if stain_type_dict:
111-
slide_type = pathml.core.types.SlideType(**stain_type_dict)
111+
slide_type = pathml.core.slide_types.SlideType(**stain_type_dict)
112112

113113
assert counts is None or isinstance(
114114
counts, anndata.AnnData

pathml/datasets/pannuke.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def _download_pannuke(self, download_dir):
249249
# don't download if the directory already exists
250250
if not os.path.isdir(p):
251251
print(f"Downloading fold {fold_ix}")
252-
url = f"https://warwick.ac.uk/fac/sci/dcs/research/tia/data/pannuke/fold_{fold_ix}.zip"
252+
url = f"https://warwick.ac.uk/fac/cross_fac/tia/data/pannuke/fold_{fold_ix}.zip"
253253
name = os.path.basename(url)
254254
download_from_url(url=url, download_dir=download_dir, name=name)
255255
path = os.path.join(download_dir, name)

0 commit comments

Comments
 (0)