feat(inference): add hdf5 file reading

okunator · okunator · commit b3e06e9045c9 · 2023-01-13T17:17:34.000+02:00
diff --git a/cellseg_models_pytorch/inference/__init__.py b/cellseg_models_pytorch/inference/__init__.py
@@ -1,5 +1,5 @@
 from ._base_inferer import BaseInferer
-from .folder_dataset import FolderDataset
+from .folder_dataset_infer import FolderDatasetInfer
 from .post_processor import PostProcessor
 from .predictor import Predictor
 from .resize_inferer import ResizeInferer
@@ -11,5 +11,5 @@
     "PostProcessor",
     "ResizeInferer",
     "SlidingWindowInferer",
-    "FolderDataset",
+    "FolderDatasetInfer",
 ]
diff --git a/cellseg_models_pytorch/inference/_base_inferer.py b/cellseg_models_pytorch/inference/_base_inferer.py
@@ -13,7 +13,8 @@
 from tqdm import tqdm
 
 from ..utils import FileHandler, tensor_to_ndarray
-from .folder_dataset import FolderDataset
+from .folder_dataset_infer import FolderDatasetInfer
+from .hdf5_dataset_infer import HDF5DatasetInfer
 from .post_processor import PostProcessor
 from .predictor import Predictor
 
@@ -22,7 +23,7 @@ class BaseInferer(ABC):
     def __init__(
         self,
         model: nn.Module,
-        input_folder: Union[Path, str],
+        input_path: Union[Path, str],
         out_activations: Dict[str, str],
         out_boundary_weights: Dict[str, bool],
         patch_size: Tuple[int, int],
@@ -47,8 +48,8 @@ def __init__(
         ----------
             model : nn.Module
                 A segmentation model.
-            input_folder : Path | str
-                Path to a folder of images.
+            input_path : Path | str
+                Path to a folder of images or to hdf5 db.
             out_activations : Dict[str, str]
                 Dictionary of head names mapped to a string value that specifies the
                 activation applied at the head. E.g. {"type": "tanh", "cellpose": None}
@@ -87,7 +88,7 @@ def __init__(
             checkpoint_path : Path | str, optional
                 Path to the model weight checkpoints.
             n_images : int, optional
-                First n-number of images used from the `input_folder`.
+                First n-number of images used from the `input_path`.
             type_post_proc : Callable, optional
                 A post-processing function for the type maps. If not None, overrides
                 the default.
@@ -112,21 +113,28 @@ def __init__(
         self.save_intermediate = save_intermediate
         self.save_format = save_format
 
-        # dataloader
-        self.path = Path(input_folder)
-
-        folder_ds = FolderDataset(self.path, n_images=n_images)
-        if self.save_dir is None and len(folder_ds.fnames) > 40:
-            warnings.warn(
-                "`save_dir` is None. Thus, the outputs are be saved in `out_masks` "
-                "class variable. If the input folder contains many images, running "
-                "inference will likely flood the memory depending on the size and "
-                "number of the images. Consider saving outputs to disk by providing "
-                "`save_dir` argument."
+        # dataset & dataloader
+        self.path = Path(input_path)
+        if self.path.is_dir():
+            ds = FolderDatasetInfer(self.path, n_images=n_images)
+            if self.save_dir is None and len(ds.fnames) > 40:
+                warnings.warn(
+                    "`save_dir` is None. Thus, the outputs are be saved in `out_masks` "
+                    "class attribute. If the input folder contains many images, running"
+                    " inference will likely flood the memory depending on the size and "
+                    "number of the images. Consider saving outputs to disk by providing"
+                    " `save_dir` argument."
+                )
+        elif self.path.is_file() and self.path.suffix in (".h5", ".hdf5"):
+            ds = HDF5DatasetInfer(self.path, n_images=n_images)
+        else:
+            raise ValueError(
+                f"Given `input_path`: {input_path} is neither an image folder or a h5 "
+                "database. Allowed suffices for h5 database are ('.h5', '.hdf5')"
             )
 
         self.dataloader = DataLoader(
-            folder_ds, batch_size=batch_size, shuffle=False, pin_memory=True
+            ds, batch_size=batch_size, shuffle=False, pin_memory=True
         )
 
         # Set post processor
diff --git a/cellseg_models_pytorch/inference/folder_dataset_infer.py b/cellseg_models_pytorch/inference/folder_dataset_infer.py
@@ -9,10 +9,10 @@
 SUFFIXES = (".jpeg", ".jpg", ".tif", ".tiff", ".png")
 
 
-__all__ = ["FolderDataset"]
+__all__ = ["FolderDatasetInfer"]
 
 
-class FolderDataset(Dataset, FileHandler):
+class FolderDatasetInfer(Dataset, FileHandler):
     def __init__(
         self, path: Union[str, Path], pattern: str = "*", n_images: int = None
     ) -> None:
@@ -55,10 +55,10 @@ def __len__(self) -> int:
         """Length of folder."""
         return len(self.fnames)
 
-    def __getitem__(self, index: int) -> torch.Tensor:
+    def __getitem__(self, ix: int) -> torch.Tensor:
         """Read image."""
-        fn = self.fnames[index]
+        fn = self.fnames[ix]
         im = FileHandler.read_img(fn.as_posix())
         im = torch.from_numpy(im.transpose(2, 0, 1))
 
-        return {"im": im, "file": fn.name[:-4]}
+        return {"im": im, "file": fn.with_suffix("").name}
diff --git a/cellseg_models_pytorch/inference/hdf5_dataset_infer.py b/cellseg_models_pytorch/inference/hdf5_dataset_infer.py
@@ -0,0 +1,65 @@
+from pathlib import Path
+from typing import Union
+
+import torch
+from torch.utils.data import Dataset
+
+from cellseg_models_pytorch.utils import FileHandler
+
+try:
+    import tables as tb
+except ImportError as err:  # other errors propagate with their real cause
+    raise ImportError(
+        "`pytables` needed for this class. Install with: `pip install tables`"
+    ) from err
+
+
+__all__ = ["HDF5DatasetInfer"]
+
+
+class HDF5DatasetInfer(Dataset, FileHandler):
+    def __init__(self, path: Union[str, Path], n_images: int = None, **kwargs) -> None:
+        """HDF5 dataset that can be used during inference for loading images.
+
+        NOTE: loads only images. Reads the `fnames` and `imgs` nodes of the db.
+
+        Parameters
+        ----------
+            path : str | Path
+                Path to the hdf5 db. Has to end with '.h5' or '.hdf5'.
+            n_images : int, optional
+                First n-number of images used from the db.
+
+        Raises
+        ------
+            ValueError if the input path has incorrect suffix.
+        """
+        super().__init__()
+        self.path = Path(path)
+
+        if self.path.suffix not in (".h5", ".hdf5"):
+            raise ValueError(
+                f"The input path has to be a hdf5 db. Got suffix: {self.path.suffix}. "
+                f"Allowed suffices: {('.h5', '.hdf5')}"
+            )
+
+        # Only the file names are kept in memory; `[:None]` is the same as `[:]`.
+        with tb.open_file(self.path.as_posix(), "r") as h5:
+            self.fnames = h5.root.fnames[:n_images]
+
+    def __len__(self) -> int:
+        """Return the number of items in the db."""
+        return len(self.fnames)
+
+    def __getitem__(self, ix: int) -> dict:
+        """Read the image at index `ix`.
+
+        Returns a dict: {"im": transposed (2, 0, 1) image tensor, "file": file name}.
+        """
+        fn = self.fnames[ix]
+        # The db is re-opened for every item, so no open handle lives on `self`.
+        with tb.open_file(self.path.as_posix(), "r") as h5:
+            im = h5.root.imgs[ix, ...]
+        im = torch.from_numpy(im.transpose(2, 0, 1))
+        # NOTE(review): suffix is kept here, FolderDatasetInfer strips it — confirm.
+        return {"im": im, "file": Path(fn.decode("UTF-8")).name}
diff --git a/cellseg_models_pytorch/inference/resize_inferer.py b/cellseg_models_pytorch/inference/resize_inferer.py
@@ -12,7 +12,7 @@ class ResizeInferer(BaseInferer):
     def __init__(
         self,
         model: nn.Module,
-        input_folder: Union[Path, str],
+        input_path: Union[Path, str],
         out_activations: Dict[str, str],
         out_boundary_weights: Dict[str, bool],
         resize: Tuple[int, int],
@@ -43,8 +43,8 @@ def __init__(
         ----------
             model : nn.Module
                 A segmentation model.
-            input_folder : Path | str
-                Path to a folder of images.
+            input_path : Path | str
+                Path to a folder of images or to hdf5 db.
             out_activations : Dict[str, str]
                 Dictionary of head names mapped to a string value that specifies the
                 activation applied at the head. E.g. {"type": "tanh", "cellpose": None}
@@ -83,7 +83,7 @@ def __init__(
             checkpoint_path : Path | str, optional
                 Path to the model weight checkpoints.
             n_images : int, optional
-                First n-number of images used from the `input_folder`.
+                First n-number of images used from the `input_path`.
             type_post_proc : Callable, optional
                 A post-processing function for the type maps. If not None, overrides
                 the default.
@@ -92,10 +92,34 @@ def __init__(
                 overrides the default.
             **kwargs:
                 Arbitrary keyword arguments expecially for post-processing and saving.
+
+        Examples
+        --------
+            >>> # initialize model and paths
+            >>> model = cellpose_base(len(type_classes))
+            >>> inputs = "/path/to/imgs"
+            >>> ckpt_path = "/path/to/myweights.ckpt"
+
+            >>> # initialize output head args
+            >>> out_activations={"type": "softmax", "cellpose": None}
+            >>> out_boundary_weights={"type": None, "cellpose": None}
+
+            >>> inferer = ResizeInferer(
+            ...     model=model,
+            ...     input_path=inputs,
+            ...     checkpoint_path=ckpt_path,
+            ...     out_activations=out_activations,
+            ...     out_boundary_weights=out_boundary_weights,
+            ...     resize=(256, 256),
+            ...     instance_postproc="cellpose",
+            ...     padding=0,
+            ...     normalization="minmax",  # This needs to be same as during training
+            ... )
+            >>> inferer.infer()
         """
         super().__init__(
             model=model,
-            input_folder=input_folder,
+            input_path=input_path,
             out_activations=out_activations,
             out_boundary_weights=out_boundary_weights,
             patch_size=resize,
diff --git a/cellseg_models_pytorch/inference/sliding_window_inferer.py b/cellseg_models_pytorch/inference/sliding_window_inferer.py
@@ -14,7 +14,7 @@ class SlidingWindowInferer(BaseInferer):
     def __init__(
         self,
         model: nn.Module,
-        input_folder: Union[Path, str],
+        input_path: Union[Path, str],
         out_activations: Dict[str, str],
         out_boundary_weights: Dict[str, bool],
         stride: int,
@@ -44,8 +44,8 @@ def __init__(
         ----------
             model : nn.Module
                 A segmentation model.
-            input_folder : Path | str
-                Path to a folder of images.
+            input_path : Path | str
+                Path to a folder of images or to hdf5 db.
             out_activations : Dict[str, str]
                 Dictionary of head names mapped to a string value that specifies the
                 activation applied at the head. E.g. {"type": "tanh", "cellpose": None}
@@ -86,7 +86,7 @@ def __init__(
             checkpoint_path : Path | str, optional
                 Path to the model weight checkpoints.
             n_images : int, optional
-                First n-number of images used from the `ìnput_folder`.
+                First n-number of images used from the `input_path`.
             type_post_proc : Callable, optional
                 A post-processing function for the type maps. If not None, overrides
                 the default.
@@ -95,10 +95,35 @@ def __init__(
                 overrides the default.
             **kwargs:
                 Arbitrary keyword arguments expecially for post-processing and saving.
+
+        Examples
+        --------
+            >>> # initialize model and paths
+            >>> model = cellpose_plus(len(type_classes), len(area_classes))
+            >>> inputs = "/path/to/images"
+            >>> ckpt_path = "/path/to/my_weights.ckpt"
+
+            >>> # initialize output head args
+            >>> out_activations={"type": "softmax", "cellpose": None, "sem": "softmax"}
+            >>> out_boundary_weights={"type": False, "cellpose": True, "sem": False}
+
+            >>> # Run inference
+            >>> inferer = SlidingWindowInferer(
+            ...     model=model,
+            ...     input_path=inputs,
+            ...     checkpoint_path=ckpt_path,
+            ...     out_activations=out_activations,
+            ...     out_boundary_weights=out_boundary_weights,
+            ...     stride=256,
+            ...     patch_size=(320, 320),
+            ...     instance_postproc="cellpose",
+            ...     normalization="minmax",  # This needs to be same as during training
+            ... )
+            >>> inferer.infer()
         """
         super().__init__(
             model=model,
-            input_folder=input_folder,
+            input_path=input_path,
             out_activations=out_activations,
             out_boundary_weights=out_boundary_weights,
             patch_size=patch_size,
diff --git a/changelog.d/20230113_171503_oskari.lehtonen.md b/changelog.d/20230113_171503_oskari.lehtonen.md
@@ -0,0 +1,3 @@
+## Features
+
+- Add hdf5 input file reading to `Inferer` classes.

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+## Features`
	`2`	`+`
	`3`	+- Add hdf5 input file reading to `Inferer` classes.