Commit 38f7907

Add support for cryo-et-data (#89)
Add scripts for CryoET Data Portal submissions
1 parent 1f0387a commit 38f7907

8 files changed, +337 -0 lines changed

doc/start_page.md

Lines changed: 7 additions & 0 deletions
@@ -164,3 +164,10 @@ Domain adaptation is implemented in `synapse_net.training.domain_adaptation`. Yo

We also provide functionality for 'regular' neural network training. In this case, you have to provide data **and** manual annotations for the structure(s) you want to segment.
This functionality is implemented in `synapse_net.training.supervised_training`. You can find an example script that shows how to use it [here](https://github.com/computational-cell-analytics/synapse-net/blob/main/examples/network_training.py).

## Segmentation for the CryoET Data Portal

We have published segmentation results for tomograms of synapses stored in the [CryoET Data Portal](https://cryoetdataportal.czscience.com/). So far we have made the following depositions:
- [CZCDP-10330](https://cryoetdataportal.czscience.com/depositions/10330): Contains synaptic vesicle segmentations for over 50 tomograms of synaptosomes. The segmentations were made with a model that was domain-adapted to the synaptosome tomograms.

The scripts for the submissions can be found in [scripts/cryo/cryo-et-portal](https://github.com/computational-cell-analytics/synapse-net/tree/main/scripts/cryo/cryo-et-portal).
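
For reference, a minimal sketch of how a published segmentation could be loaded with the `read_ome_zarr` helper added in this commit. The S3 URI is a hypothetical placeholder, not an actual deposition path:

# Minimal sketch: load one of the published OME-Zarr segmentations from S3.
# The URI below is a placeholder; substitute the path of an actual deposition file.
import s3fs
from synapse_net.file_utils import read_ome_zarr

fs = s3fs.S3FileSystem(anon=True)
seg, voxel_size = read_ome_zarr("s3://<bucket>/<deposition-path>/segmentation.ome.zarr", fs=fs)
print(seg.shape, voxel_size)  # The voxel size is returned in nanometer.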
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
credentials.portal
sync_with_s3.sh
segmentations/
upload_CZCDP-10330/

scripts/cryo/cryo-et-portal/README.md

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
# Segmentation for the CryoET Data Portal

Scripts to prepare submissions for the [CryoET Data Portal](https://cryoetdataportal.czscience.com).

We have created the following submissions for the portal:
- [CZCDP-10330](https://cryoetdataportal.czscience.com/depositions/10330): synaptic vesicles segmented in tomograms of synaptosomes.
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
# from torch_em.data.datasets.util import download_from_cryo_et_portal
#
# path = "/scratch-grete/projects/nim00007/cryo-et/from_portal"
#
# # TODO this is the stuff to extract later
# ids = [
#     "RN-16498",
#     "RN-16514",
#     "RN-16581",
#     "RN-16641",
# ]
#
# # "24sep24a_Position_102"
# # "24sep24a_Position_113_3"
# # "24sep24a_Position_84"
# # "24sep24a_Position_38"
#
# did = "10443"
# download_from_cryo_et_portal(path, did, download=True)

import os

import cryoet_data_portal as cdp
import s3fs

# S3 filesystem instance for anonymous access to the portal bucket.
fs = s3fs.S3FileSystem(anon=True)

# Client instance for the portal API.
client = cdp.Client()

# Run IDs (integers).
runs = [16498, 16514, 16581, 16641]

root = "/scratch-grete/projects/nim00007/cryo-et/from_portal"

# Loop over run IDs.
for run_id in runs:
    # Query denoised tomograms for this run.
    tomograms = cdp.Tomogram.find(
        client,
        [
            cdp.Tomogram.run_id == run_id,
            cdp.Tomogram.processing == "denoised",
        ]
    )

    # Select the first tomogram (there should only be one in this case).
    tomo = tomograms[0]

    # Download the denoised tomogram.
    output_folder = os.path.join(root, str(run_id))
    os.makedirs(output_folder, exist_ok=True)
    fname = f"{tomo.id}_{tomo.processing}.mrc"
    output_path = os.path.join(output_folder, fname)
    fs.get(tomo.s3_mrc_file, output_path)
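
A downloaded file can be sanity-checked with the `read_mrc` helper from `synapse_net.file_utils`; a minimal sketch, where the concrete filename (derived from the tomogram id above) is illustrative:

# Minimal sketch: verify a downloaded tomogram by reading it back.
# The filename depends on the tomogram id and is illustrative here.
from synapse_net.file_utils import read_mrc

data, voxel_size = read_mrc("/scratch-grete/projects/nim00007/cryo-et/from_portal/16498/<tomo_id>_denoised.mrc")
print(data.shape, data.dtype, voxel_size)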
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
import json
import os

from synapse_net.file_utils import read_data_from_cryo_et_portal_run
from tqdm import tqdm


def download_tomogram_list(run_ids, output_root):
    print("Downloading", len(run_ids), "tomograms")
    os.makedirs(output_root, exist_ok=True)
    for run_id in tqdm(run_ids):
        output_path = os.path.join(output_root, f"{run_id}.mrc")
        data, voxel_size = read_data_from_cryo_et_portal_run(
            run_id, use_zarr_format=False, output_path=output_path, id_field="id",
        )
        if data is None:
            print("Did not find a tomogram for", run_id)


def download_tomograms_for_da():
    with open("./list_for_da.json") as f:
        run_ids = json.load(f)
    output_root = "/scratch-grete/projects/nim00007/cryo-et/from_portal/for_domain_adaptation"
    download_tomogram_list(run_ids, output_root)


def download_tomograms_for_eval():
    with open("./list_for_eval.json") as f:
        run_ids = json.load(f)
    output_root = "/scratch-grete/projects/nim00007/cryo-et/from_portal/for_eval"
    download_tomogram_list(run_ids, output_root)


def main():
    download_tomograms_for_eval()
    # download_tomograms_for_da()


if __name__ == "__main__":
    main()
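
The id lists are loaded with `json.load` and iterated directly, so `list_for_da.json` and `list_for_eval.json` are presumably flat JSON arrays of tomogram ids, e.g. `[16498, 16514, 16581]` (made-up values). Note that `id_field="id"` is passed, i.e. the entries reference individual tomograms rather than runs.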
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
import os
from glob import glob
from pathlib import Path

import h5py
import numpy as np
import zarr

from synapse_net.file_utils import read_mrc
from tqdm import tqdm

from ome_zarr.writer import write_image
from ome_zarr.io import parse_url


# Previous input / output locations for the evaluation data; superseded by the paths below.
# IN_ROOT = "/scratch-grete/projects/nim00007/cryo-et/from_portal/for_eval"
# OUT_ROOT = "/scratch-grete/projects/nim00007/cryo-et/from_portal/segmentations/DA_with_new_portalData_origDim"  # noqa

IN_ROOT = "/scratch-grete/projects/nim00007/cryo-et/from_portal/for_domain_adaptation"
OUT_ROOT = "/scratch-grete/projects/nim00007/cryo-et/from_portal/segmentations/DA_with_new_portalData_forDAdata"


def export_to_ome_zarr(export_file, seg, voxel_size):
    store = parse_url(export_file, mode="w").store
    root = zarr.group(store=store)

    # Attach the voxel size as a scale transformation.
    scale = list(voxel_size.values())
    trafo = [
        [{"scale": scale, "type": "scale"}]
    ]
    write_image(seg, root, axes="zyx", coordinate_transformations=trafo, scaler=None)


def export_segmentation(export_folder, segmentation_file):
    fname = Path(segmentation_file).stem
    key = "/vesicles/segment_from_vesicle_DA_portal_v3"
    export_file = os.path.join(export_folder, f"{fname}.ome.zarr")

    if os.path.exists(export_file):
        return

    # Load the corresponding tomogram to get its shape and voxel size.
    input_file = os.path.join(IN_ROOT, f"{fname}.mrc")
    raw, voxel_size = read_mrc(input_file)
    # Convert the voxel size from nanometer back to angstrom.
    voxel_size = {k: v * 10 for k, v in voxel_size.items()}

    try:
        with h5py.File(segmentation_file, "r") as f:
            seg = f[key][:]
    except OSError as e:
        print(e)
        return

    # Flip back along axis 1 to match the original orientation (read_mrc flips this axis).
    seg = np.flip(seg, axis=1)
    assert seg.shape == raw.shape

    assert seg.max() < 128, f"{seg.max()}"
    seg = seg.astype("int8")
    export_to_ome_zarr(export_file, seg, voxel_size)


def main():
    export_folder = "./for_portal2"
    os.makedirs(export_folder, exist_ok=True)
    files = glob(os.path.join(OUT_ROOT, "*.h5"))
    for file in tqdm(files):
        export_segmentation(export_folder, file)


if __name__ == "__main__":
    main()
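
To spot-check an export, the written ome.zarr can be read back with the `read_ome_zarr` helper added in this commit; a minimal sketch, with an illustrative filename:

# Minimal sketch: read back an exported segmentation for inspection.
# "example.ome.zarr" stands in for one of the files written by this script.
from synapse_net.file_utils import read_ome_zarr

seg, voxel_size = read_ome_zarr("./for_portal2/example.ome.zarr")
print(seg.dtype, seg.shape, voxel_size)  # read_ome_zarr converts the scale back to nanometer.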
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
import os
from glob import glob
from pathlib import Path
from shutil import move

import cryoet_data_portal as cdp
from tqdm import tqdm


def main():
    input_folder = "segmentations"
    output_root = "upload_CZCDP-10330"

    client = cdp.Client()

    tomograms = sorted(glob(os.path.join(input_folder, "*.ome.zarr")))
    for input_file in tqdm(tomograms, desc="Formatting submission"):
        # Apply stem twice to strip the double extension '.ome.zarr' and recover the tomogram id.
        tomo_id = Path(input_file).stem
        tomo_id = int(Path(tomo_id).stem)

        tomo = cdp.Tomogram.get_by_id(client, tomo_id)
        output_folder = os.path.join(output_root, str(tomo.run.dataset_id))
        os.makedirs(output_folder, exist_ok=True)
        output_file = os.path.join(output_folder, f"{tomo.run.name}.zarr")
        move(input_file, output_file)


if __name__ == "__main__":
    main()
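
This arranges the segmentations in the layout upload_CZCDP-10330/<dataset_id>/<run_name>.zarr, with the dataset id and run name resolved from the portal via the tomogram id; presumably this is the structure that sync_with_s3.sh (see the ignored files above) then uploads.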

synapse_net/file_utils.py

Lines changed: 127 additions & 0 deletions
@@ -5,6 +5,21 @@
import numpy as np
import pooch

try:
    import cryoet_data_portal as cdp
except ImportError:
    cdp = None

try:
    import zarr
except ImportError:
    zarr = None

try:
    import s3fs
except ImportError:
    s3fs = None


def get_cache_dir() -> str:
    """Get the cache directory of synapse net.
@@ -88,3 +103,115 @@ def read_mrc(path: str) -> Tuple[np.ndarray, Dict[str, float]]:
    # Transpose the data to match python axis order.
    data = np.flip(data, axis=1) if data.ndim == 3 else np.flip(data, axis=0)
    return data, voxel_size


def read_ome_zarr(uri: str, scale_level: int = 0, fs=None) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read data and voxel size from an ome.zarr file.

    Args:
        uri: Path or URL to the ome.zarr file.
        scale_level: The level of the multi-scale image pyramid to load.
        fs: S3 filesystem to use for initializing the store.

    Returns:
        The data read from the file.
        The voxel size read from the file.
    """
    if zarr is None:
        raise RuntimeError("The zarr library is required to read ome.zarr files.")

    def parse_s3_uri(uri):
        # Note: str.lstrip would strip characters rather than the prefix, so we slice instead.
        return uri[len("s3://"):] if uri.startswith("s3://") else uri

    if uri.startswith("s3"):
        if s3fs is None:
            raise RuntimeError("The s3fs library is required to read ome.zarr files from S3.")
        if fs is None:
            fs = s3fs.S3FileSystem(anon=True)
        s3_uri = parse_s3_uri(uri)
        store = s3fs.S3Map(root=s3_uri, s3=fs, check=False)
    elif fs is not None:
        s3_uri = parse_s3_uri(uri)
        store = s3fs.S3Map(root=s3_uri, s3=fs, check=False)
    else:
        if not os.path.exists(uri):
            raise ValueError(f"Cannot find the filepath at {uri}.")
        store = uri

    with zarr.open(store, "r") as f:
        multiscales = f.attrs["multiscales"][0]

        # Read the axis and transformation metadata for this dataset, to determine the voxel size.
        axes = [axis["name"] for axis in multiscales["axes"]]
        assert set(axes) == set("xyz")
        transformations = multiscales["datasets"][scale_level]["coordinateTransformations"]
        scale_transformation = [trafo["scale"] for trafo in transformations if trafo["type"] == "scale"][0]

        # The voxel size is given in angstrom; we divide it by 10 to convert it to nanometer.
        voxel_size = {axis: scale / 10.0 for axis, scale in zip(axes, scale_transformation)}

        # Get the internal path for the given scale and load the data.
        internal_path = multiscales["datasets"][scale_level]["path"]
        data = f[internal_path][:]

    return data, voxel_size


def read_data_from_cryo_et_portal_run(
    run_id: int,
    output_path: Optional[str] = None,
    use_zarr_format: bool = True,
    processing_type: str = "denoised",
    id_field: str = "run_id",
    scale_level: Optional[int] = None,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read data and voxel size from a CryoET Data Portal run.

    Args:
        run_id: The ID of the experiment run.
        output_path: The path for saving the data. The data will be streamed if the path is not given.
        use_zarr_format: Whether to use the data in zarr format instead of mrc.
        processing_type: The processing type of the tomogram to download.
        id_field: The name of the id field. One of 'id' or 'run_id'.
            The 'id' references specific tomograms, whereas 'run_id' references a collection of experimental data.
        scale_level: The scale level to read from the data. Only valid for zarr data.

    Returns:
        The data read from the run.
        The voxel size read from the run.
    """
    assert id_field in ("id", "run_id")
    if output_path is not None and os.path.exists(output_path):
        return read_ome_zarr(output_path) if use_zarr_format else read_mrc(output_path)

    if cdp is None:
        raise RuntimeError("The CryoET data portal library is required to download data from the portal.")
    if s3fs is None:
        raise RuntimeError("The s3fs library is required to download data from the portal.")

    client = cdp.Client()

    fs = s3fs.S3FileSystem(anon=True)
    tomograms = cdp.Tomogram.find(
        client, [getattr(cdp.Tomogram, id_field) == run_id, cdp.Tomogram.processing == processing_type]
    )
    if len(tomograms) == 0:
        return None, None
    if len(tomograms) > 1:
        raise NotImplementedError
    tomo = tomograms[0]

    if use_zarr_format:
        if output_path is None:
            scale_level = 0 if scale_level is None else scale_level
            data, voxel_size = read_ome_zarr(tomo.s3_omezarr_dir, scale_level=scale_level, fs=fs)
        else:
            # TODO: write the output to ome zarr, for all scale levels.
            raise NotImplementedError
    else:
        if scale_level is not None:
            raise ValueError("The scale_level argument is only supported for zarr data.")
        if output_path is None:
            raise RuntimeError("You have to pass an output_path to download the data as mrc file.")
        fs.get(tomo.s3_mrc_file, output_path)
        data, voxel_size = read_mrc(output_path)

    return data, voxel_size
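
For illustration, a minimal sketch of the two modes of `read_data_from_cryo_et_portal_run`, using one of the run ids from the download scripts above:

# Minimal sketch: stream a denoised tomogram in zarr format, or download it as mrc.
from synapse_net.file_utils import read_data_from_cryo_et_portal_run

# Stream the ome.zarr data directly from S3 (scale level 0 by default).
data, voxel_size = read_data_from_cryo_et_portal_run(16498, use_zarr_format=True)

# Download the mrc file to disk and read it from there.
data, voxel_size = read_data_from_cryo_et_portal_run(
    16498, output_path="./16498.mrc", use_zarr_format=False,
)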
