
Commit c75abd9

Update the example scripts and add downloads for data from zenodo
1 parent ba56b57 commit c75abd9

File tree

6 files changed: +94 -38 lines changed

examples/.gitignore

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+data/
+set_up_pool.py
+*.h5
+*.tif
+*.mrc

examples/analysis_pipeline.py

Lines changed: 23 additions & 14 deletions
@@ -6,10 +6,11 @@
 from skimage.measure import regionprops
 from skimage.segmentation import find_boundaries
 
-from synapse_net.file_utils import read_mrc
-from synapse_net.sample_data import get_sample_data
 from synapse_net.distance_measurements import measure_segmentation_to_object_distances
+from synapse_net.file_utils import read_mrc
+from synapse_net.imod.to_imod import convert_segmentation_to_spheres
 from synapse_net.inference import compute_scale_from_voxel_size, get_model, run_segmentation
+from synapse_net.sample_data import get_sample_data
 
 
 def segment_structures(tomogram, voxel_size):
@@ -72,15 +73,23 @@ def postprocess_segmentation(segmentations):
 
 
 def measure_distances(segmentations, voxel_size):
+    # Here, we measure the distances from each vesicle to the active zone.
+    # We use the function 'measure_segmentation_to_object_distances' for this,
+    # which uses an euclidean distance transform scaled with the voxel size
+    # to determine distances.
     vesicles, active_zone = segmentations["vesicles"], segmentations["active_zone"]
     voxel_size = tuple(voxel_size[ax] for ax in "zyx")
    distances, _, _, vesicle_ids = measure_segmentation_to_object_distances(
        vesicles, active_zone, resolution=voxel_size
    )
+    # We convert the result to a pandas data frame.
     return pd.DataFrame({"vesicle_id": vesicle_ids, "distance": distances})
 
 
 def assign_vesicle_pools(vesicle_attributes):
+    # We assign the vesicles to their respective pool, 'docked' and 'non-attached',
+    # based on the criterion of being within 2 nm from the active zone.
+    # We add the pool assignment as a new column to the dataframe with vesicle attributes.
     docked_vesicle_distance = 2  # nm
     vesicle_attributes["pool"] = vesicle_attributes["distance"].apply(
         lambda x: "docked" if x < docked_vesicle_distance else "non-attached"
@@ -89,6 +98,7 @@ def assign_vesicle_pools(vesicle_attributes):
 
 
 def visualize_results(tomogram, segmentations, vesicle_attributes):
+    # Here, we visualize the segmentation and pool assignment result in napari.
 
     # Create a segmentation to visualize the vesicle pools.
     docked_ids = vesicle_attributes[vesicle_attributes.pool == "docked"].vesicle_id
@@ -97,6 +107,7 @@ def visualize_results(tomogram, segmentations, vesicle_attributes):
     vesicle_pools = np.isin(vesicles, docked_ids).astype("uint8")
     vesicle_pools[np.isin(vesicles, non_attached_ids)] = 2
 
+    # Create a napari viewer, add the tomogram data and the segmentation results.
     viewer = napari.Viewer()
     viewer.add_image(tomogram)
     for name, segmentation in segmentations.items():
@@ -105,9 +116,16 @@ def visualize_results(tomogram, segmentations, vesicle_attributes):
     napari.run()
 
 
-# TODO compute the vesicle radii and other features and then save the attributes.
 def save_analysis(segmentations, vesicle_attributes, save_path):
-    pass
+    # Here, we compute the radii and centroid positions of the vesicles,
+    # add them to the vesicle attributes and then save all vesicle attributes to
+    # an excel table. You can use this table for evaluation of the analysis.
+    vesicles = segmentations["vesicles"]
+    coordinates, radii = convert_segmentation_to_spheres(vesicles, radius_factor=0.7)
+    vesicle_attributes["radius"] = radii
+    for ax_id, ax_name in enumerate("zyx"):
+        vesicle_attributes[f"center-{ax_name}"] = coordinates[:, ax_id]
+    vesicle_attributes.to_excel(save_path, index=False)
 
 
 def main():
@@ -119,16 +137,7 @@ def main():
     tomogram, voxel_size = read_mrc(mrc_path)
 
     # Segment synaptic vesicles, the active zone, and the synaptic compartment.
-    # segmentations = segment_structures(tomogram, voxel_size)
-
-    # Load saved segmentations for development.
-    import h5py
-    segmentations = {}
-    with h5py.File("seg.h5", "r") as f:
-        for name, ds in f.items():
-            # f.create_dataset(name, data=seg, compression="gzip")
-            seg = ds[:]
-            segmentations[name] = seg
+    segmentations = segment_structures(tomogram, voxel_size)
 
     # Post-process the segmentations, to find the presynaptic terminal,
     # filter out vesicles not in the terminal, and to 'snap' the AZ to the presynaptic boundary.
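
Note: the pool assignment added above only needs pandas. The following minimal sketch, with made-up distance values in nm, shows how the 'docked' / 'non-attached' labels end up as a new column of the attribute table:

import pandas as pd

# Toy vesicle attributes; the real values come from measure_distances.
vesicle_attributes = pd.DataFrame({"vesicle_id": [1, 2, 3], "distance": [1.2, 5.0, 0.8]})

docked_vesicle_distance = 2  # nm
vesicle_attributes["pool"] = vesicle_attributes["distance"].apply(
    lambda x: "docked" if x < docked_vesicle_distance else "non-attached"
)
print(vesicle_attributes)  # vesicles 1 and 3 are 'docked', vesicle 2 is 'non-attached'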

examples/domain_adaptation.py

Lines changed: 21 additions & 15 deletions
@@ -4,35 +4,41 @@
 a different electron tomogram with different specimen and sample preparation.
 You don't need any annotations in the new domain to run this script.
 
-You can download example data for this script from:
-- Adaptation to 2d TEM data: TODO zenodo link
-- Adaptation to different tomography data: TODO zenodo link
+We use data from the SynapseNet publication for this example:
+- Adaptation to 2d TEM data: https://doi.org/10.5281/zenodo.14236381
+- Adaptation to different tomography data (3d data): https://doi.org/10.5281/zenodo.14232606
+
+It is of course possible to adapt it to your own data.
 """
 
 import os
 from glob import glob
 
 from sklearn.model_selection import train_test_split
+from synapse_net.inference.inference import get_model_path
+from synapse_net.sample_data import download_data_from_zenodo
 from synapse_net.training import mean_teacher_adaptation
-from synapse_net.tools.util import get_model_path
 
 
 def main():
     # Choose whether to adapt the model to 2D or to 3D data.
-    train_2d_model = True
-
-    # TODO adjust to zenodo downloads
-    # These are the data folders for the example data downloaded from zenodo.
-    # Update these paths to apply the script to your own data.
-    # Check out the example data to see the data format for training.
-    data_root_folder_2d = "./data/2d_tem/train_unlabeled"
-    data_root_folder_3d = "./data/..."
+    train_2d_model = False
 
-    # Choose the correct data folder depending on 2d/3d training.
-    data_root_folder = data_root_folder_2d if train_2d_model else data_root_folder_3d
+    # Download the training data from zenodo.
+    # You have to replace this if you want to train on your own data.
+    # The training data should be stored in an hdf5 file per tomogram,
+    # with tomogram data stored in the internal dataset 'raw'.
+    if train_2d_model:
+        data_root = "./data/2d_tem"
+        download_data_from_zenodo(data_root, "2d_tem")
+        train_root_folder = os.path.join(data_root, "train_unlabeled")
+    else:
+        data_root = "./data/inner_ear_ribbon_synapse"
+        download_data_from_zenodo(data_root, "inner_ear_ribbon_synapse")
+        train_root_folder = data_root
 
     # Get all files with ending .h5 in the training folder.
-    files = sorted(glob(os.path.join(data_root_folder, "**", "*.h5"), recursive=True))
+    files = sorted(glob(os.path.join(train_root_folder, "**", "*.h5"), recursive=True))
 
     # Create a train / val split.
     train_ratio = 0.85
examples/network_training.py

Lines changed: 14 additions & 8 deletions
@@ -5,30 +5,36 @@
 to adapt an already trained network to your data without the need for
 additional annotations then check out `domain_adaptation.py`.
 
-You can download example data for this script from:
-TODO zenodo link to Single-Ax / Chemical Fix data.
+We will use the data from our manuscript here:
+https://doi.org/10.5281/zenodo.14330011
+
+You can also use your own data, if you prepare it in the same format.
 """
 import os
 from glob import glob
 
 from sklearn.model_selection import train_test_split
+from synapse_net.sample_data import download_data_from_zenodo
 from synapse_net.training import supervised_training
 
 
 def main():
-    # This is the folder that contains your training data.
-    # The example was designed so that it runs for the sample data downloaded to './data'.
-    # If you want to train on your own data than change this filepath accordingly.
-    # TODO update to match zenodo download
-    data_root_folder = "./data/vesicles/train"
+    # Download the training data from zenodo.
+    # You have to replace this if you want to train on your own data.
+    # The training data should be stored in an hdf5 file per tomogram,
+    # with tomogram data stored in the internal dataset 'raw'
+    # and the vesicle annotations stored in the internal dataset 'labels/vesicles'.
+    data_root = "./data/training_data"
+    download_data_from_zenodo(data_root, "training_data")
+    train_root_folder = os.path.join(data_root, "vesicles/train")
 
     # The training data should be saved as .h5 files, with:
     # an internal dataset called 'raw' that contains the image data
     # and another dataset that contains the training annotations.
     label_key = "labels/vesicles"
 
     # Get all files with the ending .h5 in the training folder.
-    files = sorted(glob(os.path.join(data_root_folder, "**", "*.h5"), recursive=True))
+    files = sorted(glob(os.path.join(train_root_folder, "**", "*.h5"), recursive=True))
 
     # Create a train / val split.
     train_ratio = 0.85
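
For supervised training the files additionally need the vesicle annotations under 'labels/vesicles'. A minimal sketch, again with made-up arrays and file name:

import h5py
import numpy as np

# Hypothetical tomogram and vesicle instance labels with matching shapes (0 = background).
tomogram = np.random.rand(64, 256, 256).astype("float32")
vesicle_labels = np.zeros((64, 256, 256), dtype="uint32")

with h5py.File("my_training_tomogram.h5", "w") as f:
    f.create_dataset("raw", data=tomogram, compression="gzip")
    f.create_dataset("labels/vesicles", data=vesicle_labels, compression="gzip")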

scripts/prepare_zenodo_uploads.py

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ def _export_az(train_root, test_tomos, name):
 
     for tomo in tqdm(tomograms):
         fname = os.path.basename(tomo)
-        if tomo in test_tomos:
+        if fname in test_tomos:
             out_path = os.path.join(test_out, fname)
         else:
             out_path = os.path.join(train_out, fname)
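
The one-line fix compares the file name rather than the full path against test_tomos. A small sketch with hypothetical values, assuming test_tomos holds plain file names, illustrates why the old check never matched:

import os

test_tomos = ["tomo_01.mrc"]           # hypothetical list of file names
tomo = "/data/tomograms/tomo_01.mrc"   # iteration yields full paths

fname = os.path.basename(tomo)
print(tomo in test_tomos)   # False: a full path is never equal to a bare file name
print(fname in test_tomos)  # True: the basename matches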

synapse_net/sample_data.py

Lines changed: 30 additions & 0 deletions
@@ -1,4 +1,5 @@
 import os
+import tempfile
 import pooch
 
 from .file_utils import read_mrc, get_cache_dir
@@ -52,3 +53,32 @@ def sample_data_tem_2d():
 
 def sample_data_tem_tomo():
     return _sample_data("tem_tomo")
+
+
+def download_data_from_zenodo(path: str, name: str):
+    """Download data uploaded for the SynapseNet manuscript from zenodo.
+
+    Args:
+        path: The path where the downloaded data will be saved.
+        name: The name of the zenodo dataset.
+    """
+    from torch_em.data.datasets.util import download_source, unzip
+
+    urls = {
+        "2d_tem": "https://zenodo.org/records/14236382/files/tem_2d.zip?download=1",
+        "inner_ear_ribbon_synapse": "https://zenodo.org/records/14232607/files/inner-ear-ribbon-synapse-tomgrams.zip?download=1",  # noqa
+        "training_data": "https://zenodo.org/records/14330011/files/synapse-net.zip?download=1"
+    }
+    assert name in urls
+    url = urls[name]
+
+    # May need to adapt this for other datasets.
+    # Check if the download already exists.
+    dl_path = path
+    if os.path.exists(dl_path):
+        return
+
+    with tempfile.TemporaryDirectory() as tmp:
+        tmp_path = os.path.join(tmp, f"{name}.zip")
+        download_source(tmp_path, url, download=True, checksum=None)
+        unzip(tmp_path, path, remove=False)
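
Example usage of the new helper; the dataset name must be one of the keys of the urls dict above and the target folder is arbitrary:

from synapse_net.sample_data import download_data_from_zenodo

# Download the 2d TEM data into a local folder (skipped if the folder already exists).
download_data_from_zenodo("./data/2d_tem", "2d_tem")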
