Commit 997d804

Update ground-truth annotation scripts
1 parent c590869 commit 997d804

5 files changed: +294 −33 lines

scripts/cooper/ground_truth/compartments/preprocess.py

Lines changed: 51 additions & 3 deletions
@@ -6,6 +6,7 @@
 from skimage.transform import rescale

 ROOT = "/mnt/lustre-emmy-hdd/projects/nim00007/data/synaptic-reconstruction/cooper/original_imod_data/20240909_cp_datatransfer"  # noqa
+ROOT_CRYO = "/mnt/lustre-emmy-hdd/projects/nim00007/data/synaptic-reconstruction/fernandez-busnadiego/vesicle_gt/v1"  # noqa


 def preprocess_tomogram(dataset, tomogram):
@@ -44,6 +45,44 @@ def preprocess_tomogram(dataset, tomogram):
     )


+def preprocess_cryo_tomogram(fname):
+    scale = (0.5, 0.5, 0.5)
+
+    dataset = "cryo"
+    output_root = f"./output/{dataset}"
+    output_tomos = os.path.join(output_root, "tomograms")
+    output_embed = os.path.join(output_root, "embeddings")
+    os.makedirs(output_tomos, exist_ok=True)
+    os.makedirs(output_embed, exist_ok=True)
+
+    tomogram = os.path.join(ROOT_CRYO, f"{fname}.h5")
+
+    input_path = os.path.join(output_tomos, f"{fname}.h5")
+    output_path = os.path.join(output_embed, f"{fname}.zarr")
+    if os.path.exists(output_path):
+        return
+
+    # 'tomogram' already holds the full path to the input file.
+    with open_file(tomogram, "r") as f:
+        tomo = f["raw"][:]
+
+    print("Resizing tomogram ...")
+    tomo = rescale(tomo, scale, preserve_range=True).astype(tomo.dtype)
+
+    with open_file(input_path, "a") as f:
+        f.create_dataset("data", data=tomo, compression="gzip")
+
+    print("Precompute state ...")
+    precompute_state(
+        input_path=input_path,
+        output_path=output_path,
+        model_type="vit_b",
+        key="data",
+        checkpoint_path="./checkpoints/compartment_model/best.pt",
+        ndim=3,
+    )
+
+
 def preprocess_05():
     dataset = "05_stem750_sv_training"
     tomograms = sorted(glob(os.path.join(ROOT, dataset, "*.mrc")))
@@ -65,10 +104,19 @@ def preprocess_09():
         preprocess_tomogram(dataset, os.path.basename(tomo))


+def preprocess_cryo():
+    fname = "vesicles-33K-L1"
+    preprocess_cryo_tomogram(fname)
+
+    fname = "vesicles-64K-LAM12"
+    preprocess_cryo_tomogram(fname)
+
+
 def main():
-    preprocess_05()
-    preprocess_06()
-    preprocess_09()
+    # preprocess_05()
+    # preprocess_06()
+    # preprocess_09()
+    preprocess_cryo()


 if __name__ == "__main__":
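
Note: preprocess_cryo_tomogram halves the cryo tomograms along every axis before the embeddings are precomputed. A minimal sketch of what skimage.transform.rescale does with scale=(0.5, 0.5, 0.5) (toy shape, not from the commit):

import numpy as np
from skimage.transform import rescale

# Toy volume standing in for a tomogram (hypothetical shape).
vol = np.random.rand(100, 200, 200).astype("float32")

# preserve_range keeps the original intensity range instead of
# rescaling to [0, 1]; each axis is halved.
out = rescale(vol, (0.5, 0.5, 0.5), preserve_range=True).astype(vol.dtype)
print(out.shape)  # (50, 100, 100)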
Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
+import os
+from glob import glob
+
+import imageio.v3 as imageio
+
+from elf.io import open_file
+from skimage.transform import rescale
+from tqdm import tqdm
+
+ROOT = "/mnt/lustre-emmy-hdd/projects/nim00007/data/synaptic-reconstruction/cooper/original_imod_data/20240909_cp_datatransfer"  # noqa
+ROOT_CRYO = "/mnt/lustre-emmy-hdd/projects/nim00007/data/synaptic-reconstruction/fernandez-busnadiego/vesicle_gt/v1"  # noqa
+OUTPUT_IMAGES = "./output/images"
+
+
+def process_tomogram(tomo_path, scale, tomo_key="data"):
+    with open_file(tomo_path, "r") as f:
+        tomo = f[tomo_key][:]
+
+    os.makedirs(OUTPUT_IMAGES, exist_ok=True)
+    offset = len(glob(os.path.join(OUTPUT_IMAGES, "*.tif")))
+
+    print("Resizing tomogram ...")
+    tomo = rescale(tomo, scale, preserve_range=True).astype(tomo.dtype)
+
+    z_max = tomo.shape[0]
+    slices = [z_max // 2, z_max // 4, 3 * z_max // 4]
+
+    for i, z in enumerate(slices):
+        im = tomo[z]
+        idx = i + offset
+        out_path = os.path.join(OUTPUT_IMAGES, f"image_{idx:05}.tif")
+        imageio.imwrite(out_path, im, compression="zlib")
+
+
+def preprocess_05():
+    scale = (0.25, 0.25, 0.25)
+    dataset = "05_stem750_sv_training"
+    tomograms = sorted(glob(os.path.join(ROOT, dataset, "*.mrc")))
+    for tomo in tqdm(tomograms):
+        process_tomogram(tomo, scale)
+
+
+def preprocess_06():
+    scale = (0.25, 0.25, 0.25)
+    dataset = "06_hoi_wt_stem750_fm"
+    tomograms = sorted(glob(os.path.join(ROOT, dataset, "*.mrc")))
+    for tomo in tqdm(tomograms):
+        process_tomogram(tomo, scale)
+
+
+def preprocess_09():
+    scale = (0.25, 0.25, 0.25)
+    dataset = "09_stem750_66k"
+    tomograms = sorted(glob(os.path.join(ROOT, dataset, "*.mrc")))
+    for tomo in tqdm(tomograms):
+        process_tomogram(tomo, scale)
+
+
+def preprocess_cryo():
+    scale = (0.5, 0.5, 0.5)
+    tomograms = sorted(glob(os.path.join(ROOT_CRYO, "*.h5")))
+    for tomo in tqdm(tomograms):
+        process_tomogram(tomo, scale, tomo_key="raw")
+
+
+def precompute_state():
+    from micro_sam.util import get_sam_model
+    from micro_sam.precompute_state import _precompute_state_for_files
+
+    images = sorted(glob(os.path.join(OUTPUT_IMAGES, "*.tif")))
+    embedding_path = "./output/embeddings"
+
+    predictor = get_sam_model(model_type="vit_b", checkpoint_path="./checkpoints/compartment_model/best.pt")
+    precompute_amg_state = False
+    decoder = None
+
+    _precompute_state_for_files(
+        predictor, images, embedding_path, ndim=2, tile_shape=None, halo=None,
+        precompute_amg_state=precompute_amg_state, decoder=decoder,
+    )
+
+
+def main():
+    # preprocess_05()
+    # preprocess_06()
+    # preprocess_09()
+    # preprocess_cryo()
+    precompute_state()
+
+
+if __name__ == "__main__":
+    main()
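
The offset counter in process_tomogram numbers the exported slices continuously across datasets: each call counts the TIFFs already in OUTPUT_IMAGES and continues from there, so repeated runs append instead of overwriting. The naming logic in isolation (hypothetical directory contents):

import os
from glob import glob

OUTPUT_IMAGES = "./output/images"  # same layout as the script above
os.makedirs(OUTPUT_IMAGES, exist_ok=True)

# With three slices already exported (image_00000.tif .. image_00002.tif),
# offset is 3 and the next files become image_00003.tif, image_00004.tif, ...
offset = len(glob(os.path.join(OUTPUT_IMAGES, "*.tif")))
for i in range(3):
    print(os.path.join(OUTPUT_IMAGES, f"image_{i + offset:05}.tif"))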
Lines changed: 87 additions & 30 deletions
@@ -1,35 +1,92 @@
+import os
+from glob import glob
+
 import numpy as np
+
 from micro_sam.training import train_sam, default_sam_dataset
+from sklearn.model_selection import train_test_split
 from torch_em.data.sampler import MinInstanceSampler
 from torch_em.segmentation import get_data_loader

-data_path = "./segmentation.h5"
-
-with_segmentation_decoder = False
-patch_shape = [1, 462, 462]
-z_split = 400
-
-train_ds = default_sam_dataset(
-    raw_paths=data_path, raw_key="raw_downscaled",
-    label_paths=data_path, label_key="segmentation/compartments",
-    patch_shape=patch_shape, with_segmentation_decoder=with_segmentation_decoder,
-    sampler=MinInstanceSampler(2), rois=np.s_[z_split:, :, :],
-    n_samples=200,
-)
-train_loader = get_data_loader(train_ds, shuffle=True, batch_size=2)
-
-val_ds = default_sam_dataset(
-    raw_paths=data_path, raw_key="raw_downscaled",
-    label_paths=data_path, label_key="segmentation/compartments",
-    patch_shape=patch_shape, with_segmentation_decoder=with_segmentation_decoder,
-    sampler=MinInstanceSampler(2), rois=np.s_[:z_split, :, :],
-    is_train=False, n_samples=25,
-)
-val_loader = get_data_loader(val_ds, shuffle=True, batch_size=1)
-
-train_sam(
-    name="compartment_model", model_type="vit_b",
-    train_loader=train_loader, val_loader=val_loader,
-    n_epochs=100, n_objects_per_batch=10,
-    with_segmentation_decoder=with_segmentation_decoder,
-)
+
+
+def train_v1():
+    data_path = "./segmentation.h5"
+
+    with_segmentation_decoder = False
+    patch_shape = [1, 462, 462]
+    z_split = 400
+
+    train_ds = default_sam_dataset(
+        raw_paths=data_path, raw_key="raw_downscaled",
+        label_paths=data_path, label_key="segmentation/compartments",
+        patch_shape=patch_shape, with_segmentation_decoder=with_segmentation_decoder,
+        sampler=MinInstanceSampler(2), rois=np.s_[z_split:, :, :],
+        n_samples=200,
+    )
+    train_loader = get_data_loader(train_ds, shuffle=True, batch_size=2)
+
+    val_ds = default_sam_dataset(
+        raw_paths=data_path, raw_key="raw_downscaled",
+        label_paths=data_path, label_key="segmentation/compartments",
+        patch_shape=patch_shape, with_segmentation_decoder=with_segmentation_decoder,
+        sampler=MinInstanceSampler(2), rois=np.s_[:z_split, :, :],
+        is_train=False, n_samples=25,
+    )
+    val_loader = get_data_loader(val_ds, shuffle=True, batch_size=1)
+
+    train_sam(
+        name="compartment_model", model_type="vit_b",
+        train_loader=train_loader, val_loader=val_loader,
+        n_epochs=100, n_objects_per_batch=10,
+        with_segmentation_decoder=with_segmentation_decoder,
+    )
+
+
+def normalize_trafo(raw):
+    raw = raw.astype("float32")
+    raw -= raw.min()
+    raw /= raw.max()
+    raw *= 255
+    return raw
+
+
+def train_v2():
+    data_root = "./output/postprocessed_annotations"
+    paths = glob(os.path.join(data_root, "*.h5"))
+    train_paths, val_paths = train_test_split(paths, test_size=0.1, random_state=42)
+
+    with_segmentation_decoder = True
+    patch_shape = (462, 462)
+
+    train_ds = default_sam_dataset(
+        raw_paths=train_paths, raw_key="data",
+        label_paths=train_paths, label_key="labels/compartments",
+        patch_shape=patch_shape, with_segmentation_decoder=with_segmentation_decoder,
+        sampler=MinInstanceSampler(2), n_samples=250,
+        raw_transform=normalize_trafo,
+    )
+    train_loader = get_data_loader(train_ds, shuffle=True, batch_size=2)
+
+    val_ds = default_sam_dataset(
+        raw_paths=val_paths, raw_key="data",
+        label_paths=val_paths, label_key="labels/compartments",
+        patch_shape=patch_shape, with_segmentation_decoder=with_segmentation_decoder,
+        sampler=MinInstanceSampler(2), is_train=False, n_samples=25,
+        raw_transform=normalize_trafo,
+    )
+    val_loader = get_data_loader(val_ds, shuffle=True, batch_size=1)
+
+    train_sam(
+        name="compartment_model_v2", model_type="vit_b",
+        train_loader=train_loader, val_loader=val_loader,
+        n_epochs=100, n_objects_per_batch=10,
+        with_segmentation_decoder=with_segmentation_decoder,
+    )
+
+
+def main():
+    train_v2()
+
+
+if __name__ == "__main__":
+    main()
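
normalize_trafo performs a min-max normalization and stretches the result to [0, 255], presumably to match the 8-bit intensity range SAM-style models are typically trained on. A worked toy example of the transform (assumes non-constant input, since a constant array would divide by zero):

import numpy as np

def normalize_trafo(raw):
    raw = raw.astype("float32")
    raw -= raw.min()
    raw /= raw.max()
    raw *= 255
    return raw

x = np.array([100, 150, 300], dtype="int16")
# (x - 100) / 200 * 255 -> [0., 63.75, 255.]
print(normalize_trafo(x))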
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
+import os
+from glob import glob
+
+import h5py
+import numpy as np
+
+from tqdm import tqdm
+
+
+ROOT = "/mnt/lustre-emmy-hdd/projects/nim00007/data/synaptic-reconstruction/cooper/vesicles_processed_v2"
+SKIP_PREFIX = ("06", "08", "09")
+
+
+def main():
+    n_tomograms = {}
+    n_vesicles_imod = {}
+    n_vesicles_auto = {}
+    n_vesicles_total = {}
+
+    datasets = sorted(glob(os.path.join(ROOT, "*")))
+
+    for ds in tqdm(datasets):
+        ds_name = os.path.basename(ds)
+        if ds_name.startswith(SKIP_PREFIX):
+            continue
+        tomograms = glob(os.path.join(ds, "*.h5"))
+
+        n_ves_imod, n_ves_auto = 0, 0
+        for tomo in tomograms:
+            with h5py.File(tomo, "r") as f:
+                ves_imod = f["/labels/vesicles/imod"][:]
+                ves_auto = f["/labels/vesicles/additional_vesicles"][:]
+            n_ves_imod += (len(np.unique(ves_imod)) - 1)
+            n_ves_auto += (len(np.unique(ves_auto)) - 1)
+
+        n_tomograms[ds_name] = len(tomograms)
+        n_vesicles_imod[ds_name] = n_ves_imod
+        n_vesicles_auto[ds_name] = n_ves_auto
+        n_vesicles_total[ds_name] = n_ves_imod + n_ves_auto
+
+    print("Total number of tomograms:")
+    print(sum(n_tomograms.values()))
+
+    print("Total number of vesicles:")
+    print(sum(n_vesicles_total.values()))
+
+    # TODO analyze the number of vesicles from IMOD and auto annotation further for the methods tile
+
+
+if __name__ == "__main__":
+    main()
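
For the TODO at the end of the script, a possible starting point is to break the totals down per dataset into IMOD versus auto-annotated fractions. A hedged sketch using the dictionaries the script already builds (hypothetical continuation, not part of the commit):

for ds_name in sorted(n_vesicles_total):
    total = n_vesicles_total[ds_name]
    frac_imod = n_vesicles_imod[ds_name] / total if total else 0.0
    print(
        f"{ds_name}: {n_tomograms[ds_name]} tomograms, {total} vesicles "
        f"({frac_imod:.1%} IMOD, {1 - frac_imod:.1%} auto)"
    )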
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+import os
+from glob import glob
+
+ROOT = "/mnt/lustre-emmy-hdd/projects/nim00007/data/synaptic-reconstruction/moser/inner_ear_data"
+
+
+def main():
+    tomograms = glob(os.path.join(ROOT, "**/*.h5"), recursive=True)
+    print("Number of tomograms:")
+    print(len(tomograms))
+
+
+main()
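
The double-star pattern with recursive=True matches .h5 files at any depth below ROOT. A one-liner illustration on a hypothetical layout:

from glob import glob
# Matches e.g. ROOT/a.h5, ROOT/session1/b.h5, ROOT/session1/day2/c.h5.
tomograms = glob("/path/to/inner_ear_data/**/*.h5", recursive=True)
print(len(tomograms))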
