
Commit eabd2aa

Merge pull request #41 from computational-cell-analytics/sm-dev
Sm dev
2 parents fc7a3a1 + b58f1d3

11 files changed: +305 -15 lines


.gitignore

Lines changed: 3 additions & 1 deletion

@@ -8,4 +8,6 @@ models/*/
 run_sbatch.sbatch
 slurm/
 scripts/cooper/evaluation_results/
-scripts/cooper/training/copy_testset.py
+scripts/cooper/training/copy_testset.py
+scripts/rizzoli/upsample_data.py
+scripts/cooper/training/find_rec_testset.py

scripts/cooper/training/evaluation.py

Lines changed: 2 additions & 2 deletions

@@ -30,8 +30,8 @@ def evaluate_file(labels_path, vesicles_path, model_name, segment_key, anno_key)
     #get the labels and vesicles
     with h5py.File(labels_path) as label_file:
         labels = label_file["labels"]
-        vesicles = labels["vesicles"]
-        gt = vesicles[anno_key][:]
+        #vesicles = labels["vesicles"]
+        gt = labels[anno_key][:]

     with h5py.File(vesicles_path) as seg_file:
         segmentation = seg_file["vesicles"]
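This change assumes the ground-truth annotations now live directly under the "labels" group rather than in the nested "labels/vesicles" group. A minimal sketch of the two layouts, with a hypothetical file name and annotation key:

    import h5py

    with h5py.File("labels_file.h5", "r") as f:
        gt = f["labels"]["manual_annotations"][:]                # new layout: /labels/<anno_key>
        # gt = f["labels"]["vesicles"]["manual_annotations"][:]  # old layout: /labels/vesicles/<anno_key>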

scripts/cooper/training/train_AZ.py

Lines changed: 134 additions & 0 deletions

@@ -0,0 +1,134 @@ (new file; all lines added)

import os
from glob import glob
import argparse
import json

import torch_em
import torch

from sklearn.model_selection import train_test_split

from synaptic_reconstruction.training import supervised_training
from synaptic_reconstruction.training import semisupervised_training

TRAIN_ROOT = "/mnt/lustre-emmy-hdd/projects/nim00007/data/synaptic-reconstruction/cooper/exported_imod_objects"
OUTPUT_ROOT = "/mnt/lustre-emmy-hdd/usr/u12095/synaptic_reconstruction/training_AZ_v1"


def _require_train_val_test_split(datasets):
    train_ratio, val_ratio, test_ratio = 0.8, 0.1, 0.1
    if len(datasets) < 10:
        train_ratio, val_ratio, test_ratio = 0.5, 0.25, 0.25

    def _train_val_test_split(names):
        train, test = train_test_split(names, test_size=1 - train_ratio, shuffle=True)
        _ratio = test_ratio / (test_ratio + val_ratio)
        val, test = train_test_split(test, test_size=_ratio)
        return train, val, test

    for ds in datasets:
        print(ds)
        split_path = os.path.join(OUTPUT_ROOT, f"split-{ds}.json")
        if os.path.exists(split_path):
            continue

        file_paths = sorted(glob(os.path.join(TRAIN_ROOT, ds, "*.h5")))
        file_names = [os.path.basename(path) for path in file_paths]

        train, val, test = _train_val_test_split(file_names)

        with open(split_path, "w") as f:
            json.dump({"train": train, "val": val, "test": test}, f)


def _require_train_val_split(datasets):
    train_ratio, val_ratio = 0.8, 0.2

    def _train_val_split(names):
        train, val = train_test_split(names, test_size=1 - train_ratio, shuffle=True)
        return train, val

    for ds in datasets:
        print(ds)
        split_path = os.path.join(OUTPUT_ROOT, f"split-{ds}.json")
        if os.path.exists(split_path):
            continue

        file_paths = sorted(glob(os.path.join(TRAIN_ROOT, ds, "*.h5")))
        file_names = [os.path.basename(path) for path in file_paths]

        train, val = _train_val_split(file_names)

        with open(split_path, "w") as f:
            json.dump({"train": train, "val": val}, f)


def get_paths(split, datasets, testset=True):
    if testset:
        _require_train_val_test_split(datasets)
    else:
        _require_train_val_split(datasets)

    paths = []
    for ds in datasets:
        split_path = os.path.join(OUTPUT_ROOT, f"split-{ds}.json")
        with open(split_path) as f:
            names = json.load(f)[split]
        ds_paths = [os.path.join(TRAIN_ROOT, ds, name) for name in names]
        assert all(os.path.exists(path) for path in ds_paths)
        paths.extend(ds_paths)

    return paths


def train(key, ignore_label=None, training_2D=False, testset=True):
    datasets = [
        "01_hoi_maus_2020_incomplete",
        "06_hoi_wt_stem750_fm",
        "12_chemical_fix_cryopreparation"
    ]
    train_paths = get_paths("train", datasets=datasets, testset=testset)
    val_paths = get_paths("val", datasets=datasets, testset=testset)

    print("Start training with:")
    print(len(train_paths), "tomograms for training")
    print(len(val_paths), "tomograms for validation")

    patch_shape = [48, 256, 256]
    model_name = "3D-AZ-model-v1"

    # check for 2D training
    if training_2D:
        patch_shape = [1, 256, 256]
        model_name = "2D-AZ-model-v1"

    batch_size = 4
    check = False

    supervised_training(
        name=model_name,
        train_paths=train_paths,
        val_paths=val_paths,
        label_key=f"/labels/{key}",
        patch_shape=patch_shape, batch_size=batch_size,
        sampler=torch_em.data.sampler.MinInstanceSampler(min_num_instances=1),
        n_samples_train=None, n_samples_val=25,
        check=check,
        save_root="/mnt/lustre-emmy-hdd/usr/u12095/synaptic_reconstruction/AZ_models",
        n_iterations=int(5e3),
        ignore_label=ignore_label,
        label_transform=torch_em.transform.label.labels_to_binary,
        out_channels=1,
    )


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-k", "--key", required=True, help="The label key used for training, e.g. the name of the AZ annotations.")
    parser.add_argument("-m", "--mask", type=int, default=None, help="Label ID that will be ignored during training.")
    parser.add_argument("-2D", "--training_2D", action='store_true', help="Train a 2D model instead of the default 3D model.")
    parser.add_argument("-t", "--testset", action='store_false', help="Pass this flag to skip creating a test split.")
    args = parser.parse_args()
    train(args.key, args.mask, args.training_2D, args.testset)


if __name__ == "__main__":
    main()
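The nested _train_val_test_split helper splits in two stages: first it carves out the train set, then it divides the remainder between validation and test. A standalone sketch of that arithmetic with hypothetical file names (fewer than 10 datasets, so the 0.5/0.25/0.25 ratios apply):

    from sklearn.model_selection import train_test_split

    names = [f"tomo_{i}.h5" for i in range(8)]
    train_ratio, val_ratio, test_ratio = 0.5, 0.25, 0.25

    # stage 1: carve out the train set
    train, rest = train_test_split(names, test_size=1 - train_ratio, shuffle=True)
    # stage 2: divide the remainder between val and test
    val, test = train_test_split(rest, test_size=test_ratio / (test_ratio + val_ratio))
    print(len(train), len(val), len(test))  # 4 2 2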

scripts/cooper/vesicle_segmentation_h5.py

Lines changed: 32 additions & 4 deletions

@@ -34,11 +34,19 @@ def get_volume(input_path):
         input_volume = f[key][:]
     return input_volume

-def run_vesicle_segmentation(input_path, output_path, model_path, tile_shape, halo, include_boundary, key_label):
+def run_vesicle_segmentation(input_path, output_path, model_path, mask_path, mask_key, tile_shape, halo, include_boundary, key_label):
     tiling = parse_tiling(tile_shape, halo)
     print(f"using tiling {tiling}")
     input = get_volume(input_path)
-    segmentation, prediction = segment_vesicles(input_volume=input, model_path=model_path, verbose=False, tiling=tiling, return_predictions=True, exclude_boundary=not include_boundary)
+
+    # check if we have a restricting mask for the segmentation
+    if mask_path is not None:
+        with open_file(mask_path, "r") as f:
+            mask = f[mask_key][:]
+    else:
+        mask = None
+
+    segmentation, prediction = segment_vesicles(input_volume=input, model_path=model_path, verbose=False, tiling=tiling, return_predictions=True, exclude_boundary=not include_boundary, mask=mask)
     foreground, boundaries = prediction[:2]

     seg_output = _require_output_folders(output_path)

@@ -63,6 +71,12 @@ def run_vesicle_segmentation(input_path, output_path, model_path, tile_shape, ha
         f.create_dataset(f"prediction_{key_label}/foreground", data = foreground, compression="gzip")
         f.create_dataset(f"prediction_{key_label}/boundaries", data = boundaries, compression="gzip")

+        if mask is not None:
+            if mask_key in f:
+                print("mask image already saved")
+            else:
+                f.create_dataset(mask_key, data=mask, compression="gzip")

@@ -75,7 +89,15 @@ def segment_folder(args):
     print(input_files)
     pbar = tqdm(input_files, desc="Run segmentation")
     for input_path in pbar:
-        run_vesicle_segmentation(input_path, args.output_path, args.model_path, args.tile_shape, args.halo, args.include_boundary, args.key_label)
+
+        filename = os.path.basename(input_path)
+        # os.path.join cannot raise here, so check for the mask file explicitly instead of using try/except
+        if args.mask_path is not None and os.path.exists(os.path.join(args.mask_path, filename)):
+            mask_path = os.path.join(args.mask_path, filename)
+        else:
+            print(f"Mask file not found for {input_path}")
+            mask_path = None
+
+        run_vesicle_segmentation(input_path, args.output_path, args.model_path, mask_path, args.mask_key, args.tile_shape, args.halo, args.include_boundary, args.key_label)

 def main():
     parser = argparse.ArgumentParser(description="Segment vesicles in EM tomograms.")

@@ -90,6 +112,12 @@ def main():
     parser.add_argument(
         "--model_path", "-m", required=True, help="The filepath to the vesicle model."
     )
+    parser.add_argument(
+        "--mask_path", help="The filepath to an h5 file with a mask that restricts the segmentation. Must be used in combination with --mask_key."
+    )
+    parser.add_argument(
+        "--mask_key", help="The key (internal dataset path) of the mask segmentation."
+    )
     parser.add_argument(
         "--tile_shape", type=int, nargs=3,
         help="The tile shape for prediction. Lower the tile shape if GPU memory is insufficient."
     )

@@ -113,7 +141,7 @@ def main():
     if os.path.isdir(input_):
         segment_folder(args)
     else:
-        run_vesicle_segmentation(input_, args.output_path, args.model_path, args.tile_shape, args.halo, args.include_boundary, args.key_label)
+        run_vesicle_segmentation(input_, args.output_path, args.model_path, args.mask_path, args.mask_key, args.tile_shape, args.halo, args.include_boundary, args.key_label)

     print("Finished segmenting!")
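In folder mode, masks are matched to input tomograms purely by file name, so each mask file must mirror its input's basename inside --mask_path. A short sketch under that assumption (hypothetical paths and key):

    import h5py

    # input tomogram: /data/tomograms/tomo-001.h5
    # matching mask:  /data/masks/tomo-001.h5
    with h5py.File("/data/masks/tomo-001.h5", "r") as f:
        mask = f["labels/compartment"][:]  # dataset addressed by --mask_key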

scripts/cryo/vesicles/train_domain_adaptation.py

Lines changed: 2 additions & 1 deletion

@@ -84,7 +84,7 @@ def vesicle_domain_adaptation(teacher_model, testset = True):

     #adjustable parameters
     patch_shape = [48, 256, 256]
-    model_name = "vesicle-DA-cryo-v1"
+    model_name = "vesicle-DA-cryo-v2"

     model_root = "/mnt/lustre-emmy-hdd/usr/u12095/synaptic_reconstruction/models_v2/checkpoints/"
     checkpoint_path = os.path.join(model_root, teacher_model)

@@ -98,6 +98,7 @@ def vesicle_domain_adaptation(teacher_model, testset = True):
         save_root="/mnt/lustre-emmy-hdd/usr/u12095/synaptic_reconstruction/DA_models",
         source_checkpoint=checkpoint_path,
         confidence_threshold=0.75,
+        n_iterations=int(5e4),
     )
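Both domain-adaptation scripts in this commit pass confidence_threshold=0.75 to the self-training. As a generic illustration of confidence filtering, not the torch_em implementation, pseudo-labels are only kept where the teacher is sufficiently certain either way:

    import torch

    def confidence_mask(teacher_probs: torch.Tensor, threshold: float = 0.75) -> torch.Tensor:
        # keep pixels predicted confidently as foreground or background
        return ((teacher_probs >= threshold) | (teacher_probs <= 1 - threshold)).float()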

scripts/inner_ear/training/train_domain_adaptation_vesicles.py

Lines changed: 2 additions & 1 deletion

@@ -119,7 +119,7 @@ def vesicle_domain_adaptation(teacher_model, testset = True):

     #adjustable parameters
     patch_shape = [48, 256, 256]
-    model_name = "vesicle-DA-inner_ear-v1"
+    model_name = "vesicle-DA-inner_ear-v2"

     model_root = "/mnt/lustre-emmy-hdd/usr/u12095/synaptic_reconstruction/models_v2/checkpoints/"
     checkpoint_path = os.path.join(model_root, teacher_model)

@@ -133,6 +133,7 @@ def vesicle_domain_adaptation(teacher_model, testset = True):
         save_root="/mnt/lustre-emmy-hdd/usr/u12095/synaptic_reconstruction/DA_models",
         source_checkpoint=checkpoint_path,
         confidence_threshold=0.75,
+        n_iterations=int(1e5),
     )

scripts/rizzoli/2D_vesicle_segmentation.py

Lines changed: 11 additions & 3 deletions

@@ -6,6 +6,7 @@
 from tqdm import tqdm
 import torch
 import torch_em
+import numpy as np

 from synaptic_reconstruction.inference.vesicles import segment_vesicles
 from synaptic_reconstruction.inference.util import parse_tiling

@@ -73,13 +74,18 @@ def run_vesicle_segmentation(input_path, output_path, model_path, tile_shape, ha

     def process_slices(input_volume):
         processed_slices = []
+        foreground = []
+        boundaries = []
         for z in range(input_volume.shape[0]):
             slice_ = input_volume[z, :, :]
-            segmented_slice = segment_vesicles(input_volume=slice_, model=model, verbose=False, tiling=tiling, exclude_boundary=not include_boundary)
+            segmented_slice, prediction_slice = segment_vesicles(input_volume=slice_, model=model, verbose=False, tiling=tiling, return_predictions=True, exclude_boundary=not include_boundary)
             processed_slices.append(segmented_slice)
-        return processed_slices
+            foreground_pred_slice, boundaries_pred_slice = prediction_slice[:2]
+            foreground.append(foreground_pred_slice)
+            boundaries.append(boundaries_pred_slice)
+        return processed_slices, foreground, boundaries

-    segmentation = process_slices(input)
+    segmentation, foreground, boundaries = process_slices(input)

     seg_output = _require_output_folders(output_path)
     file_name = Path(input_path).stem

@@ -100,6 +106,8 @@ def process_slices(input_volume):
             print("Skipping", input_path, "because", key, "exists")
         else:
             f.create_dataset(key, data=segmentation, compression="gzip")
+            f.create_dataset(f"prediction_{key_label}/foreground", data=foreground, compression="gzip")
+            f.create_dataset(f"prediction_{key_label}/boundaries", data=boundaries, compression="gzip")
scripts/rizzoli/evaluation_2D.py

Lines changed: 2 additions & 2 deletions

@@ -58,8 +58,8 @@ def evaluate_file(labels_path, vesicles_path, model_name, segment_key, anno_key)
     #get the labels and vesicles
     with h5py.File(labels_path) as label_file:
         labels = label_file["labels"]
-        vesicles = labels["vesicles"]
-        gt = vesicles[anno_key][:]
+        #vesicles = labels["vesicles"]
+        gt = labels[anno_key][:]

     with h5py.File(vesicles_path) as seg_file:
         segmentation = seg_file["vesicles"]
