Sm dev #62

Open. Wants to merge 47 commits into base: main.

Changes from 23 commits (47 commits in total):
e2c4f4b  AZ segmentation (SarahMuth, Oct 23, 2024)
a0f713f  updates (SarahMuth, Oct 28, 2024)
94f9121  Merge branch 'main' of https://github.com/computational-cell-analytic… (SarahMuth, Oct 28, 2024)
ac1ac00  update 2D DA (SarahMuth, Oct 28, 2024)
37de75d  Merge branch 'main' of https://github.com/computational-cell-analytic… (SarahMuth, Oct 29, 2024)
61c57fa  small updates, compartment segmentation (SarahMuth, Nov 7, 2024)
40e965e  Implement code for first analysis (constantinpape, Nov 7, 2024)
7be9ee8  2D seg with mask (SarahMuth, Nov 11, 2024)
b1bef7e  Merge branch 'analysis' of https://github.com/computational-cell-anal… (SarahMuth, Nov 11, 2024)
f85e445  spatial distribution analysis (SarahMuth, Nov 11, 2024)
8ef16bc  intersection between compartment boundary and AZ segmentaiton (SarahMuth, Nov 12, 2024)
e625ef7  Merge branch 'main' of https://github.com/computational-cell-analytic… (SarahMuth, Nov 12, 2024)
09f6c84  Update compartment postprocessing (constantinpape, Nov 12, 2024)
d7dbb39  Merge branch 'more-comp-seg-updates' of https://github.com/computatio… (SarahMuth, Nov 12, 2024)
f893d23  updating data analysis on smaller details (SarahMuth, Nov 13, 2024)
08c56b9  minor updates data analysis (SarahMuth, Nov 13, 2024)
36d834f  Implement inner ear analysis WIP (constantinpape, Nov 14, 2024)
49d1b7c  calculation of AZ area (SarahMuth, Nov 14, 2024)
8a515d1  corrected radius factor (SarahMuth, Nov 14, 2024)
0f40d3c  Update inner ear analysis (constantinpape, Nov 15, 2024)
ad4741b  Update inner ear analysis (constantinpape, Nov 17, 2024)
305a80b  Updates to inner ear training and eval (constantinpape, Nov 17, 2024)
903e59e  Update inner ear analysis (constantinpape, Nov 18, 2024)
b1449d2  minor changes (SarahMuth, Nov 19, 2024)
0b7884d  Merge branch 'main' of https://github.com/computational-cell-analytic… (constantinpape, Nov 19, 2024)
186c92d  Update inner ear analysis scripts (constantinpape, Nov 20, 2024)
186df5b  Merge branch 'more-inner-ear-analysis' of https://github.com/computat… (constantinpape, Nov 20, 2024)
2ccf340  Add script to extract vesicle diameters for inner ear data (constantinpape, Nov 20, 2024)
5feff6a  Update active zone analysis for SNAP/MUNC data (constantinpape, Nov 21, 2024)
9b8c7a2  Add more inner ear analysis code (constantinpape, Nov 21, 2024)
db89b44  evaluation of AZ seg (SarahMuth, Nov 23, 2024)
51165a5  Fix issues with the segmentation export to IMOD (constantinpape, Nov 23, 2024)
aa5d78e  clean up (SarahMuth, Nov 23, 2024)
20e429b  clean up (SarahMuth, Nov 23, 2024)
19f618e  clean up (SarahMuth, Nov 23, 2024)
cb693b1  Update data summaries (constantinpape, Nov 24, 2024)
a0c31a8  Fix issue in data aggregation (constantinpape, Nov 24, 2024)
93a66c1  Update data summary (constantinpape, Nov 24, 2024)
e0dfda6  Merge branch 'main' into more-inner-ear-analysis (constantinpape, Nov 24, 2024)
59a38db  Update all measurements for the inner ear analysis (constantinpape, Nov 24, 2024)
9728951  Update vesicle diameter analysis (constantinpape, Nov 24, 2024)
84d3ec7  Merge branch 'more-inner-ear-analysis' of https://github.com/computat… (SarahMuth, Nov 25, 2024)
622da1e  update AZ evaluation (SarahMuth, Nov 27, 2024)
686b018  erosion dilation filtering of AZ (SarahMuth, Nov 28, 2024)
6b54e4a  stuff for revision (SarahMuth, Mar 31, 2025)
7d675ab  everything after 1st revision relating to training, inference, postpr… (SarahMuth, May 22, 2025)
f052b98  minor things (SarahMuth, May 25, 2025)
7 changes: 6 additions & 1 deletion .gitignore
@@ -8,6 +8,11 @@ models/*/
run_sbatch.sbatch
slurm/
scripts/cooper/evaluation_results/
analysis_results/
scripts/cooper/training/copy_testset.py
scripts/rizzoli/upsample_data.py
scripts/cooper/training/find_rec_testset.py
scripts/cooper/training/find_rec_testset.py
scripts/rizzoli/combine_2D_slices.py
scripts/rizzoli/combine_2D_slices_raw.py
scripts/cooper/remove_h5key.py
scripts/cooper/analysis/calc_AZ_area.py
173 changes: 173 additions & 0 deletions scripts/cooper/AZ_segmentation_h5.py
@@ -0,0 +1,173 @@
import argparse
import h5py
import os
from pathlib import Path

from tqdm import tqdm
from elf.io import open_file

from synaptic_reconstruction.inference.AZ import segment_AZ
from synaptic_reconstruction.inference.util import parse_tiling

def _require_output_folders(output_folder):
    seg_output = output_folder
    os.makedirs(seg_output, exist_ok=True)
    return seg_output


def get_volume(input_path):
    with open_file(input_path, "r") as f:
        # Try to automatically derive the key that holds the raw data.
        keys = list(f.keys())
        if len(keys) == 1:
            key = keys[0]
        elif "data" in keys:
            key = "data"
        elif "raw" in keys:
            key = "raw"
        else:
            raise ValueError(f"Could not derive the raw data key for {input_path}; found the keys {keys}.")

        input_volume = f[key][:]
    return input_volume

def run_AZ_segmentation(input_path, output_path, model_path, mask_path, mask_key, tile_shape, halo, key_label, compartment_seg):
    tiling = parse_tiling(tile_shape, halo)
    print(f"Using tiling {tiling}")
    input_volume = get_volume(input_path)

    # Check if we have a mask that restricts the segmentation.
    if mask_path is not None:
        with open_file(mask_path, "r") as f:
            mask = f[mask_key][:]
    else:
        mask = None

    # Check if the intersection with the compartment segmentation should be computed.
    if compartment_seg is None:
        foreground = segment_AZ(input_volume=input_volume, model_path=model_path, verbose=False, tiling=tiling, mask=mask)
        intersection = None
    else:
        with open_file(compartment_seg, "r") as f:
            compartment = f["/labels/compartment"][:]
        foreground, intersection = segment_AZ(
            input_volume=input_volume, model_path=model_path, verbose=False, tiling=tiling, mask=mask, compartment=compartment
        )

    seg_output = _require_output_folders(output_path)
    file_name = Path(input_path).stem
    seg_path = os.path.join(seg_output, f"{file_name}.h5")
    os.makedirs(Path(seg_path).parent, exist_ok=True)

    print(f"Saving results in {seg_path}")
    with h5py.File(seg_path, "a") as f:
        if "raw" in f:
            print("The raw image is already saved.")
        else:
            f.create_dataset("raw", data=input_volume, compression="gzip")

        key = f"AZ/segment_from_{key_label}"
        if key in f:
            print("Skipping", input_path, "because", key, "exists")
        else:
            f.create_dataset(key, data=foreground, compression="gzip")

        if mask is not None:
            if mask_key in f:
                print("The mask is already saved.")
            else:
                f.create_dataset(mask_key, data=mask, compression="gzip")

        if intersection is not None:
            intersection_key = "AZ/compartment_AZ_intersection"
            if intersection_key in f:
                print("The intersection is already saved.")
            else:
                f.create_dataset(intersection_key, data=intersection, compression="gzip")
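
For reference, the datasets written by run_AZ_segmentation can be inspected after a run. A minimal sketch, assuming the default key_label and a hypothetical output file name:

import h5py

# Hypothetical output file; the name mirrors the input tomogram's stem.
with h5py.File("az_segmentations/tomo_01.h5", "r") as f:
    f.visit(print)  # e.g. raw, AZ/segment_from_combined_vesicles, AZ/compartment_AZ_intersection
    az_seg = f["AZ/segment_from_combined_vesicles"][:]
print(az_seg.shape, az_seg.dtype)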




def segment_folder(args):
    input_files = []
    for root, dirs, files in os.walk(args.input_path):
        input_files.extend(
            os.path.join(root, name) for name in files if name.endswith(args.data_ext)
        )
    print(input_files)
    pbar = tqdm(input_files, desc="Run segmentation")
    for input_path in pbar:
        filename = os.path.basename(input_path)

        # Derive the mask path for this tomogram, if a mask folder was given.
        mask_path = None
        if args.mask_path is not None:
            mask_path = os.path.join(args.mask_path, filename)
            if not os.path.exists(mask_path):
                print(f"Mask file not found for {input_path}")
                mask_path = None

        # Derive the compartment segmentation path, if a compartment folder was given.
        compartment_seg = None
        if args.compartment_seg is not None:
            compartment_seg = os.path.join(args.compartment_seg, os.path.splitext(filename)[0] + ".h5")
            if not os.path.exists(compartment_seg):
                print(f"Compartment file not found for {input_path}")
                compartment_seg = None

        run_AZ_segmentation(
            input_path, args.output_path, args.model_path, mask_path, args.mask_key,
            args.tile_shape, args.halo, args.key_label, compartment_seg
        )

def main():
    parser = argparse.ArgumentParser(description="Segment the active zone (AZ) in EM tomograms.")
    parser.add_argument(
        "--input_path", "-i", required=True,
        help="The filepath to the mrc file or the directory containing the tomogram data."
    )
    parser.add_argument(
        "--output_path", "-o", required=True,
        help="The filepath to the directory where the segmentations will be saved."
    )
    parser.add_argument(
        "--model_path", "-m", required=True, help="The filepath to the AZ model."
    )
    parser.add_argument(
        "--mask_path", help="The filepath to an h5 file with a mask that will be used to restrict the segmentation. Has to be used in combination with mask_key."
    )
    parser.add_argument(
        "--mask_key", help="The key that holds the mask segmentation."
    )
    parser.add_argument(
        "--tile_shape", type=int, nargs=3,
        help="The tile shape for prediction. Lower the tile shape if GPU memory is insufficient."
    )
    parser.add_argument(
        "--halo", type=int, nargs=3,
        help="The halo for prediction. Increase the halo to minimize boundary artifacts."
    )
    parser.add_argument(
        "--key_label", "-k", default="combined_vesicles",
        help="The key name under which the segmentation is saved in the h5 output."
    )
    parser.add_argument(
        "--data_ext", "-d", default=".h5",
        help="The file extension of the data to be segmented. The default is .h5."
    )
    parser.add_argument(
        "--compartment_seg", "-c",
        help="Path to the compartment segmentation. "
        "If a compartment segmentation was computed before, this adds a key to the output file that stores the intersection between the compartment boundary and the AZ. "
        "You may need to adjust the key under which the compartment segmentation is stored."
    )
    args = parser.parse_args()

    input_ = args.input_path

    if os.path.isdir(input_):
        segment_folder(args)
    else:
        run_AZ_segmentation(
            input_, args.output_path, args.model_path, args.mask_path, args.mask_key,
            args.tile_shape, args.halo, args.key_label, args.compartment_seg
        )

    print("Finished segmenting!")


if __name__ == "__main__":
    main()
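
For reference, a typical invocation of the script might look like the following; all paths and the tile/halo values are hypothetical and need to be adapted:

python scripts/cooper/AZ_segmentation_h5.py \
    -i /data/tomograms \
    -o /data/az_segmentations \
    -m /models/AZ_model.pt \
    --tile_shape 128 256 256 --halo 32 32 32 \
    -c /data/compartment_segmentations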
174 changes: 174 additions & 0 deletions scripts/cooper/analysis/run_size_analysis.py
@@ -0,0 +1,174 @@
import os
from glob import glob

import numpy as np
import pandas as pd
import h5py
from tqdm import tqdm
from synaptic_reconstruction.imod.to_imod import convert_segmentation_to_spheres

DATA_ROOT = "/mnt/lustre-emmy-hdd/usr/u12095/synaptic_reconstruction/segmentation/for_spatial_distribution_analysis/final_Imig2014_seg/" # noqa
PREDICTION_ROOT = "/mnt/lustre-emmy-hdd/usr/u12095/synaptic_reconstruction/segmentation/for_spatial_distribution_analysis/final_Imig2014_seg/" # noqa
RESULT_FOLDER = "./analysis_results/AZ_filtered_autoComp"

def get_compartment_with_max_overlap(compartments, vesicles):
    """Given 3D numpy arrays of compartments and vesicles, return a binary mask
    of the compartment that has the largest overlap with the vesicles,
    measured as the number of overlapping voxels.

    Parameters:
        compartments (numpy.ndarray): 3D array of compartment labels.
        vesicles (numpy.ndarray): 3D array of vesicle labels or a binary mask.

    Returns:
        numpy.ndarray: Binary mask of the compartment with the most overlap with vesicles.
    """
    unique_compartments = np.unique(compartments)
    unique_compartments = unique_compartments[unique_compartments != 0]

    max_overlap_count = 0
    best_compartment = None
    vesicle_mask = vesicles > 0

    # Iterate over the compartments and compute the overlap with the vesicles.
    for compartment_label in unique_compartments:
        compartment_mask = compartments == compartment_label
        overlap_count = np.sum(np.logical_and(compartment_mask, vesicle_mask))

        # Track the compartment with the most overlapping voxels.
        if overlap_count > max_overlap_count:
            max_overlap_count = overlap_count
            best_compartment = compartment_label

    final_mask = compartments == best_compartment
    return final_mask
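
A minimal sketch of the helper's behavior on toy arrays (shapes and labels invented for illustration):

import numpy as np

compartments = np.zeros((1, 4, 4), dtype=int)
compartments[0, :2] = 1  # compartment 1 fills the top half
compartments[0, 2:] = 2  # compartment 2 fills the bottom half

vesicles = np.zeros((1, 4, 4), dtype=int)
vesicles[0, 2:, :2] = 1  # all vesicle voxels fall inside compartment 2

mask = get_compartment_with_max_overlap(compartments, vesicles)
assert mask.sum() == 8  # the full extent of compartment 2 is returned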

# We compute the sizes for all vesicles in the MANUALLY ANNOTATED compartment masks.
# We use the same logic for the size computation as for the vesicle extraction to IMOD.
# The radius correction factor is not needed here, so it is set to 1.
# The number of vesicles is computed as the length of the size list.
def compute_sizes_for_all_tomorams_manComp():
    os.makedirs(RESULT_FOLDER, exist_ok=True)

    resolution = (1.554,) * 3  # Adjust per dataset: 1.554 for Munc and SNAP, 0.8681 for the 04 dataset.
    radius_factor = 1
    estimate_radius_2d = True
    dataset_results = {}

    tomograms = sorted(glob(os.path.join(PREDICTION_ROOT, "**/*.h5"), recursive=True))
    for tomo in tqdm(tomograms):
        ds_name, fname = os.path.split(tomo)
        ds_name = os.path.split(ds_name)[1]
        fname = os.path.splitext(fname)[0]

        # Determine if the tomogram is 'CTRL' or 'DKO'.
        category = "CTRL" if "CTRL" in fname else "DKO"

        if ds_name not in dataset_results:
            dataset_results[ds_name] = {"CTRL": {}, "DKO": {}}

        if fname in dataset_results[ds_name][category]:
            continue

        # Load the vesicle segmentation from the predictions.
        with h5py.File(tomo, "r") as f:
            segmentation = f["/vesicles/segment_from_combined_vesicles"][:]

        input_path = os.path.join(DATA_ROOT, ds_name, f"{fname}.h5")
        assert os.path.exists(input_path), input_path

        # Load the manually annotated compartment mask and restrict the segmentation to it.
        with h5py.File(input_path, "r") as f:
            mask = f["labels/compartment"][:]
        segmentation[mask == 0] = 0

        _, sizes = convert_segmentation_to_spheres(
            segmentation, resolution=resolution, radius_factor=radius_factor, estimate_radius_2d=estimate_radius_2d
        )
        dataset_results[ds_name][category][fname] = sizes

    # Save each dataset's results in separate CSV files for the CTRL and DKO tomograms.
    for ds_name, categories in dataset_results.items():
        for category, tomogram_data in categories.items():
            sorted_data = dict(sorted(tomogram_data.items()))  # Sort by tomogram name.
            result_df = pd.DataFrame.from_dict(sorted_data, orient="index").transpose()
            output_path = os.path.join(RESULT_FOLDER, f"size_analysis_for_{ds_name}_{category}_rf1.csv")
            result_df.to_csv(output_path, index=False)

# We compute the sizes for all vesicles in the AUTOMATICALLY SEGMENTED compartment masks.
# We use the same logic for the size computation as for the vesicle extraction to IMOD.
# The radius correction factor is not needed here, so it is set to 1.
# The number of vesicles is computed as the length of the size list.
def compute_sizes_for_all_tomorams_autoComp():
    os.makedirs(RESULT_FOLDER, exist_ok=True)

    resolution = (1.554,) * 3  # Adjust per dataset: 1.554 for Munc and SNAP, 0.8681 for the 04 dataset.
    radius_factor = 1
    estimate_radius_2d = True
    dataset_results = {}

    tomograms = sorted(glob(os.path.join(PREDICTION_ROOT, "**/*.h5"), recursive=True))
    for tomo in tqdm(tomograms):
        ds_name, fname = os.path.split(tomo)
        ds_name = os.path.split(ds_name)[1]
        fname = os.path.splitext(fname)[0]

        # Determine if the tomogram is 'CTRL' or 'DKO'.
        category = "CTRL" if "CTRL" in fname else "DKO"

        if ds_name not in dataset_results:
            dataset_results[ds_name] = {"CTRL": {}, "DKO": {}}

        if fname in dataset_results[ds_name][category]:
            continue

        # Load the vesicle segmentation from the predictions.
        with h5py.File(tomo, "r") as f:
            segmentation = f["/vesicles/segment_from_combined_vesicles"][:]

        input_path = os.path.join(DATA_ROOT, ds_name, f"{fname}.h5")
        assert os.path.exists(input_path), input_path

        # Load the automatic compartment segmentation and keep the compartment
        # with the largest vesicle overlap.
        with h5py.File(input_path, "r") as f:
            compartments = f["/compartments/segment_from_3Dmodel_v2"][:]
        mask = get_compartment_with_max_overlap(compartments, segmentation)

        # If the mask would remove more than half of the segmented voxels
        # (an approximation, since this counts voxels rather than vesicle labels),
        # we assume the compartment segmentation failed and don't apply the mask.
        # See the standalone sketch of this check after the function.
        if np.sum(segmentation[mask == 0] > 0) > (0.5 * np.sum(segmentation > 0)):
            print(f"using no mask for {tomo}")
        else:
            segmentation[mask == 0] = 0

        _, sizes = convert_segmentation_to_spheres(
            segmentation, resolution=resolution, radius_factor=radius_factor, estimate_radius_2d=estimate_radius_2d
        )
        dataset_results[ds_name][category][fname] = sizes

    # Save each dataset's results in separate CSV files for the CTRL and DKO tomograms.
    for ds_name, categories in dataset_results.items():
        for category, tomogram_data in categories.items():
            sorted_data = dict(sorted(tomogram_data.items()))  # Sort by tomogram name.
            result_df = pd.DataFrame.from_dict(sorted_data, orient="index").transpose()
            output_path = os.path.join(RESULT_FOLDER, f"size_analysis_for_{ds_name}_{category}_rf1.csv")
            result_df.to_csv(output_path, index=False)
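
The 50% check above compares voxel counts, not vesicle labels. A standalone sketch of the same heuristic on a toy array (values invented for illustration):

import numpy as np

segmentation = np.array([[1, 1, 2, 2, 0, 0]])
mask = np.array([[1, 0, 0, 0, 0, 0]], dtype=bool)

removed = np.sum(segmentation[mask == 0] > 0)  # 3 segmented voxels lie outside the mask
total = np.sum(segmentation > 0)               # 4 segmented voxels in total
apply_mask = removed <= 0.5 * total            # False here, so the mask would be rejected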

def main():
    # compute_sizes_for_all_tomorams_manComp()
    compute_sizes_for_all_tomorams_autoComp()


if __name__ == "__main__":
    main()