Commit 38f7907

Add support for cryo-et-data (#89)
Add scripts for CryoET Data Portal submissions
1 parent 1f0387a commit 38f7907

8 files changed, +337 -0 lines changed

doc/start_page.md

Lines changed: 7 additions & 0 deletions
@@ -164,3 +164,10 @@ Domain adaptation is implemented in `synapse_net.training.domain_adaptation`. Yo

We also provide functionality for 'regular' neural network training. In this case, you have to provide data **and** manual annotations for the structure(s) you want to segment.
This functionality is implemented in `synapse_net.training.supervised_training`. You can find an example script that shows how to use it [here](https://github.com/computational-cell-analytics/synapse-net/blob/main/examples/network_training.py).

## Segmentation for the CryoET Data Portal

We have published segmentation results for tomograms of synapses stored in the [CryoET Data Portal](https://cryoetdataportal.czscience.com/). So far we have made the following depositions:
- [CZCDP-10330](https://cryoetdataportal.czscience.com/depositions/10330): Contains synaptic vesicle segmentations for over 50 tomograms of synaptosomes. The segmentations were made with a model that was domain-adapted to the synaptosome tomograms.

The scripts for the submissions can be found in [scripts/cryo/cryo-et-portal](https://github.com/computational-cell-analytics/synapse-net/tree/main/scripts/cryo/cryo-et-portal).
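
For reference, a minimal sketch of how a published segmentation could be loaded with the `read_ome_zarr` helper added in this commit. The S3 URI is a hypothetical placeholder, not an actual deposition path:

# Minimal sketch: load one of the published OME-Zarr segmentations from S3.
# The URI below is a placeholder; substitute the path of an actual deposition file.
import s3fs
from synapse_net.file_utils import read_ome_zarr

fs = s3fs.S3FileSystem(anon=True)
seg, voxel_size = read_ome_zarr("s3://<bucket>/<deposition-path>/segmentation.ome.zarr", fs=fs)
print(seg.shape, voxel_size)  # The voxel size is returned in nanometer.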
Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
credentials.portal
sync_with_s3.sh
segmentations/
upload_CZCDP-10330/

scripts/cryo/cryo-et-portal/README.md

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
# Segmentation for the CryoET Data Portal

Scripts to prepare submissions for the [CryoET Data Portal](https://cryoetdataportal.czscience.com).

We have created the following submissions for the portal:
- [CZCDP-10330](https://cryoetdataportal.czscience.com/depositions/10330): synaptic vesicles segmented in tomograms of synaptosomes.
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
# from torch_em.data.datasets.util import download_from_cryo_et_portal
#
# path = "/scratch-grete/projects/nim00007/cryo-et/from_portal"
#
# # TODO this is the stuff to extract later
# ids = [
#     "RN-16498",
#     "RN-16514",
#     "RN-16581",
#     "RN-16641",
# ]
#
# # "24sep24a_Position_102"
# # "24sep24a_Position_113_3"
# # "24sep24a_Position_84"
# # "24sep24a_Position_38"
#
# did = "10443"
# download_from_cryo_et_portal(path, did, download=True)

import os

import cryoet_data_portal as cdp
import s3fs

# S3 filesystem instance for anonymous access to the portal bucket.
fs = s3fs.S3FileSystem(anon=True)

# Client instance for the portal API.
client = cdp.Client()

# Run IDs (integers).
runs = [16498, 16514, 16581, 16641]

root = "/scratch-grete/projects/nim00007/cryo-et/from_portal"

# Loop over run IDs.
for run_id in runs:
    # Query denoised tomograms for this run.
    tomograms = cdp.Tomogram.find(
        client,
        [
            cdp.Tomogram.run_id == run_id,
            cdp.Tomogram.processing == "denoised",
        ]
    )

    # Select the first tomogram (there should only be one in this case).
    tomo = tomograms[0]

    # Download the denoised tomogram.
    output_folder = os.path.join(root, str(run_id))
    os.makedirs(output_folder, exist_ok=True)
    fname = f"{tomo.id}_{tomo.processing}.mrc"
    output_path = os.path.join(output_folder, fname)
    fs.get(tomo.s3_mrc_file, output_path)
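
A downloaded file can be sanity-checked with the `read_mrc` helper from `synapse_net.file_utils`; a minimal sketch, where the concrete filename (derived from the tomogram id above) is illustrative:

# Minimal sketch: verify a downloaded tomogram by reading it back.
# The filename depends on the tomogram id and is illustrative here.
from synapse_net.file_utils import read_mrc

data, voxel_size = read_mrc("/scratch-grete/projects/nim00007/cryo-et/from_portal/16498/<tomo_id>_denoised.mrc")
print(data.shape, data.dtype, voxel_size)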
Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
import json
import os

from synapse_net.file_utils import read_data_from_cryo_et_portal_run
from tqdm import tqdm


def download_tomogram_list(run_ids, output_root):
    print("Downloading", len(run_ids), "tomograms")
    os.makedirs(output_root, exist_ok=True)
    for run_id in tqdm(run_ids):
        output_path = os.path.join(output_root, f"{run_id}.mrc")
        data, voxel_size = read_data_from_cryo_et_portal_run(
            run_id, use_zarr_format=False, output_path=output_path, id_field="id",
        )
        if data is None:
            print("Did not find a tomogram for", run_id)


def download_tomograms_for_da():
    with open("./list_for_da.json") as f:
        run_ids = json.load(f)
    output_root = "/scratch-grete/projects/nim00007/cryo-et/from_portal/for_domain_adaptation"
    download_tomogram_list(run_ids, output_root)


def download_tomograms_for_eval():
    with open("./list_for_eval.json") as f:
        run_ids = json.load(f)
    output_root = "/scratch-grete/projects/nim00007/cryo-et/from_portal/for_eval"
    download_tomogram_list(run_ids, output_root)


def main():
    download_tomograms_for_eval()
    # download_tomograms_for_da()


if __name__ == "__main__":
    main()
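
The id lists are loaded with `json.load` and iterated directly, so `list_for_da.json` and `list_for_eval.json` are presumably flat JSON arrays of tomogram ids, e.g. `[16498, 16514, 16581]` (made-up values). Note that `id_field="id"` is passed, i.e. the entries reference individual tomograms rather than runs.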
Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
import os
from glob import glob
from pathlib import Path

import h5py
import numpy as np
import zarr

from synapse_net.file_utils import read_mrc
from tqdm import tqdm

from ome_zarr.writer import write_image
from ome_zarr.io import parse_url


# Previous input / output locations for the evaluation data; superseded by the paths below.
# IN_ROOT = "/scratch-grete/projects/nim00007/cryo-et/from_portal/for_eval"
# OUT_ROOT = "/scratch-grete/projects/nim00007/cryo-et/from_portal/segmentations/DA_with_new_portalData_origDim"  # noqa

IN_ROOT = "/scratch-grete/projects/nim00007/cryo-et/from_portal/for_domain_adaptation"
OUT_ROOT = "/scratch-grete/projects/nim00007/cryo-et/from_portal/segmentations/DA_with_new_portalData_forDAdata"


def export_to_ome_zarr(export_file, seg, voxel_size):
    store = parse_url(export_file, mode="w").store
    root = zarr.group(store=store)

    # Attach the voxel size as a scale transformation.
    scale = list(voxel_size.values())
    trafo = [
        [{"scale": scale, "type": "scale"}]
    ]
    write_image(seg, root, axes="zyx", coordinate_transformations=trafo, scaler=None)


def export_segmentation(export_folder, segmentation_file):
    fname = Path(segmentation_file).stem
    key = "/vesicles/segment_from_vesicle_DA_portal_v3"
    export_file = os.path.join(export_folder, f"{fname}.ome.zarr")

    if os.path.exists(export_file):
        return

    # Load the corresponding tomogram to get its shape and voxel size.
    input_file = os.path.join(IN_ROOT, f"{fname}.mrc")
    raw, voxel_size = read_mrc(input_file)
    # Convert the voxel size from nanometer back to angstrom.
    voxel_size = {k: v * 10 for k, v in voxel_size.items()}

    try:
        with h5py.File(segmentation_file, "r") as f:
            seg = f[key][:]
    except OSError as e:
        print(e)
        return

    # Flip back along axis 1 to match the original orientation (read_mrc flips this axis).
    seg = np.flip(seg, axis=1)
    assert seg.shape == raw.shape

    assert seg.max() < 128, f"{seg.max()}"
    seg = seg.astype("int8")
    export_to_ome_zarr(export_file, seg, voxel_size)


def main():
    export_folder = "./for_portal2"
    os.makedirs(export_folder, exist_ok=True)
    files = glob(os.path.join(OUT_ROOT, "*.h5"))
    for file in tqdm(files):
        export_segmentation(export_folder, file)


if __name__ == "__main__":
    main()
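
To spot-check an export, the written ome.zarr can be read back with the `read_ome_zarr` helper added in this commit; a minimal sketch, with an illustrative filename:

# Minimal sketch: read back an exported segmentation for inspection.
# "example.ome.zarr" stands in for one of the files written by this script.
from synapse_net.file_utils import read_ome_zarr

seg, voxel_size = read_ome_zarr("./for_portal2/example.ome.zarr")
print(seg.dtype, seg.shape, voxel_size)  # read_ome_zarr converts the scale back to nanometer.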
Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
import os
from glob import glob
from pathlib import Path
from shutil import move

import cryoet_data_portal as cdp
from tqdm import tqdm


def main():
    input_folder = "segmentations"
    output_root = "upload_CZCDP-10330"

    client = cdp.Client()

    tomograms = sorted(glob(os.path.join(input_folder, "*.ome.zarr")))
    for input_file in tqdm(tomograms, desc="Formatting submission"):
        # Apply stem twice to strip the double extension '.ome.zarr' and recover the tomogram id.
        tomo_id = Path(input_file).stem
        tomo_id = int(Path(tomo_id).stem)

        tomo = cdp.Tomogram.get_by_id(client, tomo_id)
        output_folder = os.path.join(output_root, str(tomo.run.dataset_id))
        os.makedirs(output_folder, exist_ok=True)
        output_file = os.path.join(output_folder, f"{tomo.run.name}.zarr")
        move(input_file, output_file)


if __name__ == "__main__":
    main()
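
This arranges the segmentations in the layout upload_CZCDP-10330/<dataset_id>/<run_name>.zarr, with the dataset id and run name resolved from the portal via the tomogram id; presumably this is the structure that sync_with_s3.sh (see the ignored files above) then uploads.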

synapse_net/file_utils.py

Lines changed: 127 additions & 0 deletions
@@ -5,6 +5,21 @@
import numpy as np
import pooch

try:
    import cryoet_data_portal as cdp
except ImportError:
    cdp = None

try:
    import zarr
except ImportError:
    zarr = None

try:
    import s3fs
except ImportError:
    s3fs = None


def get_cache_dir() -> str:
    """Get the cache directory of synapse net.
@@ -88,3 +103,115 @@ def read_mrc(path: str) -> Tuple[np.ndarray, Dict[str, float]]:
    # Transpose the data to match python axis order.
    data = np.flip(data, axis=1) if data.ndim == 3 else np.flip(data, axis=0)
    return data, voxel_size


def read_ome_zarr(uri: str, scale_level: int = 0, fs=None) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read data and voxel size from an ome.zarr file.

    Args:
        uri: Path or URL to the ome.zarr file.
        scale_level: The level of the multi-scale image pyramid to load.
        fs: S3 filesystem to use for initializing the store.

    Returns:
        The data read from the file.
        The voxel size read from the file.
    """
    if zarr is None:
        raise RuntimeError("The zarr library is required to read ome.zarr files.")

    def parse_s3_uri(uri):
        # Note: str.lstrip would strip characters rather than the prefix, so we slice instead.
        return uri[len("s3://"):] if uri.startswith("s3://") else uri

    if uri.startswith("s3"):
        if s3fs is None:
            raise RuntimeError("The s3fs library is required to read ome.zarr files from S3.")
        if fs is None:
            fs = s3fs.S3FileSystem(anon=True)
        s3_uri = parse_s3_uri(uri)
        store = s3fs.S3Map(root=s3_uri, s3=fs, check=False)
    elif fs is not None:
        s3_uri = parse_s3_uri(uri)
        store = s3fs.S3Map(root=s3_uri, s3=fs, check=False)
    else:
        if not os.path.exists(uri):
            raise ValueError(f"Cannot find the filepath at {uri}.")
        store = uri

    with zarr.open(store, "r") as f:
        multiscales = f.attrs["multiscales"][0]

        # Read the axis and transformation metadata for this dataset, to determine the voxel size.
        axes = [axis["name"] for axis in multiscales["axes"]]
        assert set(axes) == set("xyz")
        transformations = multiscales["datasets"][scale_level]["coordinateTransformations"]
        scale_transformation = [trafo["scale"] for trafo in transformations if trafo["type"] == "scale"][0]

        # The voxel size is given in angstrom; we divide it by 10 to convert it to nanometer.
        voxel_size = {axis: scale / 10.0 for axis, scale in zip(axes, scale_transformation)}

        # Get the internal path for the given scale and load the data.
        internal_path = multiscales["datasets"][scale_level]["path"]
        data = f[internal_path][:]

    return data, voxel_size


def read_data_from_cryo_et_portal_run(
    run_id: int,
    output_path: Optional[str] = None,
    use_zarr_format: bool = True,
    processing_type: str = "denoised",
    id_field: str = "run_id",
    scale_level: Optional[int] = None,
) -> Tuple[np.ndarray, Dict[str, float]]:
    """Read data and voxel size from a CryoET Data Portal run.

    Args:
        run_id: The ID of the experiment run.
        output_path: The path for saving the data. The data will be streamed if the path is not given.
        use_zarr_format: Whether to use the data in zarr format instead of mrc.
        processing_type: The processing type of the tomogram to download.
        id_field: The name of the id field. One of 'id' or 'run_id'.
            The 'id' references specific tomograms, whereas 'run_id' references a collection of experimental data.
        scale_level: The scale level to read from the data. Only valid for zarr data.

    Returns:
        The data read from the run.
        The voxel size read from the run.
    """
    assert id_field in ("id", "run_id")
    if output_path is not None and os.path.exists(output_path):
        return read_ome_zarr(output_path) if use_zarr_format else read_mrc(output_path)

    if cdp is None:
        raise RuntimeError("The CryoET data portal library is required to download data from the portal.")
    if s3fs is None:
        raise RuntimeError("The s3fs library is required to download data from the portal.")

    client = cdp.Client()

    fs = s3fs.S3FileSystem(anon=True)
    tomograms = cdp.Tomogram.find(
        client, [getattr(cdp.Tomogram, id_field) == run_id, cdp.Tomogram.processing == processing_type]
    )
    if len(tomograms) == 0:
        return None, None
    if len(tomograms) > 1:
        raise NotImplementedError
    tomo = tomograms[0]

    if use_zarr_format:
        if output_path is None:
            scale_level = 0 if scale_level is None else scale_level
            data, voxel_size = read_ome_zarr(tomo.s3_omezarr_dir, scale_level=scale_level, fs=fs)
        else:
            # TODO: write the output to ome zarr, for all scale levels.
            raise NotImplementedError
    else:
        if scale_level is not None:
            raise ValueError("The scale_level argument is only supported for zarr data.")
        if output_path is None:
            raise RuntimeError("You have to pass an output_path to download the data as mrc file.")
        fs.get(tomo.s3_mrc_file, output_path)
        data, voxel_size = read_mrc(output_path)

    return data, voxel_size
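
For illustration, a minimal sketch of the two modes of `read_data_from_cryo_et_portal_run`, using one of the run ids from the download scripts above:

# Minimal sketch: stream a denoised tomogram in zarr format, or download it as mrc.
from synapse_net.file_utils import read_data_from_cryo_et_portal_run

# Stream the ome.zarr data directly from S3 (scale level 0 by default).
data, voxel_size = read_data_from_cryo_et_portal_run(16498, use_zarr_format=True)

# Download the mrc file to disk and read it from there.
data, voxel_size = read_data_from_cryo_et_portal_run(
    16498, output_path="./16498.mrc", use_zarr_format=False,
)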
