This repository was archived by the owner on Nov 15, 2018. It is now read-only.

Commit 295173a

msm_basic.formula_imager_segm refactoring and bug fixing
* split the compute_sf_images method into several smaller functions
* added code to handle cases where gen_iso_peak_images returns two images for the same formula-adduct-peak
* sci test report update (fixes the previous buggy report)
* new test module for msm_basic.formula_imager_segm
Parent: 721c075 | Commit: 295173a

5 files changed: +89 additions, -29 deletions

sm/engine/msm_basic/formula_imager_segm.py

Lines changed: 48 additions & 26 deletions
@@ -1,4 +1,5 @@
 import sys
+from collections import defaultdict
 import pandas as pd
 from itertools import izip, repeat, islice
 import numpy as np
@@ -26,7 +27,7 @@ def _find_mz_bounds(mz_grid, workload_per_mz, n=32):
     return mz_bounds


-def _create_mz_buckets(mz_bounds, ppm):
+def _create_mz_segments(mz_bounds, ppm):
     mz_buckets = []
     for i, (l, r) in enumerate(zip([0] + mz_bounds, mz_bounds + [sys.float_info.max])):
         l -= l * ppm * 1e-6
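Only the first few lines of the renamed `_create_mz_segments` appear in this hunk. The standalone sketch below is my reconstruction, not the repository's code: consecutive bounds become segments, and each edge is widened by `ppm` parts per million so peaks near a boundary are not lost. The right-edge widening and the return value are assumptions.

```python
import sys

def create_mz_segments_sketch(mz_bounds, ppm):
    """Hypothetical sketch: turn sorted m/z bounds into (left, right) segments,
    widening each edge by ppm parts per million (the left-edge widening is visible
    in the diff; the right-edge widening is assumed to mirror it)."""
    segments = []
    for l, r in zip([0] + mz_bounds, mz_bounds + [sys.float_info.max]):
        l -= l * ppm * 1e-6  # widen the lower edge downwards
        r += r * ppm * 1e-6  # widen the upper edge upwards (assumption)
        segments.append((l, r))
    return segments

# create_mz_segments_sketch([200.0, 400.0], ppm=3) yields three segments:
# roughly (0, 200.0006), (199.9994, 400.0012), (399.9988, effectively unbounded)
```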
@@ -41,7 +42,7 @@ def _create_mz_buckets(mz_bounds, ppm):
     # for s_i, (l, r) in enumerate(mz_buckets)]


-def _segment_spectra(sp, mz_buckets):
+def _segment_spectrum(sp, mz_buckets):
     sp_id, mzs, ints = sp
     for s_i, (l, r) in enumerate(mz_buckets):
         smask = (mzs >= l) & (mzs <= r)
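`_segment_spectrum` (singular, since it now handles one spectrum) fans a spectrum's peaks out across the m/z segments with a boolean mask. Below is a self-contained sketch of that behaviour; the yielded record layout is inferred from how the results are grouped by segment index afterwards, so treat it as an assumption rather than the repository's exact code.

```python
import numpy as np

def segment_spectrum_sketch(sp, mz_segments):
    """Illustrative sketch: split one spectrum across m/z segments.
    The yielded tuple layout is an assumption based on the later groupByKey step."""
    sp_id, mzs, ints = sp
    for s_i, (l, r) in enumerate(mz_segments):
        smask = (mzs >= l) & (mzs <= r)  # boolean mask of peaks inside this segment
        if smask.any():
            yield s_i, (sp_id, mzs[smask], ints[smask])

# Each spectrum becomes several (segment_index, partial_spectrum) records,
# which can then be grouped by segment_index.
spectrum = (0, np.array([100.5, 250.2, 380.9]), np.array([1.0, 2.0, 3.0]))
segments = [(0.0, 200.0), (199.9994, 400.0012)]
print(list(segment_spectrum_sketch(spectrum, segments)))
```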
@@ -70,7 +71,7 @@ def _gen_iso_images(spectra_it, sp_indexes, sf_peak_df, nrows, ncols, ppm, peaks
     # a bit slower than using pure numpy arrays but much shorter
     # may leak memory because of https://github.com/pydata/pandas/issues/2659 or smth else
     sp_df = pd.DataFrame(_sp_df_gen(sp_list, sp_indexes),
-                         columns=['idx', 'mz', 'ints'], dtype=np.float64).sort_values(by='mz')
+                         columns=['idx', 'mz', 'ints']).sort_values(by='mz')
     # print sp_df.info()

     # -1, + 1 are needed to extend sf_peak_mz range so that it covers 100% of spectra
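This hunk only drops `dtype=np.float64` from the DataFrame constructor. A plausible motivation (my assumption, the commit message does not say) is that forcing a single dtype coerces the integer pixel-index column to float, whereas the default constructor keeps per-column dtypes. A minimal demonstration of that pandas behaviour, unrelated to the repository's data:

```python
import numpy as np
import pandas as pd

rows = [(7, 100.5, 12.0), (3, 101.2, 8.0)]  # (pixel idx, mz, intensity)

forced = pd.DataFrame(rows, columns=['idx', 'mz', 'ints'], dtype=np.float64)
mixed = pd.DataFrame(rows, columns=['idx', 'mz', 'ints'])

print(forced.dtypes['idx'])  # float64 -- the index column is coerced to float
print(mixed.dtypes['idx'])   # int64   -- per-column dtypes are preserved
```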
@@ -91,17 +92,49 @@ def _gen_iso_images(spectra_it, sp_indexes, sf_peak_df, nrows, ncols, ppm, peaks
             (sf_peak_df.peak_i.iloc[i], coo_matrix((data, (row_inds, col_inds)), shape=(nrows, ncols)))


-def _img_pairs_to_list(pairs):
+def _img_pairs_to_list(pairs, shape):
     """ list of (coord, value) pairs -> list of values """
     if not pairs:
         return None
-    length = max([i for i, img in pairs]) + 1
-    res = np.ndarray((length,), dtype=object)
-    for i, img in pairs:
-        res[i] = img
+
+    d = defaultdict(lambda: coo_matrix(shape))
+    for k, m in pairs:
+        _m = d[k]
+        d[k] = _m if _m.nnz >= m.nnz else m
+    distinct_pairs = d.items()
+
+    res = np.ndarray((max(d.keys()) + 1,), dtype=object)
+    for i, m in distinct_pairs:
+        res[i] = m
     return res.tolist()


+def find_mz_segments(spectra, sf_peak_df, ppm):
+    # spectra_sample = spectra.take(200)
+    spectra_sample = spectra.takeSample(withReplacement=False, num=200)
+    mz_grid, workload_per_mz = _estimate_mz_workload(spectra_sample, sf_peak_df, bins=10000)
+    mz_bounds = _find_mz_bounds(mz_grid, workload_per_mz, n=1024)
+    mz_segments = _create_mz_segments(mz_bounds, ppm=ppm)
+    return spectra_sample, mz_segments
+
+
+def gen_iso_peak_images(sc, ds, sf_peak_df, segm_spectra, peaks_per_sp_segm, ppm):
+    sp_indexes_brcast = sc.broadcast(ds.norm_img_pixel_inds)
+    sf_peak_df_brcast = sc.broadcast(sf_peak_df)  # TODO: replace broadcast variable with rdd and cogroup
+    nrows, ncols = ds.get_dims()
+    iso_peak_images = (segm_spectra.flatMap(lambda (s_i, sp_segm):
+                       _gen_iso_images(sp_segm, sp_indexes_brcast.value, sf_peak_df_brcast.value,
+                                       nrows, ncols, ppm, peaks_per_sp_segm)))
+    return iso_peak_images
+
+
+def gen_iso_sf_images(iso_peak_images, shape):
+    iso_sf_images = (iso_peak_images
+                     .groupByKey(numPartitions=256)
+                     .mapValues(lambda img_pairs_it: _img_pairs_to_list(list(img_pairs_it), shape)))
+    return iso_sf_images
+
+
 # TODO: add tests
 def compute_sf_images(sc, ds, sf_peak_df, ppm):
     """ Compute isotopic images for all formula
@@ -111,27 +144,16 @@ def compute_sf_images(sc, ds, sf_peak_df, ppm):
     : pyspark.rdd.RDD
     RDD of sum formula, list[sparse matrix of intensities]
     """
-    nrows, ncols = ds.get_dims()
     spectra_rdd = ds.get_spectra()
-    # spectra_rdd.cache()

-    spectra_sample = spectra_rdd.takeSample(withReplacement=False, num=200)
-    mz_grid, workload_per_mz = _estimate_mz_workload(spectra_sample, sf_peak_df, bins=10000)
-
-    mz_bounds = _find_mz_bounds(mz_grid, workload_per_mz, n=1024)
-    mz_buckets = _create_mz_buckets(mz_bounds, ppm=ppm)
+    spectra_sample, mz_segments = find_mz_segments(spectra_rdd, sf_peak_df, ppm)
     segm_spectra = (spectra_rdd
-                    .flatMap(lambda sp: _segment_spectra(sp, mz_buckets))
-                    .groupByKey(numPartitions=len(mz_buckets)))
+                    .flatMap(lambda sp: _segment_spectrum(sp, mz_segments))
+                    .groupByKey(numPartitions=len(mz_segments)))

-    peaks_per_sp_segm = int(np.mean([t[1].shape[0] for t in spectra_sample])) / len(mz_buckets)
-    sp_indexes_brcast = sc.broadcast(ds.norm_img_pixel_inds)
-    sf_peak_df_brcast = sc.broadcast(sf_peak_df)  # TODO: replace broadcast variable with rdd and cogroup
-    iso_peak_images = (segm_spectra.flatMap(lambda (s_i, sp_segm):
-                       _gen_iso_images(sp_segm, sp_indexes_brcast.value, sf_peak_df_brcast.value,
-                                       nrows, ncols, ppm, peaks_per_sp_segm)))
+    peaks_per_sp_segm = int(np.mean([t[1].shape[0] for t in spectra_sample])) / len(mz_segments)
+    iso_peak_images = gen_iso_peak_images(sc, ds, sf_peak_df, segm_spectra, peaks_per_sp_segm, ppm)
+    iso_sf_images = gen_iso_sf_images(iso_peak_images, shape=ds.get_dims())

-    iso_sf_images = (iso_peak_images
-                     .groupByKey(numPartitions=256)
-                     .mapValues(lambda img_pairs_it: _img_pairs_to_list(list(img_pairs_it))))
     return iso_sf_images
+
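After the refactoring, `compute_sf_images` only wires the helpers together. One detail worth spelling out is the `peaks_per_sp_segm` estimate: the mean peak count of the sampled spectra divided by the number of segments, i.e. roughly how many peaks a single spectrum contributes to one segment. A toy calculation is shown below; the sample spectra and the explicit floor division are mine (the original relies on Python 2 integer division of the `int()` result).

```python
import numpy as np

# Each sampled spectrum is a (spectrum id, mzs array, intensities array) tuple,
# so t[1].shape[0] is its peak count.
spectra_sample = [(0, np.zeros(1200), np.zeros(1200)),
                  (1, np.zeros(800), np.zeros(800))]
n_segments = 100

peaks_per_sp_segm = int(np.mean([mzs.shape[0] for _, mzs, _ in spectra_sample])) // n_segments
print(peaks_per_sp_segm)  # 10: ~1000 peaks per spectrum spread over 100 segments
```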
New test module for msm_basic.formula_imager_segm

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
+from scipy.sparse import coo_matrix
+
+from sm.engine.tests.util import spark_context
+from sm.engine.msm_basic.formula_imager_segm import gen_iso_sf_images
+
+
+def test_gen_iso_sf_images(spark_context):
+    iso_peak_images = spark_context.parallelize([((3079, '+H'), (0, coo_matrix([[1., 0., 0.]]))),
+                                                 ((3079, '+H'), (3, coo_matrix([[2., 1., 0.]]))),
+                                                 ((3079, '+H'), (3, coo_matrix([[0., 0., 10.]])))])
+    exp_iso_sf_imgs = [((3079, '+H'), [coo_matrix([[1., 0., 0.]]),
+                                       None,
+                                       None,
+                                       coo_matrix([[2., 1., 0.]])])]
+
+    iso_sf_imgs = gen_iso_sf_images(iso_peak_images, shape=(1, 3)).collect()
+
+    assert len(iso_sf_imgs) == len(exp_iso_sf_imgs)
+    for (k, l), (ek, el) in zip(iso_sf_imgs, exp_iso_sf_imgs):
+        assert k == ek
+        assert len(l) == len(el)
+        for m, em in zip(l, el):
+            if em is None:
+                assert m is None
+            else:
+                assert (m == em).toarray().all()

tests/reports/spheroid_12h_search_res.csv

Lines changed: 2 additions & 2 deletions
@@ -112,7 +112,7 @@ C11H10O7 +K 0.976315789474 0.0 0.953741700383
 C11H11F3N2O3 +K 0.0 0.0 0.942679626657
 C11H11F3N2O4 +K 0.982216494845 0.0294604948304 0.953647643013
 C11H11NO2 +K 0.941666666667 0.0 0.944314212783
-C11H11NO3 +H 0.960407569141 0.0 0.951904869996
+C11H11NO3 +H 0.995960739902 0.0 0.951904869996
 C11H11NO3 +K 0.981077694236 0.0 0.943664621254
 C11H11NO4 +H 0.929777777778 0.0 0.963007681842
 C11H12Cl2N2O5 +Na 0.9375 0.0 0.858224057253
@@ -2301,7 +2301,7 @@ C46H84NO8P +H 0.999588477366 0.958440753291 0.982990026501
 C46H84NO8P +K 0.998063872255 0.253617907717 0.903428997805
 C46H84NO8P +Na 0.975414781297 0.630554518247 0.915396518583
 C46H84O13P2 +H 0.0 0.0 0.851762937609
-C46H86NO7P +H 0.95 0.0 0.855117050438
+C46H86NO7P +H 0.997692307692 0.448313830417 0.941610217342
 C46H86NO7P +Na 0.0 0.0 0.855151947092
 C46H86NO8P +H 0.998567335244 0.405050835985 0.869712787869
 C46H86NO8P +K 0.993885819521 0.447644458556 0.900866918992

tests/sci_test_search_job_spheroid_dataset.py

Lines changed: 12 additions & 0 deletions
@@ -14,6 +14,18 @@
 from sm.engine.util import proj_root, SMConfig


+# def sm_config():
+#     with open(join(proj_root(), 'conf/config.json')) as f:
+#         return json.load(f)
+
+SMConfig.set_path(join(proj_root(), 'conf/config.json'))
+sm_config = SMConfig.get_conf()
+
+ds_name = 'sci_test_spheroid_12h'
+data_dir_path = join(SMConfig.get_conf()['fs']['base_path'], ds_name)
+input_dir_path = join(proj_root(), 'test/data/sci_test_search_job_spheroid_dataset')
+ds_config_path = join(input_dir_path, 'config.json')
+
 SEARCH_RES_SELECT = ("select sf, adduct, stats "
                     "from iso_image_metrics s "
                     "join formula_db sf_db on sf_db.id = s.db_id "

tests/test_search_job_imzml_example.py

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ def create_fill_sm_database(create_test_db, drop_test_db, sm_config):
     db.close()


-@patch('sm.engine.formula_img_validator.get_compute_img_metrics')
+@patch('sm.engine.msm_basic.formula_img_validator.get_compute_img_metrics')
 def test_search_job_imzml_example(get_compute_img_measures_mock, create_fill_sm_database, sm_config):
     get_compute_img_measures_mock.return_value = lambda *args: (0.9, 0.9, 0.9)
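The only change here is the dotted path given to `@patch`: `formula_img_validator` now lives under `sm.engine.msm_basic`, and `mock.patch` resolves its target by importing exactly that string, so a stale path fails instead of silently patching the old location. A generic, standard-library illustration of how the target string works; nothing below comes from this repository, and on Python 2 `patch` would come from the external `mock` package rather than `unittest.mock`:

```python
from unittest.mock import patch  # Python 3 location of patch

import math

with patch('math.sqrt', return_value=42.0):
    print(math.sqrt(9))  # 42.0: the dotted path names the module where sqrt is looked up
print(math.sqrt(9))      # 3.0 again once the patch is undone
```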
