import sys
+ from collections import defaultdict
import pandas as pd
from itertools import izip, repeat, islice
import numpy as np
@@ -26,7 +27,7 @@ def _find_mz_bounds(mz_grid, workload_per_mz, n=32):
    return mz_bounds


- def _create_mz_buckets(mz_bounds, ppm):
+ def _create_mz_segments(mz_bounds, ppm):
    mz_buckets = []
    for i, (l, r) in enumerate(zip([0] + mz_bounds, mz_bounds + [sys.float_info.max])):
        l -= l * ppm * 1e-6
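A note on the renamed helper: the segments are built from consecutive bound pairs
(zip([0] + mz_bounds, mz_bounds + [sys.float_info.max])), so each segment's left edge
starts out equal to its neighbour's right edge. Padding the left edge down by the
relative ppm tolerance therefore makes neighbouring segments overlap slightly near the
shared boundary, which is presumably why the reworked _img_pairs_to_list below has to
deduplicate images per peak. A worked example of the padding line, assuming ppm = 3:

    l = 150.0
    l -= l * ppm * 1e-6   # 150.0 - 150.0 * 3e-6 = 149.99955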
@@ -41,7 +42,7 @@ def _create_mz_buckets(mz_bounds, ppm):
    #               for s_i, (l, r) in enumerate(mz_buckets)]


- def _segment_spectra(sp, mz_buckets):
+ def _segment_spectrum(sp, mz_buckets):
    sp_id, mzs, ints = sp
    for s_i, (l, r) in enumerate(mz_buckets):
        smask = (mzs >= l) & (mzs <= r)
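The segmentation itself is one boolean mask per (spectrum, segment) pair: each spectrum
tuple (sp_id, mzs, ints) is scanned once per segment and the points inside [l, r] are
selected. A minimal self-contained sketch of the masking step, with made-up values:

    import numpy as np

    mzs = np.array([100.2, 153.7, 154.1, 299.9])
    ints = np.array([10.0, 5.0, 7.0, 1.0])
    l, r = 150.0, 160.0
    smask = (mzs >= l) & (mzs <= r)   # array([False, True, True, False])
    mzs[smask], ints[smask]           # the points that fall into this segment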
@@ -70,7 +71,7 @@ def _gen_iso_images(spectra_it, sp_indexes, sf_peak_df, nrows, ncols, ppm, peaks
    # a bit slower than using pure numpy arrays but much shorter
    # may leak memory because of https://github.com/pydata/pandas/issues/2659 or smth else
    sp_df = pd.DataFrame(_sp_df_gen(sp_list, sp_indexes),
-                        columns=['idx', 'mz', 'ints'], dtype=np.float64).sort_values(by='mz')
+                        columns=['idx', 'mz', 'ints']).sort_values(by='mz')
    # print sp_df.info()

    # -1, +1 are needed to extend sf_peak_mz range so that it covers 100% of spectra
@@ -91,17 +92,49 @@ def _gen_iso_images(spectra_it, sp_indexes, sf_peak_df, nrows, ncols, ppm, peaks
            (sf_peak_df.peak_i.iloc[i], coo_matrix((data, (row_inds, col_inds)), shape=(nrows, ncols)))


- def _img_pairs_to_list(pairs):
+ def _img_pairs_to_list(pairs, shape):
    """ list of (coord, value) pairs -> list of values """
    if not pairs:
        return None
-     length = max([i for i, img in pairs]) + 1
-     res = np.ndarray((length,), dtype=object)
-     for i, img in pairs:
-         res[i] = img
+
+     d = defaultdict(lambda: coo_matrix(shape))
+     for k, m in pairs:
+         _m = d[k]
+         d[k] = _m if _m.nnz >= m.nnz else m
+     distinct_pairs = d.items()
+
+     res = np.ndarray((max(d.keys()) + 1,), dtype=object)
+     for i, m in distinct_pairs:
+         res[i] = m
    return res.tolist()
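The old body assumed at most one image per peak index; the new one tolerates duplicate
keys by keeping, for each peak index, the image with the most nonzero pixels (largest
.nnz), defaulting to an empty coo_matrix of the dataset shape. A self-contained sketch
of that dedup rule, with toy matrices:

    from collections import defaultdict
    from scipy.sparse import coo_matrix

    shape = (2, 2)
    a = coo_matrix(([1.0], ([0], [0])), shape=shape)              # 1 nonzero
    b = coo_matrix(([1.0, 2.0], ([0, 1], [0, 1])), shape=shape)   # 2 nonzeros
    d = defaultdict(lambda: coo_matrix(shape))
    for k, m in [(0, a), (0, b)]:
        _m = d[k]
        d[k] = _m if _m.nnz >= m.nnz else m
    print(d[0].nnz)   # 2 -- the denser image wins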


+ def find_mz_segments(spectra, sf_peak_df, ppm):
+     # spectra_sample = spectra.take(200)
+     spectra_sample = spectra.takeSample(withReplacement=False, num=200)
+     mz_grid, workload_per_mz = _estimate_mz_workload(spectra_sample, sf_peak_df, bins=10000)
+     mz_bounds = _find_mz_bounds(mz_grid, workload_per_mz, n=1024)
+     mz_segments = _create_mz_segments(mz_bounds, ppm=ppm)
+     return spectra_sample, mz_segments
+
+
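Unlike the commented-out take(200), which returns the first 200 spectra in partition
order, RDD.takeSample(withReplacement=False, num=200) pulls a random sample to the
driver as a plain Python list, so the workload estimate is not biased toward whichever
partitions happen to come first. The sample is returned alongside the segments because
compute_sf_images reuses it below to estimate peaks_per_sp_segm.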
+ def gen_iso_peak_images(sc, ds, sf_peak_df, segm_spectra, peaks_per_sp_segm, ppm):
+     sp_indexes_brcast = sc.broadcast(ds.norm_img_pixel_inds)
+     sf_peak_df_brcast = sc.broadcast(sf_peak_df)  # TODO: replace broadcast variable with rdd and cogroup
+     nrows, ncols = ds.get_dims()
+     iso_peak_images = (segm_spectra.flatMap(lambda (s_i, sp_segm):
+                        _gen_iso_images(sp_segm, sp_indexes_brcast.value, sf_peak_df_brcast.value,
+                                        nrows, ncols, ppm, peaks_per_sp_segm)))
+     return iso_peak_images
+
+
+ def gen_iso_sf_images(iso_peak_images, shape):
+     iso_sf_images = (iso_peak_images
+                      .groupByKey(numPartitions=256)
+                      .mapValues(lambda img_pairs_it: _img_pairs_to_list(list(img_pairs_it), shape)))
+     return iso_sf_images
+
+
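One portability caveat: lambda (s_i, sp_segm): ... relies on tuple parameter unpacking,
which exists only in Python 2 (it was removed by PEP 3113), consistent with the izip
import above. If this code ever needs to run on Python 3, an equivalent form would be:

    iso_peak_images = segm_spectra.flatMap(
        lambda kv: _gen_iso_images(kv[1], sp_indexes_brcast.value, sf_peak_df_brcast.value,
                                   nrows, ncols, ppm, peaks_per_sp_segm))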
# TODO: add tests
def compute_sf_images(sc, ds, sf_peak_df, ppm):
    """ Compute isotopic images for all formulae
@@ -111,27 +144,16 @@ def compute_sf_images(sc, ds, sf_peak_df, ppm):
    : pyspark.rdd.RDD
    RDD of sum formula, list[sparse matrix of intensities]
    """
-     nrows, ncols = ds.get_dims()
    spectra_rdd = ds.get_spectra()
-     # spectra_rdd.cache()

-     spectra_sample = spectra_rdd.takeSample(withReplacement=False, num=200)
-     mz_grid, workload_per_mz = _estimate_mz_workload(spectra_sample, sf_peak_df, bins=10000)
-
-     mz_bounds = _find_mz_bounds(mz_grid, workload_per_mz, n=1024)
-     mz_buckets = _create_mz_buckets(mz_bounds, ppm=ppm)
+     spectra_sample, mz_segments = find_mz_segments(spectra_rdd, sf_peak_df, ppm)
    segm_spectra = (spectra_rdd
-                    .flatMap(lambda sp: _segment_spectra(sp, mz_buckets))
-                    .groupByKey(numPartitions=len(mz_buckets)))
+                    .flatMap(lambda sp: _segment_spectrum(sp, mz_segments))
+                    .groupByKey(numPartitions=len(mz_segments)))

-     peaks_per_sp_segm = int(np.mean([t[1].shape[0] for t in spectra_sample])) / len(mz_buckets)
-     sp_indexes_brcast = sc.broadcast(ds.norm_img_pixel_inds)
-     sf_peak_df_brcast = sc.broadcast(sf_peak_df)  # TODO: replace broadcast variable with rdd and cogroup
-     iso_peak_images = (segm_spectra.flatMap(lambda (s_i, sp_segm):
-                        _gen_iso_images(sp_segm, sp_indexes_brcast.value, sf_peak_df_brcast.value,
-                                        nrows, ncols, ppm, peaks_per_sp_segm)))
+     peaks_per_sp_segm = int(np.mean([t[1].shape[0] for t in spectra_sample])) / len(mz_segments)
+     iso_peak_images = gen_iso_peak_images(sc, ds, sf_peak_df, segm_spectra, peaks_per_sp_segm, ppm)
+     iso_sf_images = gen_iso_sf_images(iso_peak_images, shape=ds.get_dims())

    return iso_sf_images
+
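Worth noting for the peaks_per_sp_segm line above: under Python 2 (which the izip import
and the unpacking lambdas pin this module to), int(np.mean(...)) / len(mz_segments) is
floor division on integers; under Python 3 the same expression would yield a float and
would need int(np.mean(...)) // len(mz_segments) to keep the old behaviour.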