@@ -245,28 +245,6 @@ std::vector<std::string> read_bins_paths(const std::string& path, uint32_t n_bin
245
245
return bin_paths;
246
246
}
247
247
248
// split_anchors (removed by this diff): partitions the anchors listed in `path` into `n_bins` buckets.
//
// path   - anchor list file; it is opened with std::ifstream and parsed by ReadAnchorsFromPlainOrDSV.
// n_bins - number of buckets; each anchor goes to bucket MurMur64Hash(anchor) % n_bins.
//
// Returns one vector of anchors per bucket. When `path` equals the sentinel literal checked first,
// the result is returned empty (not resized); the caller shown later in this diff treats an empty
// result as "accept all anchors".
// If the file cannot be opened, an error is written to std::cerr and the process exits with code 1.
// NOTE(review): `str_anchor` is declared but not used in the visible body.
- std::vector<std::vector<uint64_t >> split_anchors (const std::string& path, uint32_t n_bins) {
249
- std::vector<std::vector<uint64_t >> res;
250
- if (path == " " ) {
251
- return res;
252
- }
253
- res.resize (n_bins);
254
- std::ifstream in (path);
255
- if (!in) {
256
- std::cerr << " Error: cannot open file " << path << " \n " ;
257
- exit (1 );
258
- }
259
- std::string str_anchor;
260
- ReadAnchorsFromPlainOrDSV (in, path,
261
- [&res, &n_bins](uint64_t anchor)
262
- {
263
- uint64_t bin_id = refresh::MurMur64Hash{}(anchor) % n_bins;
264
- res[bin_id].push_back (anchor);
265
- });
266
- return res;
267
- }
268
-
269
-
270
248
template <typename writer_t >
271
249
class SeparatelyOrNot {
272
250
static_assert (std::is_same_v<writer_t , writer_binary> ||
@@ -323,21 +301,24 @@ void process_multibin_mode_impl(const Params& params) {
323
301
std::is_same_v<separately_or_not_t , SeparatelyOrNot<writer_binary>>);
324
302
325
303
auto bin_paths = read_bins_paths (params.input , params.n_bins );
326
- auto bins_anchors = split_anchors (params.anchor_list_path , params.n_bins );
327
- bool accept_all = bins_anchors.empty ();
328
304
329
305
separately_or_not_t separately_or_not (params.output , params.separately );
330
306
307
+
308
+ // Previously, the filtering anchors (params.anchor_list_path) were split into bins to reduce the size of the data structure in AcceptedAnchors.
309
+ // I have never tested how it influences performance.
310
+ // The problem is that if the bins in the input file are not listed in order (bin_id_0, bin_id_1, etc.), there will be an inconsistency
311
+ // and the wrong filtering set will be used.
312
+ // This could be solved by determining the bin id from the first anchor in a given bin and selecting the appropriate filtering anchor set,
313
+ // but for simplicity I will just create a single AcceptedAnchors instance for all bins.
314
+ // I leave this comment for future reference in case a performance issue arises.
315
+ // Last commit that still split the filtering anchors (with possibly wrong results): 23b32b5d85049d3f808f4e444910832d2d03d2f1
316
+
317
+ AcceptedAnchors accepted_anchors (params.anchor_list_path );
331
318
SampleNameDecoder sample_name_decoder (params.sample_names );
332
319
for (uint32_t bin_id = 0 ; bin_id < params.n_bins ; ++bin_id) {
333
320
separately_or_not.StartBin (bin_id);
334
321
335
- if (!accept_all && bins_anchors[bin_id].empty ()) {
336
- std::cerr << " INFO: none of specified anchors occurs in bin " << bin_id << " (" << bin_paths[bin_id]<< " ). Skip reading bin content.\n " ;
337
- continue ;
338
- }
339
- AcceptedAnchors accepted_anchors (accept_all ? std::vector<uint64_t >{} : bins_anchors[bin_id]);
340
-
341
322
buffered_binary_reader in (bin_paths[bin_id]);
342
323
if (!in) {
343
324
std::cerr << " Error: cannot open file " << bin_paths[bin_id] << " \n " ;
0 commit comments