@@ -73,6 +73,9 @@ int usage_subcommand(std::string subcommand) {
7373
7474 if (subcommand == " compute" || subcommand == " maskopt" || subcommand == " lowerbound" )
7575 std::cerr << " -u - treat k-mer and its reverse complement as distinct" << std::endl;
76+
77+ if (subcommand == " compute" )
78+ std::cerr << " -z INT - minimum frequency to represent a k-mer; default 1" << std::endl;
7679
7780 if (subcommand == " mssep2ms" ) {
7881 std::cerr << " -m FILE - input file with mask" << std::endl;
@@ -97,7 +100,7 @@ void Version() {
97100// / Run KmerCamel with the given parameters.
98101template <typename kmer_t , typename kh_wrapper_t >
99102int kmercamel (kh_wrapper_t wrapper, kmer_t kmer_type, std::string path, int k, int d_max, std::ostream *of, std::ostream *maskf, bool complements, bool masks,
100- std::string algorithm, bool lower_bound, bool assume_simplitigs) {
103+ std::string algorithm, bool lower_bound, bool assume_simplitigs, uint16_t min_frequency ) {
101104 if (masks) {
102105 WriteLog (" Started optimization of a masked superstring from '" + path + " '." );
103106 int ret = Optimize (wrapper, kmer_type, algorithm, path, *of, k, complements);
@@ -112,7 +115,8 @@ int kmercamel(kh_wrapper_t wrapper, kmer_t kmer_type, std::string path, int k, i
112115 /* Handle streaming algorithm separately. */
113116 if (algorithm == " streaming" ) {
114117 WriteName (path, algorithm, k, false , !complements, *of);
115- Streaming (wrapper, kmer_type, path, *of, k , complements);
118+ if (min_frequency == 1 ) Streaming (wrapper, kmer_type, path, *of, k , complements);
119+ else StreamingFiltered (wrapper, kmer_type, path, *of, k , complements, min_frequency);
116120 WriteLog (" Finished masked superstring computation." );
117121 }
118122 /* Handle hash table based algorithms separately so that they consume less memory. */
@@ -121,7 +125,11 @@ int kmercamel(kh_wrapper_t wrapper, kmer_t kmer_type, std::string path, int k, i
121125 auto *kMers = wrapper.kh_init_set ();
122126 size_t kmer_count;
123127 if (!assume_simplitigs) {
124- ReadKMers (kMers , wrapper, kmer_type, path, k, complements);
128+ if (min_frequency == 1 ) {
129+ ReadKMers (kMers , wrapper, kmer_type, path, k, complements);
130+ } else {
131+ ReadKMersFiltered (kMers , wrapper, kmer_type, path, k, complements, min_frequency);
132+ }
125133
126134 if (!kh_size (kMers )) {
127135 std::cerr << " Path '" << path << " ' contains no k-mers. Make sure that your file is a FASTA or gzipped FASTA." << std::endl;
@@ -198,8 +206,9 @@ int camel_compute(int argc, char **argv) {
198206 bool d_set = false ;
199207 bool assume_simplitigs = false ;
200208 int opt;
209+ uint16_t min_frequency = 1 ;
201210 try {
202- while ((opt = getopt (argc, argv, " k:d:a:o:huxM:S " )) != -1 ) {
211+ while ((opt = getopt (argc, argv, " k:d:a:o:huxM:Sz: " )) != -1 ) {
203212 switch (opt) {
204213 case ' o' :
205214 output.open (optarg);
@@ -231,6 +240,9 @@ int camel_compute(int argc, char **argv) {
231240 case ' h' :
232241 usage_subcommand (subcommand);
233242 return 0 ;
243+ case ' z' :
244+ min_frequency = std::stoi (optarg);
245+ break ;
234246 default :
235247 return usage_subcommand (subcommand);
236248 }
@@ -263,13 +275,19 @@ int camel_compute(int argc, char **argv) {
263275 } else if (assume_simplitigs && algorithm != " global" ) {
264276 std::cerr << " Optimization for the input being simplitigs is possible only with global." << std::endl;
265277 return usage_subcommand (subcommand);
278+ } else if (min_frequency >= 256 || min_frequency < 1 ) {
279+ std::cerr << " Minimum frequency '-z' must be between 1 and 255." << std::endl;
280+ return usage_subcommand (subcommand);
281+ } else if (min_frequency != 1 && assume_simplitigs) {
282+ std::cerr << " Inputting simplitigs is not compatible with frequency filterring." << std::endl;
283+ return usage_subcommand (subcommand);
266284 }
267285 if (k < 32 ) {
268- return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs);
286+ return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs, min_frequency );
269287 } else if (k < 64 ) {
270- return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs);
288+ return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs, min_frequency );
271289 } else {
272- return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs);
290+ return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, d_max, of, maskf, complements, false , algorithm, false , assume_simplitigs, min_frequency );
273291 }
274292}
275293
@@ -327,11 +345,11 @@ int camel_optimize(int argc, char **argv) {
327345 return usage_subcommand (subcommand);
328346 }
329347 if (k < 32 ) {
330- return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false );
348+ return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false , 1 );
331349 } else if (k < 64 ) {
332- return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false );
350+ return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false , 1 );
333351 } else {
334- return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false );
352+ return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, 0 , of, nullptr , complements, true , algorithm, false , false , 1 );
335353 }
336354}
337355
@@ -379,11 +397,11 @@ int camel_lowerbound(int argc, char **argv) {
379397 return usage_subcommand (subcommand);
380398 }
381399 if (k < 32 ) {
382- return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false );
400+ return kmercamel (kmer_dict64_t (), kmer64_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false , 1 );
383401 } else if (k < 64 ) {
384- return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false );
402+ return kmercamel (kmer_dict128_t (), kmer128_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false , 1 );
385403 } else {
386- return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false );
404+ return kmercamel (kmer_dict256_t (), kmer256_t (0 ), path, k, 0 , of, nullptr , complements, false , " global" , true , false , 1 );
387405 }
388406}
389407
0 commit comments