@@ -41,8 +41,6 @@ process blastn {
4141
4242 output:
4343 tuple val(sample_id), val(db_id), path(" ${ sample_id} _${ db_id} _blast.csv" ), emit: blast_report, optional:true
44- tuple val(sample_id), val(db_id), path(" ${ sample_id} _${ db_id} _seq_description" ), emit: seq_description, optional:true
45- tuple val(sample_id), val(db_id), path(" ${ sample_id} _${ db_id} _lineages.tsv" ), emit: lineage, optional:true
4644 tuple val(sample_id), path(" ${ sample_id} _${ db_id} _blastn_provenance.yml" ), emit: provenance
4745
4846 script:
@@ -52,7 +50,6 @@ process blastn {
5250 echo "${ seq.seqString} " >> ${ sample_id} .fa
5351
5452 export BLASTDB="${ db_dir} "
55- export TAXONKIT_DB="${ params.taxonkit_db} "
5653
5754 echo "query_seq_id,subject_accession,subject_strand,query_length,query_start,query_end,subject_length,subject_start,subject_end,alignment_length,percent_identity,percent_coverage,num_mismatch,num_gaps,e_value,bitscore,subject_taxids,subject_names" > ${ sample_id} _${ db_id} _blast.csv
5855
@@ -65,11 +62,6 @@ process blastn {
6562 -outfmt "6 qseqid saccver sstrand qlen qstart qend slen sstart send length pident qcovhsp mismatch gaps evalue bitscore staxids sscinames" \
6663 | tr \$ "\\ t" "," >> ${ sample_id} _${ db_id} _blast.csv
6764
68- get_taxids.py --input ${ sample_id} _${ db_id} _blast.csv > ${ sample_id} _${ db_id} _taxids.csv
69- printf 'query_taxid\\ tlineage\\ tlineage_taxids\\ tquery_taxon_name\\ tlineage_ranks\\ n' > ${ sample_id} _${ db_id} _lineages.tsv
70- taxonkit lineage -R -n -t ${ sample_id} _${ db_id} _taxids.csv >> ${ sample_id} _${ db_id} _lineages.tsv
71- mv ${ sample_id} _${ db_id} _blast.csv ${ sample_id} _${ db_id} _blast_tmp.csv
72- bind_taxonkit.py -f ${ sample_id} _${ db_id} _lineages.tsv -b ${ sample_id} _${ db_id} _blast_tmp.csv > ${ sample_id} _${ db_id} _blast.csv
7365
7466 if [ "${ params.no_db_metadata} " == "false" ]; then
7567 mv ${ sample_id} _${ db_id} _blast.csv ${ sample_id} _${ db_id} _blast_tmp.csv
@@ -86,15 +78,130 @@ process blastn {
8678 value: ${ params.minid}
8779 - parameter: "qcov_hsp_perc"
8880 value: ${ params.mincov}
81+ databases:
82+ - database_name: ${ db_name}
83+ database_version: \$ (grep "version" ${ db_dir} /metadata.json | cut -d" " -f4 | sed 's/"//g;s/,//g')
84+ files:
85+ \$ (sha256sum \$ (readlink -f ${ db_dir} )/${ db_name} * | awk '{ printf(" - filename: \\ "%s\\ "\\ n sha256: \\ "%s\\ "\\ n", \$ 2, \$ 1) }')
86+ EOL_VERSIONS
87+
88+ """
89+ }
90+
/*
 * blastn_ncbi
 *
 * Runs blastn for a single query sequence against the NCBI `core_nt` database,
 * either remotely (`params.remote_ncbi` true, adds `-remote`) or against a
 * local copy located in `db_path` (adds `-num_threads`).
 *
 * Input:
 *   seq      - sequence record; `seq.id` is used as the sample id and
 *              `seq.seqString` as the query sequence.
 *   db_path  - directory exported as BLASTDB and read for database metadata.
 *
 * Output:
 *   blast_report - (sample_id, "nt", <sample_id>_nt_blast.csv), optional
 *   provenance   - (sample_id, <sample_id>_blastn_nt_provenance.yml)
 */
process blastn_ncbi {

    tag { sample_id + ' / nt' }

    // A remote search needs almost no local resources; a local search is sized for the full database.
    cpus params.remote_ncbi ? 1 : 32

    memory params.remote_ncbi ? "2 GB" : "128 GB"

    publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}*blast*"

    input:
    tuple val(seq), path(db_path)

    output:
    tuple val(sample_id), val("nt"), path("${sample_id}_nt_blast.csv"), emit: blast_report, optional: true
    tuple val(sample_id), path("${sample_id}_blastn_nt_provenance.yml"), emit: provenance

    script:
    sample_id = seq.id
    // -remote and -num_threads are mutually exclusive: only one of the two is ever passed.
    remote = params.remote_ncbi ? "-remote" : ""
    threads = params.remote_ncbi ? "" : "-num_threads ${task.cpus}"

    """
    echo ">${sample_id}" > ${sample_id}.fa
    echo "${seq.seqString}" >> ${sample_id}.fa

    echo "query_seq_id,subject_accession,subject_strand,query_length,query_start,query_end,subject_length,subject_start,subject_end,alignment_length,percent_identity,percent_coverage,num_mismatch,num_gaps,e_value,bitscore,subject_taxids,subject_names" > ${sample_id}_nt_blast.csv

    export BLASTDB="${db_path}"

    blastn \
      ${remote} \
      ${threads} \
      -db core_nt \
      -perc_identity ${params.minid} \
      -qcov_hsp_perc ${params.mincov} \
      -query ${sample_id}.fa \
      -outfmt "6 qseqid saccver sstrand qlen qstart qend slen sstart send length pident qcovhsp mismatch gaps evalue bitscore staxids sscinames" \
      | tr \$"\\t" "," >> ${sample_id}_nt_blast.csv

    if [ "${params.no_db_metadata}" == "false" ]; then
      mv ${sample_id}_nt_blast.csv ${sample_id}_blast_tmp.csv
      add_db_metadata.py -m ${db_path}/core_nt-nucl-metadata.json -b ${sample_id}_blast_tmp.csv -d "core_nt" > ${sample_id}_nt_blast.csv
    fi

    cat <<-EOL_PROVENANCE > ${sample_id}_blastn_nt_provenance.yml
    - process_name: "${task.process}"
      tools:
        - tool_name: blastn
          tool_version: \$(blastn -version | head -n1 | sed 's/blastn: //g')
          parameters:
            - parameter: "perc_identity"
              value: ${params.minid}
            - parameter: "qcov_hsp_perc"
              value: ${params.mincov}
      databases:
    EOL_PROVENANCE

    # Compare against the literal string "true": a bare [ false ] is a non-empty
    # string test and always succeeds, which would skip the local-database branch.
    # The here-documents below stay at the base indent because <<- strips tabs only,
    # so an indented terminator line would not be recognised.
    # NOTE(review): the local branch reads metadata.json while add_db_metadata.py above
    # reads core_nt-nucl-metadata.json -- confirm which file carries the version.
    if [ "${params.remote_ncbi}" == "true" ]; then
    cat <<-EOL_PROVENANCE >> ${sample_id}_blastn_nt_provenance.yml
        - database_name: core_nt
          database_version: N/A
    EOL_PROVENANCE
    else
    cat <<-EOL_PROVENANCE >> ${sample_id}_blastn_nt_provenance.yml
        - database_name: ${db_path}
          database_version: \$(grep "version" ${db_path}/metadata.json | cut -d" " -f4 | sed 's/"//g;s/,//g')
          files:
    \$(sha256sum \$(readlink -f ${db_path})/* | awk '{ printf("        - filename: \\"%s\\"\\n          sha256: \\"%s\\"\\n", \$2, \$1) }')
    EOL_PROVENANCE
    fi

    """
}
167+
168+ process taxonkit_annotation {
169+
170+ tag { sample_id + ' / ' + db_id }
171+
172+ publishDir " ${ params.outdir} /${ sample_id} " , mode: ' copy' , pattern: " ${ sample_id} _${ db_id} *"
173+
174+ input:
175+ tuple val(sample_id), val(db_id), path(blast_results)
176+
177+ output:
178+ tuple val(sample_id), val(db_id), path(" ${ sample_id} _${ db_id} _blast_anno.csv" ), emit: blast_report, optional:true
179+ tuple val(sample_id), val(db_id), path(" ${ sample_id} _${ db_id} _seq_description" ), emit: seq_description, optional:true
180+ tuple val(sample_id), val(db_id), path(" ${ sample_id} _${ db_id} _lineages.tsv" ), emit: lineage, optional:true
181+ tuple val(sample_id), path(" ${ sample_id} _${ db_id} _taxonkit_provenance.yml" ), emit: provenance
182+
183+ script:
184+
185+ """
186+ export TAXONKIT_DB="${ params.taxonkit_db} "
187+
188+ get_taxids.py --input ${ blast_results} > ${ sample_id} _${ db_id} _taxids.csv
189+ printf 'query_taxid\\ tlineage\\ tlineage_taxids\\ tquery_taxon_name\\ tlineage_ranks\\ n' > ${ sample_id} _${ db_id} _lineages.tsv
190+ taxonkit lineage -R -n -t ${ sample_id} _${ db_id} _taxids.csv >> ${ sample_id} _${ db_id} _lineages.tsv
191+ bind_taxonkit.py -f ${ sample_id} _${ db_id} _lineages.tsv -b ${ sample_id} _${ db_id} _blast.csv > ${ sample_id} _${ db_id} _blast_anno.csv
192+
193+ cat <<-EOL_VERSIONS > ${ sample_id} _${ db_id} _taxonkit_provenance.yml
194+ - process_name: "${ task.process} "
195+ tools:
89196 - tool_name: taxonkit
90197 tool_version: \$ (taxonkit version | cut -d' ' -f2)
91198 - tool_name: python
92199 tool_version: \$ (python3 --version | cut -d' ' -f2)
93200 databases:
94- - database_name: ${ db_name }
95- database_version: \$ (grep "version" ${ db_dir } /metadata.json | cut -d" " -f4 | sed 's/"//g;s/,//g')
201+ - database_name: taxonkit
202+ database_version: \$ (grep "version" \$ {TAXONKIT_DB }/metadata.json | cut -d" " -f4 | sed 's/"//g;s/,//g')
96203 files:
97- \$ (sha256sum \$ (readlink -f ${ db_dir } )/${ db_name } * | awk '{ printf(" - filename: \\ "%s\\ "\\ n sha256: \\ "%s\\ "\\ n", \$ 2, \$ 1) }')
204+ \$ (sha256sum \$ (readlink -f \$ {TAXONKIT_DB })/*.dmp | awk '{ printf(" - filename: \\ "%s\\ "\\ n sha256: \\ "%s\\ "\\ n", \$ 2, \$ 1) }')
98205 EOL_VERSIONS
99206
100207 """
@@ -120,15 +227,6 @@ process filter_by_regex {
120227 """
121228 filter_by_regex.py -i ${ full_blast_report} -r ${ filter_regexes} > ${ sample_id} _${ db_id} _blast_filtered.csv
122229
123- # cat <<-EOL_VERSIONS > ${ sample_id} _${ db_id} _filter_regex_provenance.yml
124- # - process_name: "${ task.process} "
125- # tools:
126- # - tool_name: python
127- # tool_version: \$ (python3 --version | cut -d' ' -f2)
128- # parameters:
129- # - parameter: filter_regexes
130- # value: ${ filter_regexes}
131- # EOL_VERSIONS
132230 """
133231}
134232
@@ -166,6 +264,7 @@ process build_report {
166264
167265 input:
168266 path(collected_blast)
267+ path(collected_blast_ncbi)
169268 path(database_csv)
170269
171270 output:
@@ -174,7 +273,7 @@ process build_report {
174273
175274 script:
176275 """
177- report.py --blast ${ collected_blast} --db ${ database_csv} --output ${ params.run_name} _report.html
276+ report.py --blast ${ collected_blast} --ncbi ${ collected_blast_ncbi } -- db ${ database_csv} --output ${ params.run_name} _report.html
178277
179278 cat <<-EOL_VERSIONS > report_provenance.yml
180279 - process_name: "${ task.process} "
0 commit comments