Skip to content

Commit 6f85247

Browse files
committed
add compatibility for second ncbi table
1 parent 68dce2a commit 6f85247

File tree

3 files changed

+154
-34
lines changed

3 files changed

+154
-34
lines changed

main.nf

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,19 @@ println "Current date and time: $formattedDateTime"
1717

1818
nextflow.enable.dsl = 2
1919

20-
include { hash_seqs } from './modules/hash_seqs.nf'
21-
include { seq_qc } from './modules/blast.nf'
22-
include { blastn } from './modules/blast.nf'
23-
include { filter_by_regex } from './modules/blast.nf'
24-
include { filter_best_bitscore } from './modules/blast.nf'
25-
include { build_report } from './modules/blast.nf'
26-
include { collect_provenance } from './modules/provenance.nf'
27-
include { pipeline_provenance } from './modules/provenance.nf'
20+
include { hash_seqs } from './modules/hash_seqs.nf'
21+
include { seq_qc } from './modules/blast.nf'
22+
include { blastn } from './modules/blast.nf'
23+
include { blastn_ncbi } from './modules/blast.nf'
24+
include { taxonkit_annotation as taxonkit_annotation_local } from './modules/blast.nf'
25+
include { taxonkit_annotation as taxonkit_annotation_ncbi } from './modules/blast.nf'
26+
include { filter_by_regex as filter_by_regex_local } from './modules/blast.nf'
27+
include { filter_by_regex as filter_by_regex_ncbi } from './modules/blast.nf'
28+
include { filter_best_bitscore as filter_best_bitscore_local } from './modules/blast.nf'
29+
include { filter_best_bitscore as filter_best_bitscore_ncbi } from './modules/blast.nf'
30+
include { build_report } from './modules/blast.nf'
31+
include { collect_provenance } from './modules/provenance.nf'
32+
include { pipeline_provenance } from './modules/provenance.nf'
2833

2934

3035
workflow {
@@ -49,6 +54,8 @@ workflow {
4954
ch_db = Channel.of()
5055
}
5156

57+
ch_ncbi_db = Channel.fromPath(params.ncbi_db)
58+
5259
ch_seqs = ch_fasta.splitFasta(record: [id: true, seqString: true])
5360

5461
main:
@@ -58,20 +65,31 @@ workflow {
5865

5966
seq_qc(ch_seqs)
6067
ch_blast = blastn(ch_seqs.combine(ch_db)).blast_report
61-
ch_blast_prov = blastn.out.provenance.map{}
68+
ch_blast = taxonkit_annotation_local(ch_blast).blast_report
69+
70+
ch_blast_ncbi = blastn_ncbi(ch_seqs.combine(ch_ncbi_db)).blast_report
71+
ch_blast_ncbi = taxonkit_annotation_ncbi(ch_blast_ncbi).blast_report
6272

6373
if (params.filter_regexes != 'NO_FILE') {
6474
ch_regexes = Channel.fromPath(params.filter_regexes)
65-
ch_blast = filter_by_regex(ch_blast.combine(ch_regexes)).blast_filtered
75+
ch_blast = filter_by_regex_local(ch_blast.combine(ch_regexes)).blast_filtered
76+
ch_blast_ncbi = filter_by_regex_ncbi(ch_blast_ncbi.combine(ch_regexes)).blast_filtered
6677
}
6778

6879
ch_blast_collect = ch_blast.collectFile(it -> it[2], name: "collected_blast.csv", storeDir: params.outdir, keepHeader: true, skip: 1)
80+
81+
ch_blast_ncbi_collect = ch_blast_ncbi.collectFile(it -> it[2], name: "collected_blast_ncbi.csv", storeDir: params.outdir, keepHeader: true, skip: 1)
6982

70-
filter_best_bitscore(ch_blast)
83+
filter_best_bitscore_local(ch_blast)
84+
85+
filter_best_bitscore_ncbi(ch_blast_ncbi)
7186

72-
filter_best_bitscore.out.blast_best_bitscore_csv.collectFile(it -> it[1], name: "collected_blast_best_bitscore.csv", storeDir: params.outdir, keepHeader: true, skip: 1)
87+
filter_best_bitscore_local.out.blast_best_bitscore_csv.collectFile(it -> it[1], name: "collected_blast_best_bitscore.csv", storeDir: params.outdir, keepHeader: true, skip: 1)
88+
89+
filter_best_bitscore_ncbi.out.blast_best_bitscore_csv.collectFile(it -> it[1], name: "collected_blast_ncbi_best_bitscore.csv", storeDir: params.outdir, keepHeader: true, skip: 1)
90+
7391

74-
build_report(ch_blast_collect, Channel.fromPath(params.databases))
92+
build_report(ch_blast_collect, ch_blast_ncbi_collect, Channel.fromPath(params.databases))
7593

7694
// Build pipeline provenance
7795
ch_pipeline_provenance = pipeline_provenance(ch_pipeline_metadata, build_report.out.provenance)
@@ -80,6 +98,7 @@ workflow {
8098
ch_provenance = hash_seqs.out.provenance
8199
ch_provenance = ch_provenance.join(seq_qc.out.provenance).map{ it -> [it[0], [it[1]] << it[2]] }
82100
ch_provenance = ch_provenance.join(blastn.out.provenance.groupTuple()).map{ it -> [it[0], (it[1] + it[2]).flatten() ] }
101+
ch_provenance = ch_provenance.join(blastn_ncbi.out.provenance.groupTuple()).map{ it -> [it[0], (it[1] + it[2]).flatten() ] }
83102
//ch_provenance = ch_provenance.join(filter_best_bitscore.out.provenance.groupTuple()).map{ it -> [it[0], (it[1] + it[2]).flatten()] }
84103
ch_provenance = ch_provenance.join(seq_qc.out.provenance.map{it -> it[0]}.combine(ch_pipeline_provenance)).map{ it -> [it[0], it[1] << it[2]] }
85104
collect_provenance(ch_provenance)

modules/blast.nf

Lines changed: 120 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,6 @@ process blastn {
4141

4242
output:
4343
tuple val(sample_id), val(db_id), path("${sample_id}_${db_id}_blast.csv"), emit: blast_report, optional:true
44-
tuple val(sample_id), val(db_id), path("${sample_id}_${db_id}_seq_description"), emit: seq_description, optional:true
45-
tuple val(sample_id), val(db_id), path("${sample_id}_${db_id}_lineages.tsv"), emit: lineage, optional:true
4644
tuple val(sample_id), path("${sample_id}_${db_id}_blastn_provenance.yml"), emit: provenance
4745

4846
script:
@@ -52,7 +50,6 @@ process blastn {
5250
echo "${seq.seqString}" >> ${sample_id}.fa
5351
5452
export BLASTDB="${db_dir}"
55-
export TAXONKIT_DB="${params.taxonkit_db}"
5653
5754
echo "query_seq_id,subject_accession,subject_strand,query_length,query_start,query_end,subject_length,subject_start,subject_end,alignment_length,percent_identity,percent_coverage,num_mismatch,num_gaps,e_value,bitscore,subject_taxids,subject_names" > ${sample_id}_${db_id}_blast.csv
5855
@@ -65,11 +62,6 @@ process blastn {
6562
-outfmt "6 qseqid saccver sstrand qlen qstart qend slen sstart send length pident qcovhsp mismatch gaps evalue bitscore staxids sscinames" \
6663
| tr \$"\\t" "," >> ${sample_id}_${db_id}_blast.csv
6764
68-
get_taxids.py --input ${sample_id}_${db_id}_blast.csv > ${sample_id}_${db_id}_taxids.csv
69-
printf 'query_taxid\\tlineage\\tlineage_taxids\\tquery_taxon_name\\tlineage_ranks\\n' > ${sample_id}_${db_id}_lineages.tsv
70-
taxonkit lineage -R -n -t ${sample_id}_${db_id}_taxids.csv >> ${sample_id}_${db_id}_lineages.tsv
71-
mv ${sample_id}_${db_id}_blast.csv ${sample_id}_${db_id}_blast_tmp.csv
72-
bind_taxonkit.py -f ${sample_id}_${db_id}_lineages.tsv -b ${sample_id}_${db_id}_blast_tmp.csv > ${sample_id}_${db_id}_blast.csv
7365
7466
if [ "${params.no_db_metadata}" == "false" ]; then
7567
mv ${sample_id}_${db_id}_blast.csv ${sample_id}_${db_id}_blast_tmp.csv
@@ -86,15 +78,130 @@ process blastn {
8678
value: ${params.minid}
8779
- parameter: "qcov_hsp_perc"
8880
value: ${params.mincov}
81+
databases:
82+
- database_name: ${db_name}
83+
database_version: \$(grep "version" ${db_dir}/metadata.json | cut -d" " -f4 | sed 's/"//g;s/,//g')
84+
files:
85+
\$(sha256sum \$(readlink -f ${db_dir})/${db_name}* | awk '{ printf(" - filename: \\"%s\\"\\n sha256: \\"%s\\"\\n", \$2, \$1) }')
86+
EOL_VERSIONS
87+
88+
"""
89+
}
90+
91+
process blastn_ncbi {
92+
93+
tag { sample_id + ' / nt' }
94+
95+
cpus params.remote_ncbi ? 1 : 32
96+
97+
memory params.remote_ncbi ? "2 GB" : "128 GB"
98+
99+
publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}*blast*"
100+
101+
input:
102+
tuple val(seq), path(db_path)
103+
104+
output:
105+
tuple val(sample_id), val("nt"), path("${sample_id}_nt_blast.csv") , emit: blast_report, optional: true
106+
tuple val(sample_id), path("${sample_id}_blastn_nt_provenance.yml"), emit: provenance
107+
108+
script:
109+
sample_id = seq.id
110+
remote = params.remote_ncbi ? "-remote" : ""
111+
threads = params.remote_ncbi ? "" : "-num_threads ${task.cpus}"
112+
113+
"""
114+
echo ">${sample_id}" > ${sample_id}.fa
115+
echo "${seq.seqString}" >> ${sample_id}.fa
116+
117+
echo "query_seq_id,subject_accession,subject_strand,query_length,query_start,query_end,subject_length,subject_start,subject_end,alignment_length,percent_identity,percent_coverage,num_mismatch,num_gaps,e_value,bitscore,subject_taxids,subject_names" > ${sample_id}_nt_blast.csv
118+
119+
export BLASTDB=${db_path}
120+
121+
blastn \
122+
${remote} \
123+
${threads} \
124+
-db core_nt \
125+
-perc_identity ${params.minid} \
126+
-qcov_hsp_perc ${params.mincov} \
127+
-query ${sample_id}.fa \
128+
-outfmt "6 qseqid saccver sstrand qlen qstart qend slen sstart send length pident qcovhsp mismatch gaps evalue bitscore staxids sscinames" \
129+
| tr \$"\\t" "," >> ${sample_id}_nt_blast.csv
130+
131+
132+
if [ "${params.no_db_metadata}" == "false" ]; then
133+
mv ${sample_id}_nt_blast.csv ${sample_id}_blast_tmp.csv
134+
add_db_metadata.py -m ${db_path}/core_nt-nucl-metadata.json -b ${sample_id}_blast_tmp.csv -d "core_nt" > ${sample_id}_nt_blast.csv
135+
fi
136+
137+
138+
cat <<-EOL_PROVENANCE > ${sample_id}_blastn_nt_provenance.yml
139+
- process_name: "${task.process}"
140+
tools:
141+
- tool_name: blastn
142+
tool_version: \$(blastn -version | head -n1 | sed 's/blastn: //g')
143+
parameters:
144+
- parameter: "perc_identity"
145+
value: ${params.minid}
146+
- parameter: "qcov_hsp_perc"
147+
value: ${params.mincov}
148+
databases:
149+
EOL_PROVENANCE
150+
151+
if [ ${params.remote_ncbi} ] ; then
152+
cat <<-EOL_PROVENANCE >> ${sample_id}_blastn_nt_provenance.yml
153+
- database_name: core_nt
154+
database_version: N/A
155+
EOL_PROVENANCE
156+
else
157+
cat <<-EOL_PROVENANCE >> ${sample_id}_blastn_nt_provenance.yml
158+
- database_name: ${db_path}
159+
database_version: \$(grep "version" ${db_path}/metadata.json | cut -d" " -f4 | sed 's/"//g;s/,//g')
160+
files:
161+
\$(sha256sum \$(readlink -f ${db_path})/* | awk '{ printf(" - filename: \\"%s\\"\\n sha256: \\"%s\\"\\n", \$2, \$1) }')
162+
EOL_PROVENANCE
163+
fi
164+
165+
"""
166+
}
167+
168+
process taxonkit_annotation {
169+
170+
tag { sample_id + ' / ' + db_id }
171+
172+
publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_${db_id}*"
173+
174+
input:
175+
tuple val(sample_id), val(db_id), path(blast_results)
176+
177+
output:
178+
tuple val(sample_id), val(db_id), path("${sample_id}_${db_id}_blast_anno.csv"), emit: blast_report, optional:true
179+
tuple val(sample_id), val(db_id), path("${sample_id}_${db_id}_seq_description"), emit: seq_description, optional:true
180+
tuple val(sample_id), val(db_id), path("${sample_id}_${db_id}_lineages.tsv"), emit: lineage, optional:true
181+
tuple val(sample_id), path("${sample_id}_${db_id}_taxonkit_provenance.yml"), emit: provenance
182+
183+
script:
184+
185+
"""
186+
export TAXONKIT_DB="${params.taxonkit_db}"
187+
188+
get_taxids.py --input ${blast_results} > ${sample_id}_${db_id}_taxids.csv
189+
printf 'query_taxid\\tlineage\\tlineage_taxids\\tquery_taxon_name\\tlineage_ranks\\n' > ${sample_id}_${db_id}_lineages.tsv
190+
taxonkit lineage -R -n -t ${sample_id}_${db_id}_taxids.csv >> ${sample_id}_${db_id}_lineages.tsv
191+
bind_taxonkit.py -f ${sample_id}_${db_id}_lineages.tsv -b ${sample_id}_${db_id}_blast.csv > ${sample_id}_${db_id}_blast_anno.csv
192+
193+
cat <<-EOL_VERSIONS > ${sample_id}_${db_id}_taxonkit_provenance.yml
194+
- process_name: "${task.process}"
195+
tools:
89196
- tool_name: taxonkit
90197
tool_version: \$(taxonkit version | cut -d' ' -f2)
91198
- tool_name: python
92199
tool_version: \$(python3 --version | cut -d' ' -f2)
93200
databases:
94-
- database_name: ${db_name}
95-
database_version: \$(grep "version" ${db_dir}/metadata.json | cut -d" " -f4 | sed 's/"//g;s/,//g')
201+
- database_name: taxonkit
202+
database_version: \$(grep "version" \${TAXONKIT_DB}/metadata.json | cut -d" " -f4 | sed 's/"//g;s/,//g')
96203
files:
97-
\$(sha256sum \$(readlink -f ${db_dir})/${db_name}* | awk '{ printf(" - filename: \\"%s\\"\\n sha256: \\"%s\\"\\n", \$2, \$1) }')
204+
\$(sha256sum \$(readlink -f \${TAXONKIT_DB})/*.dmp | awk '{ printf(" - filename: \\"%s\\"\\n sha256: \\"%s\\"\\n", \$2, \$1) }')
98205
EOL_VERSIONS
99206
100207
"""
@@ -120,15 +227,6 @@ process filter_by_regex {
120227
"""
121228
filter_by_regex.py -i ${full_blast_report} -r ${filter_regexes} > ${sample_id}_${db_id}_blast_filtered.csv
122229
123-
# cat <<-EOL_VERSIONS > ${sample_id}_${db_id}_filter_regex_provenance.yml
124-
# - process_name: "${task.process}"
125-
# tools:
126-
# - tool_name: python
127-
# tool_version: \$(python3 --version | cut -d' ' -f2)
128-
# parameters:
129-
# - parameter: filter_regexes
130-
# value: ${filter_regexes}
131-
# EOL_VERSIONS
132230
"""
133231
}
134232

@@ -166,6 +264,7 @@ process build_report {
166264

167265
input:
168266
path(collected_blast)
267+
path(collected_blast_ncbi)
169268
path(database_csv)
170269

171270
output:
@@ -174,7 +273,7 @@ process build_report {
174273

175274
script:
176275
"""
177-
report.py --blast ${collected_blast} --db ${database_csv} --output ${params.run_name}_report.html
276+
report.py --blast ${collected_blast} --ncbi ${collected_blast_ncbi} --db ${database_csv} --output ${params.run_name}_report.html
178277
179278
cat <<-EOL_VERSIONS > report_provenance.yml
180279
- process_name: "${task.process}"

nextflow.config

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ params {
2020
no_db_metadata = false
2121
taxonkit_db = ''
2222
filter_regexes = 'NO_FILE'
23+
remote_ncbi = false
24+
ncbi_db = 'NO_FILE'
2325
minid = 95.0
2426
mincov = 95.0
2527
pipeline_short_name = parsePipelineName(manifest.toMap().get('name'))

0 commit comments

Comments
 (0)