Skip to content

Commit 34f1b17

Browse files
author
Sherrie Wang
committed
first commit
0 parents  commit 34f1b17

File tree

7 files changed

+338
-0
lines changed

7 files changed

+338
-0
lines changed

bin/bind_taxonkit.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import pandas as pd
5+
import numpy as np
6+
7+
def main(args):
8+
cols = 'subject_taxids,lineage,name,rank'.split(',')
9+
#taxon_results = pd.read_csv('F1910235_taxon_results.txt',sep = '\t', names=cols)
10+
taxon_results = pd.read_csv(args.taxonresult,sep = '\t', names=cols)
11+
12+
blast_results = pd.read_csv(args.blastresult,sep = ',')
13+
taxon_results = taxon_results.dropna()
14+
15+
16+
17+
conditions = [
18+
(taxon_results['rank'] == "genus"),
19+
(taxon_results['rank'] == "species"),
20+
(taxon_results['rank'] == "strain")
21+
22+
]
23+
24+
choices_species = [None, taxon_results['lineage'].apply(lambda x: x.split(';')[-1]), taxon_results['lineage'].apply(lambda x: x.split(';')[-2])]
25+
choices_genus = [taxon_results['lineage'].apply(lambda x: x.split(';')[-1]), taxon_results['lineage'].apply(lambda x: x.split(';')[-2]),taxon_results['lineage'].apply(lambda x: x.split(';')[-3])]
26+
27+
taxon_results['species'] = np.select(conditions,choices_species, default = taxon_results['lineage'].apply(lambda x: x.split(';')[-1]))
28+
taxon_results['genus'] = np.select(conditions,choices_genus, default = taxon_results['lineage'].apply(lambda x: x.split(';')[-2]))
29+
30+
31+
merged = pd.merge(blast_results,taxon_results[['subject_taxids','species','genus']],on='subject_taxids', how='left')
32+
fil = merged['species'].str.contains('uncultured')
33+
filtered_merged = merged[~fil]
34+
35+
filtered_merged.to_csv(args.outfile)
36+
37+
if __name__ == "__main__":
38+
parser = argparse.ArgumentParser()
39+
40+
parser.add_argument('-f','--taxonresult')
41+
parser.add_argument('-b','--blastresult')
42+
parser.add_argument('-o','--outfile')
43+
args = parser.parse_args()
44+
main(args)

bin/filter_best_bitscore.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import csv
5+
import json
6+
import sys
7+
8+
9+
def parse_blast_report(blast_report_path):
10+
with open(blast_report_path, 'r') as f:
11+
header_line = f.readline().strip()
12+
header_fieldnames = header_line.split(',')
13+
14+
int_fields = [
15+
'bitscore',
16+
]
17+
parsed_blast_report = []
18+
with open(blast_report_path, 'r') as f:
19+
reader = csv.DictReader(f)
20+
for row in reader:
21+
for field in int_fields:
22+
row[field] = int(row[field])
23+
parsed_blast_report.append(row)
24+
25+
return header_fieldnames, parsed_blast_report
26+
27+
28+
def determine_best_bitscore(parsed_blast_report):
29+
best_bitscore = 0
30+
for blast_record in parsed_blast_report:
31+
if blast_record['bitscore'] > best_bitscore:
32+
best_bitscore = blast_record['bitscore']
33+
34+
return best_bitscore
35+
36+
37+
def main(args):
38+
output_fieldnames, blast_report = parse_blast_report(args.input)
39+
best_bitscore = determine_best_bitscore(blast_report)
40+
41+
filtered_blast_report = list(filter(lambda x: x['bitscore'] == best_bitscore, blast_report))
42+
43+
writer = csv.DictWriter(sys.stdout, fieldnames=output_fieldnames, dialect='unix', quoting=csv.QUOTE_MINIMAL)
44+
writer.writeheader()
45+
for row in filtered_blast_report:
46+
writer.writerow(row)
47+
48+
49+
if __name__ == '__main__':
50+
parser = argparse.ArgumentParser()
51+
parser.add_argument('-i', '--input')
52+
args = parser.parse_args()
53+
main(args)

bin/seq_qc.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import json
5+
6+
def parse_fasta(fasta_path):
7+
fasta = {}
8+
seq_id = ''
9+
seq = []
10+
with open(fasta_path, 'r') as f:
11+
for line in f:
12+
if line.startswith('>'):
13+
seq_id = line.strip().lstrip('>').split()[0]
14+
else:
15+
seq.append(line.strip())
16+
17+
fasta['id'] = seq_id
18+
fasta['seq'] = ''.join(seq)
19+
20+
return fasta
21+
22+
def main(args):
23+
iupac_ambiguous_bases = set([
24+
'M', 'R', 'W', 'S', 'Y', 'K',
25+
'V', 'H', 'D', 'B',
26+
])
27+
fasta = parse_fasta(args.input)
28+
seq_length = len(fasta['seq'])
29+
num_ambiguous_bases = 0
30+
num_n_bases = 0
31+
32+
for base in fasta['seq']:
33+
if base.upper() in iupac_ambiguous_bases:
34+
num_ambiguous_bases += 1
35+
36+
for base in fasta['seq']:
37+
if base.upper() == 'N':
38+
num_n_bases += 1
39+
40+
print('seq_length,num_ambiguous_bases,num_n_bases')
41+
print(','.join(map(str, [seq_length, num_ambiguous_bases, num_n_bases])))
42+
43+
44+
if __name__ == '__main__':
45+
parser = argparse.ArgumentParser()
46+
parser.add_argument('-i', '--input')
47+
args = parser.parse_args()
48+
main(args)

environments/environment.yml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
name: its-nf
2+
channels:
3+
- conda-forge
4+
- bioconda
5+
- defaults
6+
dependencies:
7+
- python=3
8+
- blast=2.13.0
9+
- taxonkit=0.14.1
10+
- pandas=1.5.3
11+
- biopython=1.81
12+
- kraken2=2.1.3

main.nf

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
#!/usr/bin/env nextflow
2+
3+
nextflow.enable.dsl = 2
4+
5+
include { seq_qc } from './modules/blast.nf'
6+
include { blastn } from './modules/blast.nf'
7+
include { filter_best_bitscore } from './modules/blast.nf'
8+
9+
workflow {
10+
11+
if (params.samplesheet_input != 'NO_FILE') {
12+
ch_fasta = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['FILE']] }
13+
} else {
14+
ch_fasta = Channel.fromPath(params.fasta_search_path)
15+
}
16+
17+
ch_db = Channel.fromPath(params.db_dir).combine(Channel.of(params.db_name))
18+
19+
ch_seqs = ch_fasta.splitFasta(file: true)
20+
21+
main:
22+
seq_qc(ch_seqs)
23+
ch_blast = blastn(ch_seqs.combine(ch_db))
24+
ch_best_blast = filter_best_bitscore(ch_blast.taxon_results)
25+
26+
ch_blast.taxon_results
27+
.collectFile(it -> it[1], name: "combined_blast_species_genus_results.csv", storeDir: params.outdir,keepHeader: true, skip: 1)
28+
29+
ch_best_blast.blast_best_bitscore_csv
30+
.collectFile(it -> it[1], name: "combined_top1_results.csv", storeDir: params.outdir,keepHeader: true, skip: 1)
31+
}

modules/blast.nf

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
process seq_qc {
2+
3+
tag { sample_id }
4+
5+
executor 'local'
6+
7+
publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_seq_qc.csv"
8+
9+
input:
10+
tuple path(seq)
11+
12+
output:
13+
tuple val(sample_id), path("${sample_id}_seq_qc.csv"), emit: seq_qc_csv
14+
15+
script:
16+
sample_id = seq.getName().split('\\.')[0]
17+
"""
18+
seq_qc.py -i ${seq} > ${sample_id}_seq_qc.csv
19+
"""
20+
}
21+
22+
process blastn {
23+
//errorStrategy 'ignore'
24+
25+
26+
publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}*"
27+
28+
input:
29+
tuple path(query), path(db_dir), val(db_name)
30+
31+
output:
32+
tuple val(sample_id), path("${sample_id}_blast.csv"), emit: blast_csv, optional:true
33+
tuple val(sample_id), path("${sample_id}_seq_description"), emit: seq_description, optional:true
34+
tuple val(sample_id), path("${sample_id}_blast_species_genus_results.csv"), emit: taxon_results, optional:true
35+
tuple val(sample_id), path("${sample_id}_taxon_results.txt"), emit: raw_taxon_results, optional:true
36+
37+
script:
38+
sample_id = query.getName().split('\\.')[0]
39+
"""
40+
export BLASTDB="${db_dir}"
41+
42+
echo "query_seq_id,subject_accession,subject_strand,query_length,query_start,query_end,subject_length,subject_start,subject_end,alignment_length,percent_identity,percent_coverage,num_mismatch,num_gaps,e_value,bitscore,subject_taxids,subject_names" > ${sample_id}_blast.csv
43+
44+
blastn \
45+
-db ${db_name} \
46+
-num_threads ${task.cpus} \
47+
-perc_identity ${params.minid} \
48+
-qcov_hsp_perc ${params.mincov} \
49+
-query ${query} \
50+
-outfmt "6 qseqid saccver sstrand qlen qstart qend slen sstart send length pident qcovhsp mismatch gaps evalue bitscore staxids sscinames" \
51+
| tr \$"\\t" "," >> ${sample_id}_blast.csv
52+
53+
tail -qn+2 ${sample_id}_blast.csv | cut -d',' -f2 | sort -u > seqids
54+
blastdbcmd -db ${db_name} -entry_batch seqids | grep '>' > ${sample_id}_seq_description
55+
56+
57+
58+
59+
if [ "${db_dir}" == "2022-11-16_nt" ] || [ "${db_dir}" == "refseq_its" ] ; then
60+
tail -qn+2 ${sample_id}_blast.csv | cut -d',' -f17 | sort -u > taxids
61+
taxonkit lineage -r -n taxids > ${sample_id}_taxon_results.txt
62+
bind_taxonkit.py -f ${sample_id}_taxon_results.txt -b ${sample_id}_blast.csv -o ${sample_id}_blast_species_genus_results.csv
63+
fi
64+
65+
66+
"""
67+
}
68+
69+
70+
71+
process filter_best_bitscore {
72+
73+
tag { sample_id }
74+
75+
executor 'local'
76+
77+
publishDir "${params.outdir}/${sample_id}", mode: 'copy', pattern: "${sample_id}_blast_best_bitscore.csv"
78+
79+
input:
80+
tuple val(sample_id), path(full_blast_report)
81+
82+
output:
83+
tuple val(sample_id), path("${sample_id}_blast_best_bitscore.csv"), emit: blast_best_bitscore_csv
84+
85+
script:
86+
"""
87+
filter_best_bitscore.py -i ${full_blast_report} > ${sample_id}_blast_best_bitscore.csv
88+
"""
89+
}

nextflow.config

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
manifest {
2+
author = 'Sherrie Wang, Dan Fornika'
3+
name = 'BCCDC-PHL/its-nf'
4+
version = '0.1.0'
5+
description = 'BCCDC-PHL Taxonomic Assignment from ITS sequences'
6+
mainScript = 'main.nf'
7+
nextflowVersion = '>=20.01.0'
8+
}
9+
10+
params {
11+
profile = false
12+
cache = ''
13+
outdir = 'results'
14+
fasta_exts = ['*.fa', '*.fasta', '*.fna']
15+
fasta_search_path = makeFastaSearchPath(fasta_exts)
16+
fasta_input = 'NO_FILE'
17+
samplesheet_input = 'NO_FILE'
18+
db_dir = 'NO_FILE'
19+
db_name = 'db'
20+
db_version = 'current'
21+
minid = 95.0
22+
mincov = 95.0
23+
pipeline_short_name = parsePipelineName(manifest.toMap().get('name'))
24+
pipeline_minor_version = parseMinorVersion(manifest.toMap().get('version'))
25+
}
26+
27+
def makeFastaSearchPath (fasta_exts) {
28+
def fasta_search_path = []
29+
for (ext in fasta_exts) {
30+
fasta_search_path.add(params.fasta_input.toString() + '/' + ext.toString())
31+
fasta_search_path.add(params.fasta_input.toString() + '/**/' + ext.toString())
32+
}
33+
return fasta_search_path
34+
}
35+
36+
def parseMinorVersion(version) {
37+
minor_version = version.split('\\.')[0..1].join('.')
38+
return minor_version
39+
}
40+
41+
def parsePipelineName(name) {
42+
short_name = name.split('/')[1]
43+
return short_name
44+
}
45+
46+
profiles {
47+
conda {
48+
process.conda = "$baseDir/environments/environment.yml"
49+
if (params.cache){
50+
conda.cacheDir = params.cache
51+
}
52+
}
53+
}
54+
55+
process {
56+
57+
withName: blastn {
58+
shell = ['/bin/bash', '-uo','pipefail' ]
59+
}
60+
61+
}

0 commit comments

Comments
 (0)