Skip to content

Commit 2a40a15

Browse files
authored
Merge pull request #3 from BCCDC-PHL/add-ncbi-table
Add ncbi table
2 parents fce9e04 + 2de1791 commit 2a40a15

File tree

6 files changed

+205
-69
lines changed

6 files changed

+205
-69
lines changed

bin/custom_html.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,12 +77,15 @@ def build_dbnote(databases_df):
7777

7878
return f"<p>{DBNOTE}</p>"
7979

80+
def build_table_title(name):
81+
return f"""
82+
<h3 class = "first">{name}</h3>
83+
"""
8084

8185
def build_table(name, col_names, table_rows, hidden_rows=None):
8286
hidden_rows = '' if not hidden_rows else hidden_rows
8387
table_header = '</th>\n<th>'.join(col_names)
8488
return f'''
85-
<h3 class = "first">{name}</h3>
8689
<table>
8790
<tbody>
8891
<tr class="header">

bin/filter_best_bitscore.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,14 @@
77
import pandas as pd
88

99

10-
def filter_best_bitscore(df, group_col, score_col):
10+
def filter_dataframe(df, group_col, score_col):
1111
idxmax = df.groupby(group_col)[score_col].idxmax()
1212
df = df.loc[idxmax].reset_index(drop=True)
1313
return df
1414

1515
def main(args):
1616
blast_df = pd.read_csv(args.input)
17-
filtered_df = filter_best_bitscore(blast_df, args.group_col, args.score_col)
17+
filtered_df = filter_dataframe(blast_df, args.group_col, args.score_col)
1818
filtered_df.to_csv(args.output, index=False)
1919

2020

bin/report.py

Lines changed: 45 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,11 @@
44
import argparse
55
from functools import partial
66
import re
7-
from custom_html import HEAD, build_dbnote, build_table, build_row, FOOT, PAGEBREAK
7+
from custom_html import HEAD, build_dbnote, build_table,build_table_title, build_row, FOOT, PAGEBREAK
88

99

1010
EXPR_PRIORITY = re.compile(r'ATCC|LMG|type|NCTC')
11-
row_names = "subject_accession species bitscore percent_coverage percent_identity database_name extra_info".split()
12-
build_row_part = partial(build_row, row_names=row_names)
11+
ROW_NAMES = "subject_accession species bitscore percent_coverage percent_identity database_name extra_info".split()
1312

1413

1514
def parse_db_csv(filepath):
@@ -77,50 +76,64 @@ def parse_blast(filepath):
7776

7877
return df.fillna('N/A')
7978

80-
def main(args):
81-
blast_table = parse_blast(args.blast)
79+
def build_table_string(name, df, limit=20):
80+
build_row_partial = partial(build_row, row_names=ROW_NAMES)
81+
82+
if df.shape[0] < limit:
83+
str_rows = df.apply(build_row_partial, axis=1)
84+
str_rows = '\n'.join(str_rows)
85+
str_table = build_table(name, ROW_NAMES, str_rows)
86+
87+
else:
88+
N = df.shape[0]
89+
str_rows = df.iloc[0:limit].apply(build_row_partial, axis=1)
90+
str_rows = '\n'.join(str_rows)
91+
92+
hidden_str_rows = df.iloc[limit+1:min(N, 300)].apply(build_row_partial, axis=1)
93+
hidden_str_rows = '\n'.join(hidden_str_rows)
94+
95+
str_table = build_table(name, ROW_NAMES, str_rows, hidden_str_rows)
96+
97+
return str_table
8298

83-
outfile = open(args.output, "w")
84-
outfile.write(HEAD)
99+
def main(args):
100+
local_blast_table = parse_blast(args.blast)
101+
ncbi_blast_table = parse_blast(args.ncbi)
102+
ncbi_blast_table['extra_info'] = ''
85103

86104
database_df = parse_db_csv(args.db)
87105
DBNOTE = build_dbnote(database_df)
88106

89107
extra_info = extract_descriptions(database_df)
90108

91-
blast_table = blast_table.merge(extra_info, on='subject_accession', how='left')
92-
109+
local_blast_table = local_blast_table.merge(extra_info, on='subject_accession', how='left')
93110

94-
for name, df in blast_table.groupby('query_seq_id'):
95-
96-
print(name)
97-
98-
outfile.write(DBNOTE)
99-
100-
if df.shape[0] < 20:
101-
str_rows = df.apply(build_row_part, axis=1)
102-
str_rows = '\n'.join(str_rows)
103-
str_table = build_table(name, row_names, str_rows)
104-
outfile.write(str_table)
105-
else:
106-
N = df.shape[0]
107-
str_rows = df.iloc[0:20].apply(build_row_part, axis=1)
108-
str_rows = '\n'.join(str_rows)
111+
local_blast_dict = dict(list(local_blast_table.groupby('query_seq_id')))
112+
ncbi_blast_dict = dict(list(ncbi_blast_table.groupby('query_seq_id')))
113+
114+
with open(args.output, "w") as outfile:
115+
outfile.write(HEAD)
116+
for name in set(local_blast_dict.keys()).union(ncbi_blast_dict.keys()):
109117

110-
hidden_str_rows = df.iloc[21:min(N, 300)].apply(build_row_part, axis=1)
111-
hidden_str_rows = '\n'.join(hidden_str_rows)
118+
print(name)
119+
outfile.write(DBNOTE)
120+
121+
outfile.write(build_table_title(name))
112122

113-
str_table = build_table(name, row_names, str_rows, hidden_str_rows)
114-
outfile.write(str_table)
123+
if name in local_blast_dict:
124+
outfile.write(build_table_string(name, local_blast_dict[name]))
115125

116-
outfile.write(PAGEBREAK)
126+
if name in ncbi_blast_dict:
127+
outfile.write(build_table_string(name, ncbi_blast_dict[name]))
128+
129+
outfile.write(PAGEBREAK)
117130

118-
outfile.write(FOOT)
119-
outfile.close()
131+
outfile.write(FOOT)
120132

121133
if __name__ == '__main__':
122134
parser = argparse.ArgumentParser()
123-
parser.add_argument('-b', '--blast', help='A single concatenated BLAST CSV table with hits from multiple samples and multiple database sources.')
135+
parser.add_argument('-b', '--blast', required=True, help='A single concatenated BLAST CSV table with hits from multiple samples and multiple database sources.')
136+
parser.add_argument('-n', '--ncbi', required=True, help='A single concatenated BLAST CSV table with hits from multiple samples from NCBI core_nt database.')
124137
parser.add_argument('-d', '--db', help='Database CSV file containing ID, DBNAME, and PATH columns.')
125138
parser.add_argument('-o', '--output', help='Output HTML report filename.')
126139
args = parser.parse_args()

main.nf

Lines changed: 32 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,19 @@ println "Current date and time: $formattedDateTime"
1717

1818
nextflow.enable.dsl = 2
1919

20-
include { hash_seqs } from './modules/hash_seqs.nf'
21-
include { seq_qc } from './modules/blast.nf'
22-
include { blastn } from './modules/blast.nf'
23-
include { filter_by_regex } from './modules/blast.nf'
24-
include { filter_best_bitscore } from './modules/blast.nf'
25-
include { build_report } from './modules/blast.nf'
26-
include { collect_provenance } from './modules/provenance.nf'
27-
include { pipeline_provenance } from './modules/provenance.nf'
20+
include { hash_seqs } from './modules/hash_seqs.nf'
21+
include { seq_qc } from './modules/blast.nf'
22+
include { blastn } from './modules/blast.nf'
23+
include { blastn_ncbi } from './modules/blast.nf'
24+
include { taxonkit_annotation as taxonkit_annotation_local } from './modules/blast.nf'
25+
include { taxonkit_annotation as taxonkit_annotation_ncbi } from './modules/blast.nf'
26+
include { filter_by_regex as filter_by_regex_local } from './modules/blast.nf'
27+
include { filter_by_regex as filter_by_regex_ncbi } from './modules/blast.nf'
28+
include { filter_best_bitscore as filter_best_bitscore_local } from './modules/blast.nf'
29+
include { filter_best_bitscore as filter_best_bitscore_ncbi } from './modules/blast.nf'
30+
include { build_report } from './modules/blast.nf'
31+
include { collect_provenance } from './modules/provenance.nf'
32+
include { pipeline_provenance } from './modules/provenance.nf'
2833

2934

3035
workflow {
@@ -49,6 +54,8 @@ workflow {
4954
ch_db = Channel.of()
5055
}
5156

57+
ch_ncbi_db = Channel.fromPath(params.ncbi_db)
58+
5259
ch_seqs = ch_fasta.splitFasta(record: [id: true, seqString: true])
5360

5461
main:
@@ -58,20 +65,31 @@ workflow {
5865

5966
seq_qc(ch_seqs)
6067
ch_blast = blastn(ch_seqs.combine(ch_db)).blast_report
61-
ch_blast_prov = blastn.out.provenance.map{}
68+
ch_blast = taxonkit_annotation_local(ch_blast).blast_report
69+
70+
ch_blast_ncbi = blastn_ncbi(ch_seqs.combine(ch_ncbi_db)).blast_report
71+
ch_blast_ncbi = taxonkit_annotation_ncbi(ch_blast_ncbi).blast_report
6272

6373
if (params.filter_regexes != 'NO_FILE') {
6474
ch_regexes = Channel.fromPath(params.filter_regexes)
65-
ch_blast = filter_by_regex(ch_blast.combine(ch_regexes)).blast_filtered
75+
ch_blast = filter_by_regex_local(ch_blast.combine(ch_regexes)).blast_filtered
76+
ch_blast_ncbi = filter_by_regex_ncbi(ch_blast_ncbi.combine(ch_regexes)).blast_filtered
6677
}
6778

6879
ch_blast_collect = ch_blast.collectFile(it -> it[2], name: "collected_blast.csv", storeDir: params.outdir, keepHeader: true, skip: 1)
80+
81+
ch_blast_ncbi_collect = ch_blast_ncbi.collectFile(it -> it[2], name: "collected_blast_ncbi.csv", storeDir: params.outdir, keepHeader: true, skip: 1)
6982

70-
filter_best_bitscore(ch_blast)
83+
filter_best_bitscore_local(ch_blast)
84+
85+
filter_best_bitscore_ncbi(ch_blast_ncbi)
7186

72-
filter_best_bitscore.out.blast_best_bitscore_csv.collectFile(it -> it[1], name: "collected_blast_best_bitscore.csv", storeDir: params.outdir, keepHeader: true, skip: 1)
87+
filter_best_bitscore_local.out.blast_best_bitscore_csv.collectFile(it -> it[1], name: "collected_blast_best_bitscore.csv", storeDir: params.outdir, keepHeader: true, skip: 1)
88+
89+
filter_best_bitscore_ncbi.out.blast_best_bitscore_csv.collectFile(it -> it[1], name: "collected_blast_ncbi_best_bitscore.csv", storeDir: params.outdir, keepHeader: true, skip: 1)
90+
7391

74-
build_report(ch_blast_collect, Channel.fromPath(params.databases))
92+
build_report(ch_blast_collect, ch_blast_ncbi_collect, Channel.fromPath(params.databases))
7593

7694
// Build pipeline provenance
7795
ch_pipeline_provenance = pipeline_provenance(ch_pipeline_metadata, build_report.out.provenance)
@@ -80,6 +98,7 @@ workflow {
8098
ch_provenance = hash_seqs.out.provenance
8199
ch_provenance = ch_provenance.join(seq_qc.out.provenance).map{ it -> [it[0], [it[1]] << it[2]] }
82100
ch_provenance = ch_provenance.join(blastn.out.provenance.groupTuple()).map{ it -> [it[0], (it[1] + it[2]).flatten() ] }
101+
ch_provenance = ch_provenance.join(blastn_ncbi.out.provenance.groupTuple()).map{ it -> [it[0], (it[1] + it[2]).flatten() ] }
83102
//ch_provenance = ch_provenance.join(filter_best_bitscore.out.provenance.groupTuple()).map{ it -> [it[0], (it[1] + it[2]).flatten()] }
84103
ch_provenance = ch_provenance.join(seq_qc.out.provenance.map{it -> it[0]}.combine(ch_pipeline_provenance)).map{ it -> [it[0], it[1] << it[2]] }
85104
collect_provenance(ch_provenance)

0 commit comments

Comments (0)