Commit 46bc3bb
updates 2024_05_31_v2
1 parent: 1b6284b

18 files changed: +1372 / -297 lines

workflow/Snakefile

Lines changed: 7 additions & 1 deletion
@@ -31,12 +31,16 @@ include: "rules/deduplication.smk"
 # Filtering
 include: "rules/chromosomal.smk"
 
+# Embedding
+include: "rules/mashdb.smk"
+
 # Sequence annotation
 include: "rules/viral.smk"
 include: "rules/bgc.smk"
 include: "rules/amr.smk"
 include: "rules/features.smk"
-# include: "rules/typing.smk"
+include: "rules/proteins.smk"
+include: "rules/typing.smk"
 
 # Metadata annotation
 include: "rules/ecosystem.smk"
@@ -85,6 +89,8 @@ rule all:
         ### Features
         # rules.features_gbk.output,
        # rules.features_json.output,
+        rules.ipg_join.output,
+        # rules.eggnog_join.output,
         ### Typing
         # rules.mob_typer.output,
         # rules.pmlst_join.output,

workflow/envs/cgview_build.yaml

Lines changed: 3 additions & 1 deletion
@@ -3,4 +3,6 @@ channels:
   - defaults
 dependencies:
   - conda-forge::rb-bio==2.0.1
-  - pip
+  - pip
+  - pip:
+      - git+https://meta:glpat-YwMVAVL3bf5vVbFpXeN5@ccb-gitlab.cs.uni-saarland.de/metagenomics/pipelines/utils.git

workflow/envs/py_env.yaml

Lines changed: 1 addition & 0 deletions
@@ -6,6 +6,7 @@ channels:
   - defaults
 dependencies:
   - conda-forge::biopython==1.83
+  - conda-forge::umap-learn
   - pip
   - pip:
     - git+https://meta:glpat-YwMVAVL3bf5vVbFpXeN5@ccb-gitlab.cs.uni-saarland.de/metagenomics/pipelines/utils.git

workflow/envs/r_env.yml

Lines changed: 2 additions & 0 deletions
@@ -20,3 +20,5 @@ dependencies:
   - conda-forge::r-janitor==2.2.0
   - conda-forge::r-svglite
   - conda-forge::r-upsetr
+  - conda-forge::r-testit
+  - conda-forge::r-argparse

workflow/rules/features.smk

Lines changed: 83 additions & 46 deletions
@@ -64,55 +64,92 @@ rule genbank_join:
         cat {input.gbk} > {output[0]}
         """
 
-##################################################
-# NCBI + AMR + ANTISMASH
-##################################################
+###############################################################################
+# Prokaryotic Genome Annotation Pipeline
+###############################################################################
+rule pgap:
+    input: "/local/plsdb/plsdb_2024/workflow/AB011549.2.fasta"
+    output: directory(join(OUTDIR, "pgap", "test"))
+    container: "docker://ncbi/pgap-utils:2024-07-18.build7555"
+    shell:
+        """
+        ./pgap.py -r -o {output[0]} -g {input[0]} -s 'Escherichia coli'
+        """
 
-rule features_gbk:
-    input:
-        fasta = filtered_fasta,
-        nucc = filtered_pls,
-        bgc = rules.antismash_join.output.tsv,
-        genbank = rules.genbank_join.output[0],
-        amr = rules.hamronize_dedup.output[0]
+###############################################################################
+# IDENTICAL PROTEIN GROUP
+###############################################################################
+rule ipg_queries:
+    input: join(OUTDIR, "final/proteins.csv")
+    params:
+        api_file = config["api_key_file"],
+        ncbi_api = config["eutils"]["api_key"],
+        database = 'ipg',
+        batch_size = 8000, # Maximum
+        eget_cmd = "| esummary -mode json",
+        xtract_cmd = ""
+    output:
+        DIR = directory(join(OUTDIR, "data_collection/ipg",
+                             f"queries_{config['timestamp']}")),
+        batches = expand(
+            join(OUTDIR, "data_collection/ipg",
+                 f"queries_{config['timestamp']}", "batch_{batches}.pickle"),
+            batches = range(0, config['ipg']['batches']))
+    threads: 1
+    log:
+        join(OUTDIR, "data_collection/ipg",
+             f"queries_{config['timestamp']}/ipg_queries.log")
+    wrapper:
+        "file:///local/plsdb/master/wrappers/ncbi/ipg/queries"
+
+rule ipg_api:
+    input:
+        pickle = join(rules.ipg_queries.output.DIR, "batch_{batch}.pickle")
+    params:
+        ncbi_api = config["eutils"]["api_key"],
+        api_file = config["api_key_file"],
+        batch_size = 5000
     output:
-        amr_tab = join(OUTDIR, "final", "amr.tsv"),
-        gc_tab = join(OUTDIR, "filtering/metadata/nucc_gc.csv"),
-        proteins_tab = join(OUTDIR, "final/proteins.csv"),
-        proteins = join(OUTDIR, "final/proteins.fasta"),
-        DIR = directory(join(OUTDIR, "final/features/gbk/"))
-    conda: "../envs/py_env.yaml"
-    script:
-        "../scripts/features.py"
+        pickle = join(OUTDIR, "data_collection/ipg",
+                      f"api_{config['timestamp']}", "batch_{batch}.pickle")
+    threads: 10 # NOTE: max 10 to prevent too many requests
+    benchmark:
+        join(OUTDIR, "data_collection/ipg",
+             f"api_{config['timestamp']}", "batch_{batch}.bench")
+    log:
+        join(OUTDIR, "data_collection/ipg",
+             f"api_{config['timestamp']}", "batch_{batch}.log")
+    wrapper:
+        "file:///local/plsdb/master/wrappers/ncbi/ipg/api"
 
-rule features_json:
+rule ipg_extraction:
     input:
-        DIR = rules.features_gbk.output.DIR,
-        config = "../src/cgview_config.yaml"
+        pickle = lambda wildcards: expand(rules.ipg_api.output.pickle, batch=wildcards.batch)
     output:
-        DIR = directory(join(OUTDIR, "final/features/json")),
-        DIR_repo = directory("../scripts/cgview-builder/")
-    conda: "../envs/cgview_build.yaml"
-    shell:
-        """
-        mkdir -p {output.DIR}
-        git clone https://github.com/stothard-group/cgview-builder.git -b master {output.DIR_repo}
-        DIR="{input.DIR}/*"
-        for file in $DIR; do
-            prefix=$(basename -- "$file" .gbk)
-            ruby {output.DIR_repo}/cgview_builder_cli.rb --sequence $file \
-                --outfile {output.DIR}/$prefix.json \
-                --config {input.config}
-        done
-        """
+        csv = join(OUTDIR, "data_collection/ipg",
+                   f"extraction_{config['timestamp']}", "batch_{batch}.csv")
+    log:
+        join(OUTDIR, "data_collection/ipg",
+             f"extraction_{config['timestamp']}", "batch_{batch}.log")
+    benchmark:
+        join(OUTDIR, "data_collection/ipg",
+             f"extraction_{config['timestamp']}", "batch_{batch}.bench")
+    threads: 1
+    wrapper:
+        "file:///local/plsdb/master/wrappers/ncbi/ipg/extraction"
+
+rule ipg_join:
+    input:
+        files = expand(rules.ipg_extraction.output.csv,
+                       batch = range(0, config['ipg']['batches']))
+    output:
+        csv = join(OUTDIR, "data_collection/ipg/",
+                   f"extraction_{config['timestamp']}", f"ipg_records.csv")
+    threads: 1
+    run:
+        import pickle
+        import pandas as pd
 
-# rule cluster_proteins:
-#     input:
-#         rules.features_gbk.proteins
-#     output:
-#         join(OUTDIR, "proteins/diamond_clust.tsv")
-#     shell:
-#         """
-#         diamond cluster -d {input[0]} -o {output[0]} \
-#             --header --approx-id
-#         """
+        ipg_df = pd.concat([pd.read_csv(str(f)) for f in input.files], ignore_index=True)
+        ipg_df.drop_duplicates(inplace=True)
+        ipg_df.to_csv(output.csv, index=False)
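
The ipg_* rules above fan the protein table out into pickled accession batches, query NCBI's Identical Protein Groups database through local wrappers (not part of this diff), and concatenate the per-batch CSVs. The params (database = 'ipg', eget_cmd = "| esummary -mode json") point at an EDirect-style esummary call; below is a minimal Python sketch of such a batched lookup with Biopython's Entrez module, which py_env.yaml pins. The accession, email, API key, and the [ACCN] search field are illustrative assumptions, not the wrappers' actual code.

# Illustrative sketch only; the real logic lives in the local wrappers
# under file:///local/plsdb/master/wrappers/ncbi/ipg/ and is not shown
# in this commit.
import json
from Bio import Entrez

Entrez.email = "user@example.org"     # placeholder; NCBI requires an email
Entrez.api_key = "YOUR_NCBI_API_KEY"  # placeholder, cf. config["eutils"]["api_key"]

def ipg_summaries(accessions):
    """Resolve protein accessions to IPG UIDs, then fetch JSON summaries."""
    term = " OR ".join(f"{acc}[ACCN]" for acc in accessions)  # assumed search field
    handle = Entrez.esearch(db="ipg", term=term, retmax=len(accessions))
    uids = Entrez.read(handle)["IdList"]
    handle.close()
    handle = Entrez.esummary(db="ipg", id=",".join(uids), retmode="json")
    summaries = json.load(handle)
    handle.close()
    return summaries

# One batch of up to 8000 accessions, mirroring rule ipg_queries:
print(ipg_summaries(["WP_000027057.1"]))  # hypothetical accession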

workflow/rules/mashdb.smk

Lines changed: 22 additions & 2 deletions
@@ -1,6 +1,6 @@
 rule mashdb_sketch:
     input:
-        fasta = "../../results/filtering/deduplication/pls_dedup.fasta", # rules.deduplication.output.fasta
+        fasta = filtered_fasta
     output:
         join(OUTDIR, "mashdb","plsdb_sketch.msh")
     params:
@@ -70,4 +70,24 @@ rule mashdb_dist:
         join(OUTDIR, "mashdb","db/mash_dist.bench")
     threads: workflow.cores
     wrapper:
-        "file:///local/plsdb/master/wrappers/mash"
+        "file:///local/plsdb/master/wrappers/mash"
+
+# UMAP
+#################################################
+
+# Embedding using UMAP on Mash distances
+rule mashdb_umap:
+    input:
+        rules.mashdb_dist.output
+    output:
+        join(OUTDIR, "mashdb", "umap_mash_dist.umap")
+    params:
+        neighbors=config['umap']['neighbors'],
+        components=config['umap']['components'],
+        min_dist=config['umap']['min_dist']
+    log:
+        join(OUTDIR, "mashdb", "process_umap.log")
+    conda: "../envs/py_env.yaml"
+    script:
+        "../scripts/process/process_umap.py"
+
93+

workflow/rules/module_downstream.smk

Lines changed: 50 additions & 64 deletions
@@ -69,28 +69,57 @@ rule nucc_table:
         nucc = filtered_pls,
         gc = rules.features_gbk.output.gc_tab,
         viral = rules.viral_curation.output.pls,
-        pmlst = join(OUTDIR, "typing/pmlst/summary/results.tsv") # rules.pmlst_join.output[0],
+        umap = rules.mashdb_umap.output[0]
     output:
-        join(OUTDIR, "final", "nuccore.csv")
+        #join(OUTDIR, "final", "nuccore.csv")
+        temp(join(OUTDIR, "final", "tmp_nuccore.csv"))
     run:
         import pandas as pd
 
         nucc = pd.read_csv(input.nucc)
         nucc.drop(columns=['NUCCORE_Topology'], inplace=True)
         gc = pd.read_csv(input.gc).drop_duplicates()
         viral = pd.read_csv(input.viral)
-        pmlst = pd.read_table(input.pmlst)
+        umap = pd.read_csv(input.umap, sep='\t', header=0, dtype=str)
 
         print(nucc.columns)
         df1 = pd.merge(nucc, gc, how='left', on="NUCCORE_ACC", validate="1:1")
         print(df1.columns)
         df2 = pd.merge(df1, viral, how='left', on="NUCCORE_ACC", validate="1:1")
         print(df2.columns)
-        final = pd.merge(df2, pmlst, how='left', on="NUCCORE_ACC", validate="1:1")
+        final = pd.merge(df2, umap, how='left', on="NUCCORE_ACC", validate="1:1")
         print(final.columns)
 
         final.to_csv(output[0], index=False)
 
+rule inspect_outliers:
+    input:
+        nucc = rules.nucc_table.output[0],
+        typing = rules.typing_table.output[0],
+        amr = rules.features_gbk.output.amr_tab,
+        proteins = rules.features_gbk.output.proteins_tab
+    output:
+        csv = join(OUTDIR_filtering, "suppressed_outliers.csv")
+    conda: "../envs/r_env.yml"
+    script:
+        "../scripts/filtering/inspect_outliers.R"
+
+rule final_nucc_table:
+    input:
+        nucc = rules.nucc_table.output[0],
+        status = rules.inspect_outliers.output[0]
+    output:
+        join(OUTDIR, "final", "nuccore.csv")
+    run:
+        import pandas as pd
+
+        nucc = pd.read_csv(input.nucc)
+        status = pd.read_csv(input.status)
+
+        df = pd.merge(nucc, status, how='left', on="NUCCORE_ACC", validate="1:1")
+        df.to_csv(output[0], index=False)
+
+
 rule final_fasta:
     input: filtered_fasta
     output: join(OUTDIR, "final/sequences.fasta")
@@ -129,72 +158,29 @@ rule createmash:
 #         width=10,
 #         height=6
 #     conda:
-#         "../envs/requirements.yml"
+#         "../envs/r_env.yml"
 #     shell:
 #         """
 #         Rscript scripts/dstream_summary.R --tab {input} \
 #             --pdf {output.pdf} --width {params.width} \
 #             --height {params.height} | tee {output.txt}
 #         """
 
-
 # # Compare to older version
 # ##################################################
-# rule dstream_compare:
-#     input:
-#         new = rules.process_infotable.output,
-#         old = config['previous_table'],
-#         new_nonfiltered = rules.retrieve_plasmid_taxid.output[1]
-#     conda:
-#         "../envs/requirements.yml"
-#     output:
-#         txt = join(OUTDIR_dstream, "changes.tsv"),
-#         log = join(OUTDIR_dstream, "changes.tsv.log")
-#     shell:
-#         """
-#         Rscript scripts/dstream_compare_tabs.R -n {input.new} \
-#             -o {input.old} -f {input.new_nonfiltered} \
-#             -t {output.txt} -l {output.log}
-#         """
-
-# # Server data
-# ##################################################
-# rule dstream_server_data:
-#     input:
-#         abr = rules.process_join_abricate.output.tsv,
-#         changes = rules.dstream_compare.output.txt,
-#         changes_log = rules.dstream_compare.output.log,
-#         html = rules.dstream_krona_html.output,
-#         msh = rules.process_mash_sketch.output,
-#         sim = rules.dstream_sim_records.output,
-#         infotab = rules.process_infotable.output,
-#         fasta = rules.filter_artifacts.output.fasta
-#     output:
-#         dir = directory("../src/server_data"),
-#         fasta = "../src/server_data/plsdb.fna"
-#     conda:
-#         "../envs/py_env.yml"
-#     shell:
-#         """
-#         mkdir -p {output.dir} && \
-#         cp {input.abr} {output.dir}/plsdb.abr && \
-#         cp {input.changes} {output.dir}/plsdb_changes.tsv && \
-#         cp {input.html} {output.dir}/plsdb.html && \
-#         cp {input.msh} {output.dir}/plsdb.msh && \
-#         cp {input.sim} {output.dir}/plsdb.sim && \
-#         cp {input.infotab} {output.dir}/plsdb.tsv && \
-#         cp {input.fasta} {output.dir}/plsdb.fna && \
-#         bzip2 -zk {input.fasta} --stdout > {output.dir}/plsdb.fna.bz2
-#         """
-
-# # BLAST DBs
-# ##################################################
-# use rule process_make_rmlst_blastdb as dstream_blastndb with:
-#     input:
-#         fasta=rules.dstream_server_data.output.fasta,
-#     output:
-#         dbs = expand(["{file}.{ext}"],
-#             file = join("../src/server_data/", f"plsdb.fna"),
-#             ext=['nin', 'nhr', 'nsq', 'ndb', 'njs', 'not', 'ntf', 'nto'])
-#     params:
-#         title = 'plsdb'
+rule dstream_compare:
+    input:
+        new = rules.final_nucc_table.output[0],
+        old = f"../src/plsdb_{config['previous_version']}.csv",
+        new_nonfiltered = rules.join_NABT.output.nucc
+    conda:
+        "../envs/r_env.yml"
+    output:
+        txt = join(OUTDIR, "final", "changes.tsv"),
+        log = join(OUTDIR, "final", "changes.tsv.log")
+    shell:
+        """
+        Rscript scripts/dstream/dstream_compare_tabs.R -n {input.new} \
+            -o {input.old} -f {input.new_nonfiltered} \
+            -t {output.txt} -l {output.log}
+        """
