Skip to content

Commit 7c18925

Browse files
authored
MRG: more upgrades! (#6)
* more updates and upgrades * clean up abund etc template
1 parent 966913c commit 7c18925

File tree

9 files changed

+194
-68
lines changed

9 files changed

+194
-68
lines changed

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,6 @@
22

33
dist:
44
python -m build
5+
6+
black:
7+
python -m black chill_filter_web

README.md

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# chill-filter: Rapid sample screening for shotgun sequencing data
22

3-
## Quickstart for the Web server:
3+
## Quickstart for the Web site:
44

55
0. Clone the repo:
66

@@ -19,9 +19,8 @@ conda activate chill
1919
2. Download the databases from [the Open Science Framework project](https://osf.io/m85ux/), and unpack them into `prepare-db/outputs/`.
2020

2121
```
22-
curl -JLO https://osf.io/download/pwfn8/
23-
mkdir -p prepare-db/outputs/
24-
unzip -d prepare-db/outputs/ -nu chill-filter-db-0.1.zip
22+
curl -JLO https://osf.io/download/bzu3v/
23+
unzip -d prepare-db/ -nu chill-filter-db-0.2.zip
2524
```
2625

2726
3. Run snakemake in the `sample-db/` directory to index the databases. It should take a few minutes at most.
@@ -34,11 +33,9 @@ unzip -d prepare-db/outputs/ -nu chill-filter-db-0.1.zip
3433

3534
```
3635
mkdir -p /tmp/chill
37-
python -m chill_filter_web
36+
python -m chill_filter_web -p 5000
3837
```
3938

40-
This will start a server at http://localhost:5000/, by default.
39+
This will start a server at http://localhost:5000/
4140

42-
5. Try uploading k=51, scaled=100_000 sketches!
43-
44-
e.g. there are a bunch in `examples/` to try.
41+
5. Try uploading some FASTQ or FASTA files, or checkout the examples!

chill_filter_web/__init__.py

Lines changed: 122 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -14,15 +14,16 @@
1414
from sourmash import save_signatures_to_json
1515
from sourmash_plugin_branchwater import sourmash_plugin_branchwater as branch
1616

17-
MOLTYPE="DNA"
18-
KSIZE=51
19-
SCALED=100_000
17+
MOLTYPE = "DNA"
18+
KSIZE = 51
19+
SCALED = 100_000
2020
UPLOAD_FOLDER = "/tmp/chill"
2121
EXAMPLES_DIR = os.path.join(os.path.dirname(__file__), "../examples/")
22+
SIGNATURES = "prepare-db/plants+animals+gtdb.mf.csv"
23+
DATABASE = "prepare-db/plants+animals+gtdb.rocksdb"
2224

2325
app = Flask(__name__)
2426

25-
2627
def load_sig(fullpath):
2728
try:
2829
ss = sourmash.load_file_as_index(fullpath)
@@ -35,6 +36,76 @@ def load_sig(fullpath):
3536

3637
return None
3738

39+
40+
def run_gather(outpath, csv_filename):
41+
start = time.time()
42+
status = branch.do_fastmultigather(
43+
outpath,
44+
DATABASE,
45+
0,
46+
KSIZE,
47+
SCALED,
48+
MOLTYPE,
49+
csv_filename,
50+
False,
51+
False,
52+
)
53+
end = time.time()
54+
55+
print(f"branchwater gather status: {status}; time: {end - start:.2f}s")
56+
return status
57+
58+
59+
def sig_is_assembly(ss):
60+
mh = ss.minhash
61+
# track abundance set? => assembly
62+
if not mh.track_abundance:
63+
print('ZZZ1 - is assembly')
64+
return True
65+
66+
# count the number > 1 in abundance
67+
n_above_1 = sum(1 for (hv, ha) in mh.hashes.items() if ha > 1)
68+
print('ZZZ2', n_above_1, len(mh), n_above_1/len(mh))
69+
70+
# more than 10% > 1? => probably not assemblyy
71+
if n_above_1 / len(mh) > 0.1:
72+
return False
73+
74+
# nope! assembly!
75+
return True
76+
77+
78+
merged_hashes = None
79+
def build_merged_sig():
80+
global merged_hashes
81+
if merged_hashes is None:
82+
## build a merged sig - CTB hackity hack
83+
print('building merged sig from signatures...')
84+
idx = sourmash.load_file_as_index(SIGNATURES)
85+
merged_mh = None
86+
for ss in idx.signatures():
87+
if merged_mh is None:
88+
merged_mh = ss.minhash.copy_and_clear().flatten().to_mutable()
89+
else:
90+
merged_mh += ss.minhash.flatten()
91+
merged_hashes = merged_mh.hashes
92+
print('...done!')
93+
return merged_hashes
94+
95+
def estimate_weight_of_unknown(ss, *, CUTOFF=5):
96+
merged_hashes = build_merged_sig()
97+
mh = ss.minhash
98+
99+
print(len(mh))
100+
101+
unknown = [ (hv, ha) for (hv, ha) in mh.hashes.items() if ha not in merged_hashes ]
102+
sum_unknown = sum( ha for (hv, ha) in unknown )
103+
sum_high = sum( ha for (hv, ha) in unknown if ha >= CUTOFF )
104+
sum_low = sum( ha for (hv, ha) in unknown if ha < CUTOFF )
105+
106+
return sum_high / sum_unknown, sum_low / sum_unknown
107+
108+
38109
@app.route("/", methods=["GET", "POST"])
39110
def index():
40111
if request.method == "POST":
@@ -79,7 +150,6 @@ def sketch():
79150
with gzip.open(outpath, "wt") as fp:
80151
fp.write(f"[{sig_json}]")
81152

82-
83153
ss = load_sig(outpath)
84154
if ss:
85155
md5 = ss.md5sum()
@@ -127,54 +197,66 @@ def get_md5(path):
127197
success = True
128198

129199
if success:
130-
# @CTB check that it's weighted?
131200
assert ss is not None
132201
sample_name = ss.name or "(unnamed sample)"
133-
if action == "search":
202+
if action == 'download_csv':
203+
csv_filename = filename + ".gather.csv"
204+
return send_from_directory(UPLOAD_FOLDER, csv_filename)
205+
elif action == "search":
134206
csv_filename = outpath + ".gather.csv"
135207
if not os.path.exists(csv_filename):
136-
start = time.time()
137-
status = branch.do_fastmultigather(
138-
outpath,
139-
# "prepare-db/animals-and-gtdb.rocksdb",
140-
"prepare-db/plants+animals+gtdb.rocksdb",
141-
0,
142-
KSIZE,
143-
SCALED,
144-
MOLTYPE,
145-
csv_filename,
146-
False,
147-
False
148-
)
149-
end = time.time()
150-
151-
print(f"branchwater gather status: {status}; time: {end - start:.2f}s")
208+
status = run_gather(outpath, csv_filename)
152209
if status != 0:
153210
return "search failed, for reasons that are probably not your fault"
154211
else:
155212
print(f'output is in: "{csv_filename}"')
156213
else:
157214
print(f"using cached output in: '{csv_filename}'")
158215

159-
# load/process
160216
gather_df = pd.read_csv(csv_filename)
161-
gather_df = gather_df[gather_df["f_unique_weighted"] >= 0.001]
162-
if len(gather_df):
163-
last_row = gather_df.tail(1).squeeze()
164-
sum_weighted_found = last_row["sum_weighted_found"]
165-
total_weighted_hashes = last_row["total_weighted_hashes"]
166-
167-
f_found = sum_weighted_found / total_weighted_hashes
168-
169-
return render_template(
170-
"sample_search.html",
171-
sample_name=sample_name,
172-
sig=ss,
173-
gather_df=gather_df,
174-
f_found=f_found,
175-
)
217+
218+
# process abundance-weighted matches
219+
if not sig_is_assembly(ss):
220+
f_unknown_high, f_unknown_low = estimate_weight_of_unknown(ss)
221+
print('YYY', f_unknown_high, f_unknown_low)
222+
223+
gather_df = gather_df[gather_df["f_unique_weighted"] >= 0.001]
224+
if len(gather_df):
225+
last_row = gather_df.tail(1).squeeze()
226+
sum_weighted_found = last_row["sum_weighted_found"]
227+
total_weighted_hashes = last_row["total_weighted_hashes"]
228+
229+
230+
f_found = sum_weighted_found / total_weighted_hashes
231+
232+
return render_template(
233+
"sample_search_abund.html",
234+
sample_name=sample_name,
235+
sig=ss,
236+
gather_df=gather_df,
237+
f_found=f_found,
238+
f_unknown_high=f_unknown_high,
239+
f_unknown_low=f_unknown_low,
240+
)
241+
else:
242+
return "no matches found!"
243+
# process flat matching (assembly)
176244
else:
177-
return "no matches found!"
245+
print('running flat')
246+
gather_df = gather_df[gather_df["f_unique_weighted"] >= 0.001]
247+
if len(gather_df):
248+
last_row = gather_df.tail(1).squeeze()
249+
f_found = gather_df['f_unique_to_query'].sum()
250+
251+
return render_template(
252+
"sample_search_flat.html",
253+
sample_name=sample_name,
254+
sig=ss,
255+
gather_df=gather_df,
256+
f_found=f_found,
257+
)
258+
else:
259+
return "no matches found!"
178260

179261
elif action == "download":
180262
return send_from_directory(UPLOAD_FOLDER, filename)

chill_filter_web/templates/index.html

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,15 @@ <h4>
4545

4646
<main class="container">
4747
<h4>Examples:</h4>
48-
<a href="/example?filename=SRR606249.k51.s100_000.sig.zip">SRR606249: A mock microbial community</a><br>
49-
<a href="/example?filename=SRR5650070.k51.s100_000.sig.zip">SRR5650070: A human gut metagenome from the iHMP</a><br>
50-
<a href="/example?filename=SRR12324253.k51.s100_000.sig.zip">SRR12324253: Zymo mock microbial community</a><br>
51-
<a href="/example?filename=ERR1395610.k51.s100_000.sig.zip">ERR1395610: a human WGS sample</a><br>
52-
<a href="/example?filename=Bu5.k51.s100_000.sig.zip">Bu5: a human WGS sample</a><br>
53-
<a href="/example?filename=ERR2592340.k51.s100_000.sig.zip">ERR2592340: livestock feces sample</a><br>
54-
<a href="/example?filename=ERR2245457.k51.s100_000.sig.zip">ERR2245457: sewage </a><br>
48+
<ul>
49+
<li><a href="/example?filename=SRR606249.k51.s100_000.sig.zip">SRR606249: A mock microbial community</a>
50+
<li><a href="/example?filename=SRR5650070.k51.s100_000.sig.zip">SRR5650070: A human gut metagenome from the iHMP</a>
51+
<li><a href="/example?filename=SRR12324253.k51.s100_000.sig.zip">SRR12324253: Zymo mock microbial community</a>
52+
<li><a href="/example?filename=Bu5.abund.k51.s100_000.sig.zip">Bu5: a human WGS sample (reads)</a>
53+
<li><a href="/example?filename=Bu5.flat.k51.s100_000.sig.zip">Bu5: a human WGS sample (assembly)</a>
54+
<li><a href="/example?filename=ERR2592340.k51.s100_000.sig.zip">ERR2592340: livestock feces sample</a>
55+
<li><a href="/example?filename=ERR2245457.k51.s100_000.sig.zip">ERR2245457: sewage </a>
56+
</ul>
5557
{% include "_footer.html" %}
5658

5759
<script>
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
<!- template: sample_search_abund.html -->
2+
<head>
3+
<title>search of {{ sample_name }} - chill-filter</title>
4+
{% include "_header.html" %}
5+
{% include "_header.html" %}
6+
</head>
7+
<body>
8+
<main class="container">
9+
10+
<h2>Sample search summary for: {{ sample_name }}</h2>
11+
<p>
12+
This looks like a set of reads, right?
13+
<p>
14+
15+
Based on the search results below, we estimate that at least <b>{{ "{:.1f}".format(f_found * 100) }}%</b> of your sequencing data will map to known reference genomes.
16+
<p>
17+
Your sample is about <b>{{ "{:.1f}".format((1 - f_found) * 100) }}% unknown</b>. This could be new sequence and/or sequencing errors.
18+
<p>
19+
(<b>{{ "{:.1f}".format(f_unknown_high* (1 - f_found) * 100) }}%</b> of the sample is unknown and high abundance, so that's probably not sequencing error.)
20+
21+
<h2>Sample breakdown</h2>
22+
23+
<table border=1>
24+
<tr>
25+
<th> <b>Percent assigned</b> </th>
26+
<th> <b>Reference genome or collection </b></th>
27+
<th> <b>Estimated sequencing depth in sample</b></tr>
28+
{% for item in gather_df.to_dict(orient='records') %}
29+
<tr>
30+
<td>{{ '{:.1f}'.format(item['f_unique_weighted'] * 100) }}% </td>
31+
<td> {{ item['match_name'] }} </td>
32+
<td>{{ '{:.0f}'.format(item['median_abund']) }}x </td>
33+
</tr>
34+
{% endfor %}
35+
</table>
36+
37+
<a href='./'>Return to sample page</a> | <a href="./download_csv">Download results</a>
38+
39+
{% include "_footer.html" %}
40+
</main>
41+
</body>
42+
43+
</html>

chill_filter_web/templates/sample_search.html renamed to chill_filter_web/templates/sample_search_flat.html

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!- template: sample_search.html -->
1+
<!- template: sample_search_flat.html -->
22
<head>
33
<title>search of {{ sample_name }} - chill-filter</title>
44
{% include "_header.html" %}
@@ -8,12 +8,14 @@
88
<main class="container">
99

1010
<h2>Sample search summary for: {{ sample_name }}</h2>
11-
12-
total sample known: <b>{{ "{:.1f}".format(f_found * 100) }}%</b>
1311
<p>
14-
(This is an estimate for how much of your sequencing data will map to known reference genomes.)
12+
This looks like an assembly, right?
13+
<p>
14+
Based on the results below, we estimate that at least
15+
<b>{{ "{:.1f}".format(f_found * 100) }}%</b>
16+
of your contigs will align to a known reference genome.
1517
<p>
16-
Your sample is {{ "{:.1f}".format((1 - f_found) * 100) }}% unknown. This is a combined estimate of genomic sequence not in our database, entirely new genomic sequence, SNPs/polymorphisms, or erroneous sequence.
18+
Your sample is about {{ "{:.1f}".format((1 - f_found) * 100) }}% unknown. This is likely to be novel sequence!
1719

1820
<h2>Sample breakdown</h2>
1921

@@ -29,13 +31,6 @@ <h2>Sample breakdown</h2>
2931

3032
<p>
3133

32-
<h2>TODO</h2>
33-
<ul>
34-
<li> high abundance, low abundance split!
35-
</ul>
36-
37-
<p>
38-
3934
<a href='./'>Return to sample page</a>
4035

4136

doc/developer.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,7 @@ The client-side sketching code is sourced from
44
[here](https://github.com/sourmash-bio/branchwater/tree/main/app/static)
55
and the JavaScript is placed entirely in
66
`chill_filter_web/templates/index.html`.
7+
8+
## CSS
9+
10+
We are using [pico](https://picocss.com/docs).
911 KB
Binary file not shown.

0 commit comments

Comments
 (0)