MRG: more upgrades! (#6)

ctb · web-flow · commit 7c189256a130 · 2024-12-15T17:31:48.000-08:00
* more updates and upgrades

* clean up abund etc template
diff --git a/Makefile b/Makefile
@@ -2,3 +2,6 @@
 
 dist:
 	python -m build
+
+black:
+	python -m black chill_filter_web
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # chill-filter: Rapid sample screening for shotgun sequencing data
 
-## Quickstart for the Web server:
+## Quickstart for the Web site:
 
 0. Clone the repo:
 
@@ -19,9 +19,8 @@ conda activate chill
 2. Download the databases from [the Open Science Framework project](https://osf.io/m85ux/), and unpack them into `prepare-db/outputs/`.
 
 ```
-curl -JLO https://osf.io/download/pwfn8/
-mkdir -p prepare-db/outputs/
-unzip -d prepare-db/outputs/ -nu chill-filter-db-0.1.zip
+curl -JLO https://osf.io/download/bzu3v/
+unzip -d prepare-db/ -nu chill-filter-db-0.2.zip
 ```
 
 3. Run snakemake in the `sample-db/` directory to index the databases. It should take a few minutes at most.
@@ -34,11 +33,9 @@ unzip -d prepare-db/outputs/ -nu chill-filter-db-0.1.zip
 
 ```
 mkdir -p /tmp/chill
-python -m chill_filter_web
+python -m chill_filter_web -p 5000
 ```
 
-This will start a server at http://localhost:5000/, by default.
+This will start a server at http://localhost:5000/
 
-5. Try uploading k=51, scaled=100_000 sketches!
-
-e.g. there are a bunch in `examples/` to try.
+5. Try uploading some FASTQ or FASTA files, or check out the examples!
diff --git a/chill_filter_web/__init__.py b/chill_filter_web/__init__.py
@@ -14,15 +14,16 @@
 from sourmash import save_signatures_to_json
 from sourmash_plugin_branchwater import sourmash_plugin_branchwater as branch
 
-MOLTYPE="DNA"
-KSIZE=51
-SCALED=100_000
+MOLTYPE = "DNA"
+KSIZE = 51
+SCALED = 100_000
 UPLOAD_FOLDER = "/tmp/chill"
 EXAMPLES_DIR = os.path.join(os.path.dirname(__file__), "../examples/")
+SIGNATURES = "prepare-db/plants+animals+gtdb.mf.csv"
+DATABASE = "prepare-db/plants+animals+gtdb.rocksdb"
 
 app = Flask(__name__)
 
-
 def load_sig(fullpath):
     try:
         ss = sourmash.load_file_as_index(fullpath)
@@ -35,6 +36,76 @@ def load_sig(fullpath):
 
     return None
 
+
+def run_gather(outpath, csv_filename):
+    """Call branch.do_fastmultigather for one sketch and report how long it took.
+
+    Passes `outpath`, DATABASE, 0, KSIZE, SCALED, MOLTYPE, `csv_filename` and
+    two False flags, all positionally.  The meaning of the literal 0 and of
+    the two flags is defined by do_fastmultigather and is not visible here
+    (NOTE(review): confirm against the branchwater plugin signature).
+
+    Returns the status value from do_fastmultigather unchanged; the caller
+    treats any nonzero value as a failed search.
+    """
+    began = time.time()
+    gather_args = (outpath, DATABASE, 0, KSIZE, SCALED, MOLTYPE, csv_filename, False, False)
+    result = branch.do_fastmultigather(*gather_args)
+    elapsed = time.time() - began
+    print(f"branchwater gather status: {result}; time: {elapsed:.2f}s")
+    return result
+
+
+def sig_is_assembly(ss):
+    """Guess whether sketch `ss` looks like an assembly rather than raw reads.
+
+    Returns True when the minhash does not track abundance, is empty, or
+    has at most 10% of its hashes with abundance > 1; False otherwise.
+    """
+    mh = ss.minhash
+    # no abundance tracking => nothing to weight by, so treat as an assembly
+    if not mh.track_abundance:
+        print('ZZZ1 - is assembly')
+        return True
+
+    # fraction of hashes seen more than once; an empty sketch counts as 0.0
+    n_above_1 = sum(1 for (hv, ha) in mh.hashes.items() if ha > 1)
+    f_above_1 = n_above_1 / len(mh) if len(mh) else 0.0
+    print('ZZZ2', n_above_1, len(mh), f_above_1)
+    return f_above_1 <= 0.1
+
+
+merged_hashes = None
+def build_merged_sig():
+    """Return the merged hashes of every signature in SIGNATURES (cached globally)."""
+    global merged_hashes
+    if merged_hashes is None:
+        print('building merged sig from signatures...')
+        merged_mh = None
+        for ss in sourmash.load_file_as_index(SIGNATURES).signatures():
+            if merged_mh is None:
+                merged_mh = ss.minhash.copy_and_clear().flatten().to_mutable()
+            # add every signature, the first included (it used to be skipped)
+            merged_mh += ss.minhash.flatten()
+        # an empty collection yields an empty mapping rather than AttributeError
+        merged_hashes = merged_mh.hashes if merged_mh is not None else {}
+        print('...done!')
+    return merged_hashes
+
+def estimate_weight_of_unknown(ss, *, CUTOFF=5):
+    """Return (high, low): abundance fractions of hashes not in the merged sig, split at CUTOFF."""
+    merged_hashes = build_merged_sig()
+    mh = ss.minhash
+    print(len(mh))
+    # membership must be tested on the hash value `hv`, not the abundance `ha`
+    unknown = [ha for (hv, ha) in mh.hashes.items() if hv not in merged_hashes]
+    sum_unknown = sum(unknown)
+    if not sum_unknown:
+        return 0.0, 0.0  # nothing unknown: avoid dividing by zero
+    sum_high = sum(ha for ha in unknown if ha >= CUTOFF)
+    return sum_high / sum_unknown, (sum_unknown - sum_high) / sum_unknown
+
+
 @app.route("/", methods=["GET", "POST"])
 def index():
     if request.method == "POST":
@@ -79,7 +150,6 @@ def sketch():
         with gzip.open(outpath, "wt") as fp:
             fp.write(f"[{sig_json}]")
 
-
         ss = load_sig(outpath)
         if ss:
             md5 = ss.md5sum()
@@ -127,54 +197,66 @@ def get_md5(path):
             success = True
 
     if success:
-        # @CTB check that it's weighted?
         assert ss is not None
         sample_name = ss.name or "(unnamed sample)"
-        if action == "search":
+        if action == 'download_csv':
+            csv_filename = filename + ".gather.csv"
+            return send_from_directory(UPLOAD_FOLDER, csv_filename)
+        elif action == "search":
             csv_filename = outpath + ".gather.csv"
             if not os.path.exists(csv_filename):
-                start = time.time()
-                status = branch.do_fastmultigather(
-                    outpath,
-#                    "prepare-db/animals-and-gtdb.rocksdb",
-                    "prepare-db/plants+animals+gtdb.rocksdb",
-                    0,
-                    KSIZE,
-                    SCALED,
-                    MOLTYPE,
-                    csv_filename,
-                    False,
-                    False
-                )
-                end = time.time()
-
-                print(f"branchwater gather status: {status}; time: {end - start:.2f}s")
+                status = run_gather(outpath, csv_filename)
                 if status != 0:
                     return "search failed, for reasons that are probably not your fault"
                 else:
                     print(f'output is in: "{csv_filename}"')
             else:
                 print(f"using cached output in: '{csv_filename}'")
 
-            # load/process
             gather_df = pd.read_csv(csv_filename)
-            gather_df = gather_df[gather_df["f_unique_weighted"] >= 0.001]
-            if len(gather_df):
-                last_row = gather_df.tail(1).squeeze()
-                sum_weighted_found = last_row["sum_weighted_found"]
-                total_weighted_hashes = last_row["total_weighted_hashes"]
-
-                f_found = sum_weighted_found / total_weighted_hashes
-
-                return render_template(
-                    "sample_search.html",
-                    sample_name=sample_name,
-                    sig=ss,
-                    gather_df=gather_df,
-                    f_found=f_found,
-                )
+
+            # process abundance-weighted matches
+            if not sig_is_assembly(ss):
+                f_unknown_high, f_unknown_low = estimate_weight_of_unknown(ss)
+                print('YYY', f_unknown_high, f_unknown_low)
+
+                gather_df = gather_df[gather_df["f_unique_weighted"] >= 0.001]
+                if len(gather_df):
+                    last_row = gather_df.tail(1).squeeze()
+                    sum_weighted_found = last_row["sum_weighted_found"]
+                    total_weighted_hashes = last_row["total_weighted_hashes"]
+                    
+
+                    f_found = sum_weighted_found / total_weighted_hashes
+
+                    return render_template(
+                        "sample_search_abund.html",
+                        sample_name=sample_name,
+                        sig=ss,
+                        gather_df=gather_df,
+                        f_found=f_found,
+                        f_unknown_high=f_unknown_high,
+                        f_unknown_low=f_unknown_low,
+                    )
+                else:
+                    return "no matches found!"
+            # process flat matching (assembly)
             else:
-                return "no matches found!"
+                print('running flat')
+                gather_df = gather_df[gather_df["f_unique_weighted"] >= 0.001]
+                if len(gather_df):
+                    last_row = gather_df.tail(1).squeeze()
+                    f_found = gather_df['f_unique_to_query'].sum()
+
+                    return render_template(
+                        "sample_search_flat.html",
+                        sample_name=sample_name,
+                        sig=ss,
+                        gather_df=gather_df,
+                        f_found=f_found,
+                    )
+                else:
+                    return "no matches found!"
 
         elif action == "download":
             return send_from_directory(UPLOAD_FOLDER, filename)
diff --git a/chill_filter_web/templates/index.html b/chill_filter_web/templates/index.html
@@ -45,13 +45,15 @@ <h4>
 
     <main class="container">
       <h4>Examples:</h4>
-      <a href="/example?filename=SRR606249.k51.s100_000.sig.zip">SRR606249: A mock microbial community</a><br>
-      <a href="/example?filename=SRR5650070.k51.s100_000.sig.zip">SRR5650070: A human gut metagenome from the iHMP</a><br>
-      <a href="/example?filename=SRR12324253.k51.s100_000.sig.zip">SRR12324253: Zymo mock microbial community</a><br>
-      <a href="/example?filename=ERR1395610.k51.s100_000.sig.zip">ERR1395610: a human WGS sample</a><br>
-      <a href="/example?filename=Bu5.k51.s100_000.sig.zip">Bu5: a human WGS sample</a><br>
-      <a href="/example?filename=ERR2592340.k51.s100_000.sig.zip">ERR2592340: livestock feces sample</a><br>
-      <a href="/example?filename=ERR2245457.k51.s100_000.sig.zip">ERR2245457: sewage </a><br>
+      <ul>
+        <li><a href="/example?filename=SRR606249.k51.s100_000.sig.zip">SRR606249: A mock microbial community</a>
+        <li><a href="/example?filename=SRR5650070.k51.s100_000.sig.zip">SRR5650070: A human gut metagenome from the iHMP</a>
+        <li><a href="/example?filename=SRR12324253.k51.s100_000.sig.zip">SRR12324253: Zymo mock microbial community</a>
+        <li><a href="/example?filename=Bu5.abund.k51.s100_000.sig.zip">Bu5: a human WGS sample (reads)</a>
+        <li><a href="/example?filename=Bu5.flat.k51.s100_000.sig.zip">Bu5: a human WGS sample (assembly)</a>
+        <li><a href="/example?filename=ERR2592340.k51.s100_000.sig.zip">ERR2592340: livestock feces sample</a>
+        <li><a href="/example?filename=ERR2245457.k51.s100_000.sig.zip">ERR2245457: sewage </a>
+      </ul>
     {% include "_footer.html" %}
     
     <script>
diff --git a/chill_filter_web/templates/sample_search_abund.html b/chill_filter_web/templates/sample_search_abund.html
@@ -0,0 +1,43 @@
+<!-- template: sample_search_abund.html -->
+<head>
+  <title>search of {{ sample_name }} - chill-filter</title>
+  {% include "_header.html" %}
+{% include "_header.html" %}
+</head>
+<body>
+<main class="container">
+
+<h2>Sample search summary for: {{ sample_name }}</h2>
+<p>
+This looks like a set of reads, right?
+<p>
+  
+Based on the search results below, we estimate that at least <b>{{ "{:.1f}".format(f_found * 100) }}%</b> of your sequencing data will map to known reference genomes.
+<p>
+  Your sample is about <b>{{ "{:.1f}".format((1 - f_found) * 100) }}% unknown</b>. This could be new sequence and/or sequencing errors.
+<p>
+(<b>{{ "{:.1f}".format(f_unknown_high*  (1 - f_found) * 100) }}%</b> of the sample is unknown and high abundance, so that's probably not sequencing error.)
+
+<h2>Sample breakdown</h2>
+
+<table border=1>
+  <tr>
+    <th> <b>Percent assigned</b> </th>
+    <th> <b>Reference genome or collection </b></th>
+    <th> <b>Estimated sequencing depth in sample</b> </th></tr>
+  {% for item in gather_df.to_dict(orient='records') %}
+   <tr>
+     <td>{{ '{:.1f}'.format(item['f_unique_weighted'] * 100) }}% </td>
+     <td> {{ item['match_name'] }}  </td>
+     <td>{{ '{:.0f}'.format(item['median_abund']) }}x </td>
+   </tr>
+  {% endfor %}
+</table>
+
+<a href='./'>Return to sample page</a> | <a href="./download_csv">Download results</a>
+
+{% include "_footer.html" %}
+</main>
+</body>
+
+</html>
diff --git a/chill_filter_web/templates/sample_search_flat.html b/chill_filter_web/templates/sample_search_flat.html
@@ -1,4 +1,4 @@
-<!- template: sample_search.html -->
+<!-- template: sample_search_flat.html -->
 <head>
   <title>search of {{ sample_name }} - chill-filter</title>
   {% include "_header.html" %}
@@ -8,12 +8,14 @@
 <main class="container">
 
 <h2>Sample search summary for: {{ sample_name }}</h2>
-
-total sample known: <b>{{ "{:.1f}".format(f_found * 100) }}%</b>
 <p>
-(This is an estimate for how much of your sequencing data will map to known reference genomes.)
+This looks like an assembly, right?
+<p>
+  Based on the results below, we estimate that at least
+  <b>{{ "{:.1f}".format(f_found * 100) }}%</b>
+  of your contigs will align to a known reference genome.
 <p>
-Your sample is {{ "{:.1f}".format((1 - f_found) * 100) }}% unknown. This is a combined estimate of genomic sequence not in our database, entirely new genomic sequence, SNPs/polymorphisms, or erroneous sequence.
+Your sample is about {{ "{:.1f}".format((1 - f_found) * 100) }}% unknown. This is likely to be novel sequence!
 
 <h2>Sample breakdown</h2>
 
@@ -29,13 +31,6 @@ <h2>Sample breakdown</h2>
 
 <p>
   
-<h2>TODO</h2>
-<ul>
-  <li> high abundance, low abundance split!
-</ul>
-
-<p>
-
 <a href='./'>Return to sample page</a>
   
 
diff --git a/doc/developer.md b/doc/developer.md
@@ -4,3 +4,7 @@ The client-side sketching code is sourced from
 [here](https://github.com/sourmash-bio/branchwater/tree/main/app/static)
 and the JavaScript is placed entirely in
 `chill_filter_web/templates/index.html`.
+
+## CSS
+
+We are using [pico](https://picocss.com/docs).
diff --git a/examples/Bu5.abund.k51.s100_000.sig.zip b/examples/Bu5.abund.k51.s100_000.sig.zip
diff --git a/examples/Bu5.flat.k51.s100_000.sig.zip b/examples/Bu5.flat.k51.s100_000.sig.zip