snakemake-workflows · johanneskoester · May 22, 2025 · May 20, 2025 · May 20, 2025 · May 20, 2025
diff --git a/workflow/envs/pysam.yaml b/workflow/envs/pysam.yaml
@@ -5,4 +5,5 @@ channels:
 dependencies:
   - python =3.9
   - pysam =0.18
-  - pandas =1.3
+  - pandas =1.3
+  - dnaio =1.2
diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk
@@ -334,13 +334,6 @@ def get_liftover_statement(wildcards, input, output):
         return f"> {output}"
 
 
-def get_read_limit_param(wildcards, input):
-    if config.get("limit-reads"):
-        return "| head -n 110000"  # a bit more than 100000 reads because we also have the header
-    else:
-        return ""
-
-
 def get_benchmark(benchmark):
     try:
         return benchmarks[benchmark]

diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk
@@ -3,22 +3,15 @@ rule get_reads:
         r1="resources/reads/{benchmark}.1.fq",
         r2="resources/reads/{benchmark}.2.fq",
     params:
-        limit=get_read_limit_param,
+        limit=branch(lookup("limit-reads", within=config), then=100000, otherwise=None),
         bam_url=get_benchmark_bam_url,
     log:
         "logs/download-reads/{benchmark}.log",
     conda:
-        "../envs/tools.yaml"
-    resources:
-        sort_threads=lambda _, threads: max(threads - 2, 1),
-    threads: 32
+        "../envs/pysam.yaml"
     retries: 3
-    shell:
-        "(set +o pipefail; samtools view -f3 -h"
-        " {params.bam_url}"
-        " {params.limit} |"
-        " samtools sort -n -O BAM --threads {resources.sort_threads} | "
-        " samtools fastq -1 {output.r1} -2 {output.r2} -s /dev/null -0 /dev/null -) 2> {log}"
+    script:
+        "../scripts/get-reads.py"
 
 
 rule get_archive:

diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py
@@ -0,0 +1,73 @@
+import sys
+
+sys.stderr = open(snakemake.log[0], "w")
+
+import pysam
+import dnaio
+
+
+def aln_to_fq(qname, aln):
+    """Convert an alignment into a FASTQ record in original read orientation.
+
+    qname becomes the record name. Sequence and qualities are read through
+    pysam's get_forward_* accessors, i.e. as reported by the sequencer and
+    not as stored for reverse-strand alignments.
+    """
+    return dnaio.SequenceRecord(
+        name=qname,
+        sequence=aln.get_forward_sequence(),
+        # FASTQ stores Phred scores as ASCII characters with an offset of 33.
+        qualities="".join(chr(qual + 33) for qual in aln.get_forward_qualities()),
+    )
+
+
+def normalize_limit(limit):
+    """Return the configured read pair limit as an int, or None for no limit.
+
+    An empty list is accepted as "no limit" as well.
+    NOTE(review): Snakemake's branch() appears to turn a None result into an
+    empty list; confirm against the Snakemake version in use.
+    """
+    if limit is None or limit == []:
+        return None
+    return int(limit)
+
+
+limit = normalize_limit(snakemake.params.limit)
+
+# Primary alignments still waiting for their mate, keyed by read name.
+buffer = {}
+n_written = 0
+# Context managers close the BAM handle and the FASTQ writers even if the
+# conversion fails halfway.
+with pysam.AlignmentFile(snakemake.params.bam_url) as bam, dnaio.open(
+    snakemake.output[0], snakemake.output[1], mode="w"
+) as fqwriter:
+    for aln in bam:
+        if limit is not None and n_written >= limit:
+            break
+        if aln.is_secondary or aln.is_supplementary:
+            continue
+
+        # Some aligners (e.g. minimap2) add /1 and /2 to the read name.
+        # We remove them here to get the same name for both reads of a pair.
+        qname = aln.query_name.removesuffix("/1").removesuffix("/2")
+
+        mate_aln = buffer.pop(qname, None)
+        if mate_aln is None:
+            buffer[qname] = aln
+            continue
+
+        # Make sure read 1 goes to the first and read 2 to the second file.
+        if aln.is_read2:
+            aln, mate_aln = mate_aln, aln
+        fqwriter.write(aln_to_fq(qname, aln), aln_to_fq(qname, mate_aln))
+        n_written += 1
+
+# NOTE: if the limit ended the loop early, buffered reads may just not have
+# reached their mate yet.
+if buffer:
+    print(
+        f"Warning: {len(buffer)} reads had no mate pairs and were skipped",
+        file=sys.stderr,
+    )