From b415c7cac46b6dfde35cde5c5c9b4601ec1f539f Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 20 May 2025 09:59:13 +0200 Subject: [PATCH 01/15] fix: use buffered and script based get-reads implementation --- workflow/envs/pysam.yaml | 3 ++- workflow/rules/common.smk | 7 ------- workflow/rules/download.smk | 15 ++++---------- workflow/scripts/get-reads.py | 39 +++++++++++++++++++++++++++++++++++ 4 files changed, 45 insertions(+), 19 deletions(-) create mode 100644 workflow/scripts/get-reads.py diff --git a/workflow/envs/pysam.yaml b/workflow/envs/pysam.yaml index 598f0f0..1ab0c09 100644 --- a/workflow/envs/pysam.yaml +++ b/workflow/envs/pysam.yaml @@ -5,4 +5,5 @@ channels: dependencies: - python =3.9 - pysam =0.18 - - pandas =1.3 \ No newline at end of file + - pandas =1.3 + - dnaio =1.2 \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 031de8c..ddcb917 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -334,13 +334,6 @@ def get_liftover_statement(wildcards, input, output): return f"> {output}" -def get_read_limit_param(wildcards, input): - if config.get("limit-reads"): - return "| head -n 110000" # a bit more than 100000 reads because we also have the header - else: - return "" - - def get_benchmark(benchmark): try: return benchmarks[benchmark] diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index f474a89..65be7aa 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -3,22 +3,15 @@ rule get_reads: r1="resources/reads/{benchmark}.1.fq", r2="resources/reads/{benchmark}.2.fq", params: - limit=get_read_limit_param, + limit=branch(lookup("limit-reads", within=config), then=100000, otherwise=None), bam_url=get_benchmark_bam_url, log: "logs/download-reads/{benchmark}.log", conda: - "../envs/tools.yaml" - resources: - sort_threads=lambda _, threads: max(threads - 2, 1), - threads: 32 + "../envs/pysam.yaml" retries: 3 - shell: - "(set +o pipefail; samtools view -f3 -h" - " {params.bam_url}" - " {params.limit} |" - " samtools sort -n -O BAM --threads {resources.sort_threads} | " - " samtools fastq -1 {output.r1} -2 {output.r2} -s /dev/null -0 /dev/null -) 2> {log}" + script: + "scripts/get-reads.py" rule get_archive: diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py new file mode 100644 index 0000000..01f6192 --- /dev/null +++ b/workflow/scripts/get-reads.py @@ -0,0 +1,39 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + +import pysam +import dnaio + + +def aln_to_fq(aln): + return dnaio.SequenceRecord( + name=aln.query_name, + sequence=aln.get_forward_sequence(), + quality=aln.query_qualities, + ) + + +limit = snakemake.params.limit +bam = pysam.AlignmentFile(snakemake.params.bam_url) + +buffer = {} +n_written = 0 +with dnaio.open(snakemake.output[0], snakemake.output[1], "w") as fqwriter: + for aln in bam: + if n_written >= limit: + break + if not aln.is_secondary or aln.is_supplementary: + continue + + qname = aln.query_name.removesuffix("/1").removesuffix("/2") + + mate_aln = buffer.get(aln.query_name) + if mate_aln is None: + buffer[aln.query_name] = aln + else: + if aln.is_read2: + aln, mate_aln = mate_aln, aln + + fqwriter.write(aln_to_fq(aln), aln_to_fq(mate_aln)) + n_written += 1 From 30b94d8fa289f316106632f76c8e1ad2928f88b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Tue, 20 May 2025 10:09:38 +0200 Subject: [PATCH 02/15] Update workflow/scripts/get-reads.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- workflow/scripts/get-reads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py index 01f6192..b3a8ee4 100644 --- a/workflow/scripts/get-reads.py +++ b/workflow/scripts/get-reads.py @@ -21,7 +21,7 @@ def aln_to_fq(aln): n_written = 0 with dnaio.open(snakemake.output[0], snakemake.output[1], "w") as fqwriter: for aln in bam: - if n_written >= limit: + if limit is not None and n_written >= limit: break if not aln.is_secondary or aln.is_supplementary: continue From 38ee6f517b5a814ed1e93b4a12d4562711840916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Tue, 20 May 2025 10:09:54 +0200 Subject: [PATCH 03/15] Update workflow/scripts/get-reads.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- workflow/scripts/get-reads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py index b3a8ee4..349285b 100644 --- a/workflow/scripts/get-reads.py +++ b/workflow/scripts/get-reads.py @@ -23,7 +23,7 @@ def aln_to_fq(aln): for aln in bam: if limit is not None and n_written >= limit: break - if not aln.is_secondary or aln.is_supplementary: + if aln.is_secondary or aln.is_supplementary: continue qname = aln.query_name.removesuffix("/1").removesuffix("/2") From 27bad317d2391cc060351a8f32338bdf53a6020e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Tue, 20 May 2025 10:10:18 +0200 Subject: [PATCH 04/15] Update workflow/rules/download.smk --- workflow/rules/download.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index 65be7aa..47a3be8 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -11,7 +11,7 @@ rule get_reads: "../envs/pysam.yaml" retries: 3 script: - "scripts/get-reads.py" + "../scripts/get-reads.py" rule get_archive: From 7f7a4c67991e996cdb4f977d441e41e2bdda8c6a Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 20 May 2025 11:49:52 +0200 Subject: [PATCH 05/15] mode --- workflow/scripts/get-reads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py index 349285b..0a5dfb6 100644 --- a/workflow/scripts/get-reads.py +++ b/workflow/scripts/get-reads.py @@ -19,7 +19,7 @@ def aln_to_fq(aln): buffer = {} n_written = 0 -with dnaio.open(snakemake.output[0], snakemake.output[1], "w") as fqwriter: +with dnaio.open(snakemake.output[0], snakemake.output[1], mode="w") as fqwriter: for aln in bam: if limit is not None and n_written >= limit: break From e063c9338687fc4bc533cec8ca7f394049d74f3b Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 20 May 2025 12:51:51 +0200 Subject: [PATCH 06/15] fixes --- workflow/scripts/get-reads.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py index 0a5dfb6..ac31981 100644 --- a/workflow/scripts/get-reads.py +++ b/workflow/scripts/get-reads.py @@ -6,11 +6,11 @@ import dnaio -def aln_to_fq(aln): +def aln_to_fq(qname, aln): return dnaio.SequenceRecord( - name=aln.query_name, + name=qname, sequence=aln.get_forward_sequence(), - quality=aln.query_qualities, + qualities=aln.query_qualities, ) @@ -28,12 +28,12 @@ def aln_to_fq(aln): qname = aln.query_name.removesuffix("/1").removesuffix("/2") - mate_aln = buffer.get(aln.query_name) + mate_aln = buffer.get(qname) if mate_aln is None: - buffer[aln.query_name] = aln + buffer[qname] = aln else: if aln.is_read2: aln, mate_aln = mate_aln, aln - fqwriter.write(aln_to_fq(aln), aln_to_fq(mate_aln)) + fqwriter.write(aln_to_fq(qname, aln), aln_to_fq(qname, mate_aln)) n_written += 1 From d0820d36c7b87e0db0ac0c6c7c585d1150316ddf Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 20 May 2025 13:51:03 +0200 Subject: [PATCH 07/15] fix --- workflow/scripts/get-reads.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py index ac31981..758264d 100644 --- a/workflow/scripts/get-reads.py +++ b/workflow/scripts/get-reads.py @@ -10,7 +10,9 @@ def aln_to_fq(qname, aln): return dnaio.SequenceRecord( name=qname, sequence=aln.get_forward_sequence(), - qualities=aln.query_qualities, + qualities="".join( + map(lambda qual: chr(qual + 33), aln.get_forward_qualities()) + ), ) From 0f7538fa033a6c7b22b43875dd5c221f60297ba7 Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 20 May 2025 13:55:37 +0200 Subject: [PATCH 08/15] remove read from buffer when writing fastq record --- workflow/scripts/get-reads.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py index 758264d..02fee15 100644 --- a/workflow/scripts/get-reads.py +++ b/workflow/scripts/get-reads.py @@ -28,6 +28,8 @@ def aln_to_fq(qname, aln): if aln.is_secondary or aln.is_supplementary: continue + # Some aligners (e.g. minimap2) add /1 and /2 to the read name. + # We remove them here to get the same name for both reads of a pair. qname = aln.query_name.removesuffix("/1").removesuffix("/2") mate_aln = buffer.get(qname) @@ -36,6 +38,7 @@ def aln_to_fq(qname, aln): else: if aln.is_read2: aln, mate_aln = mate_aln, aln + del buffer[qname] fqwriter.write(aln_to_fq(qname, aln), aln_to_fq(qname, mate_aln)) n_written += 1 From 11961b914ec79c0ea295a705469b1f4481c2a0f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20K=C3=B6ster?= Date: Tue, 20 May 2025 14:36:22 +0200 Subject: [PATCH 09/15] Update workflow/scripts/get-reads.py --- workflow/scripts/get-reads.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py index 02fee15..2f4ccb8 100644 --- a/workflow/scripts/get-reads.py +++ b/workflow/scripts/get-reads.py @@ -42,3 +42,6 @@ def aln_to_fq(qname, aln): fqwriter.write(aln_to_fq(qname, aln), aln_to_fq(qname, mate_aln)) n_written += 1 + +if buffer: + print(f"Warning: {len(buffer)} reads had no mate pairs and were skipped", file=sys.stderr) From 8c7c4a9717fcb4abc79236dd4124b64352a029e5 Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 20 May 2025 14:39:54 +0200 Subject: [PATCH 10/15] dbg --- workflow/rules/download.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index 47a3be8..744d9e5 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -168,7 +168,7 @@ rule samtools_faidx: log: "logs/samtools-faidx.log", wrapper: - "v1.7.2/bio/samtools/faidx" + "v6.2.0/bio/samtools/faidx" rule bwa_index: From 9cdd2c931ee612f9726e9c8470962a8ec1ce9fe5 Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 20 May 2025 15:25:55 +0200 Subject: [PATCH 11/15] fix lookup default --- config/config.yaml | 2 +- workflow/rules/download.smk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 7daf13a..6692f1b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -48,7 +48,7 @@ custom-benchmarks: # whether given target regions are on hg19 grch37: false -# Optional for testing: limit to first 10000 reads +# Optional for testing: limit to first 100000 reads limit-reads: false # whether evaluation shall happen on grch37 or grch38 diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index 744d9e5..ec5bc4b 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -3,7 +3,7 @@ rule get_reads: r1="resources/reads/{benchmark}.1.fq", r2="resources/reads/{benchmark}.2.fq", params: - limit=branch(lookup("limit-reads", within=config), then=100000, otherwise=None), + limit=branch(lookup("limit-reads", within=config, default=False), then=100000, otherwise=None), bam_url=get_benchmark_bam_url, log: "logs/download-reads/{benchmark}.log", From 3c3e5336ca8c2021a7aebab674ccb7bc1a0e4f7a Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 20 May 2025 15:27:03 +0200 Subject: [PATCH 12/15] assertion --- workflow/scripts/get-reads.py | 1 + 1 file changed, 1 insertion(+) diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py index 2f4ccb8..14a75a0 100644 --- a/workflow/scripts/get-reads.py +++ b/workflow/scripts/get-reads.py @@ -17,6 +17,7 @@ def aln_to_fq(qname, aln): limit = snakemake.params.limit +assert limit is None or limit > 0, "limit must be None or a positive integer" bam = pysam.AlignmentFile(snakemake.params.bam_url) buffer = {} From fbef952f29600e1709f59dba6021810112f8051d Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 20 May 2025 15:28:53 +0200 Subject: [PATCH 13/15] fix --- workflow/scripts/get-reads.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py index 14a75a0..c7b9a75 100644 --- a/workflow/scripts/get-reads.py +++ b/workflow/scripts/get-reads.py @@ -16,7 +16,7 @@ def aln_to_fq(qname, aln): ) -limit = snakemake.params.limit +limit = snakemake.params.limit or None assert limit is None or limit > 0, "limit must be None or a positive integer" bam = pysam.AlignmentFile(snakemake.params.bam_url) From 2e86719283247be95550befb0c1c61ad5c6adc59 Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Tue, 20 May 2025 22:52:37 +0200 Subject: [PATCH 14/15] fmt --- workflow/rules/download.smk | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index ec5bc4b..67fb720 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -3,7 +3,11 @@ rule get_reads: r1="resources/reads/{benchmark}.1.fq", r2="resources/reads/{benchmark}.2.fq", params: - limit=branch(lookup("limit-reads", within=config, default=False), then=100000, otherwise=None), + limit=branch( + lookup("limit-reads", within=config, default=False), + then=100000, + otherwise=None, + ), bam_url=get_benchmark_bam_url, log: "logs/download-reads/{benchmark}.log", From 7b2cbccf21bc0dc5969adb4945cb9b39f7ca5428 Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Wed, 21 May 2025 11:42:11 +0200 Subject: [PATCH 15/15] update wrapper --- workflow/rules/utils.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index 89df5a2..2c76bc1 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -17,4 +17,4 @@ rule index_bcf: log: "logs/bcftools-index-bcf/{prefix}.log", wrapper: - "v1.9.0/bio/bcftools/index" + "v6.2.0/bio/bcftools/index"