diff --git a/config/config.yaml b/config/config.yaml index 7daf13a..6692f1b 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -48,7 +48,7 @@ custom-benchmarks: # whether given target regions are on hg19 grch37: false -# Optional for testing: limit to first 10000 reads +# Optional for testing: limit to first 100000 reads limit-reads: false # whether evaluation shall happen on grch37 or grch38 diff --git a/workflow/envs/pysam.yaml b/workflow/envs/pysam.yaml index 598f0f0..1ab0c09 100644 --- a/workflow/envs/pysam.yaml +++ b/workflow/envs/pysam.yaml @@ -5,4 +5,5 @@ channels: dependencies: - python =3.9 - pysam =0.18 - - pandas =1.3 \ No newline at end of file + - pandas =1.3 + - dnaio =1.2 \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 031de8c..ddcb917 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -334,13 +334,6 @@ def get_liftover_statement(wildcards, input, output): return f"> {output}" -def get_read_limit_param(wildcards, input): - if config.get("limit-reads"): - return "| head -n 110000" # a bit more than 100000 reads because we also have the header - else: - return "" - - def get_benchmark(benchmark): try: return benchmarks[benchmark] diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index f474a89..67fb720 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -3,22 +3,19 @@ rule get_reads: r1="resources/reads/{benchmark}.1.fq", r2="resources/reads/{benchmark}.2.fq", params: - limit=get_read_limit_param, + limit=branch( + lookup("limit-reads", within=config, default=False), + then=100000, + otherwise=None, + ), bam_url=get_benchmark_bam_url, log: "logs/download-reads/{benchmark}.log", conda: - "../envs/tools.yaml" - resources: - sort_threads=lambda _, threads: max(threads - 2, 1), - threads: 32 + "../envs/pysam.yaml" retries: 3 - shell: - "(set +o pipefail; samtools view -f3 -h" - " {params.bam_url}" - " {params.limit} |" - " samtools sort -n -O BAM --threads {resources.sort_threads} | " - " samtools fastq -1 {output.r1} -2 {output.r2} -s /dev/null -0 /dev/null -) 2> {log}" + script: + "../scripts/get-reads.py" rule get_archive: @@ -175,7 +172,7 @@ rule samtools_faidx: log: "logs/samtools-faidx.log", wrapper: - "v1.7.2/bio/samtools/faidx" + "v6.2.0/bio/samtools/faidx" rule bwa_index: diff --git a/workflow/rules/utils.smk b/workflow/rules/utils.smk index 89df5a2..2c76bc1 100644 --- a/workflow/rules/utils.smk +++ b/workflow/rules/utils.smk @@ -17,4 +17,4 @@ rule index_bcf: log: "logs/bcftools-index-bcf/{prefix}.log", wrapper: - "v1.9.0/bio/bcftools/index" + "v6.2.0/bio/bcftools/index" diff --git a/workflow/scripts/get-reads.py b/workflow/scripts/get-reads.py new file mode 100644 index 0000000..c7b9a75 --- /dev/null +++ b/workflow/scripts/get-reads.py @@ -0,0 +1,48 @@ +import sys + +sys.stderr = open(snakemake.log[0], "w") + +import pysam +import dnaio + + +def aln_to_fq(qname, aln): + return dnaio.SequenceRecord( + name=qname, + sequence=aln.get_forward_sequence(), + qualities="".join( + map(lambda qual: chr(qual + 33), aln.get_forward_qualities()) + ), + ) + + +limit = snakemake.params.limit or None +assert limit is None or limit > 0, "limit must be None or a positive integer" +bam = pysam.AlignmentFile(snakemake.params.bam_url) + +buffer = {} +n_written = 0 +with dnaio.open(snakemake.output[0], snakemake.output[1], mode="w") as fqwriter: + for aln in bam: + if limit is not None and n_written >= limit: + break + if aln.is_secondary or aln.is_supplementary: + continue + + # Some aligners (e.g. minimap2) add /1 and /2 to the read name. + # We remove them here to get the same name for both reads of a pair. + qname = aln.query_name.removesuffix("/1").removesuffix("/2") + + mate_aln = buffer.get(qname) + if mate_aln is None: + buffer[qname] = aln + else: + if aln.is_read2: + aln, mate_aln = mate_aln, aln + del buffer[qname] + + fqwriter.write(aln_to_fq(qname, aln), aln_to_fq(qname, mate_aln)) + n_written += 1 + +if buffer: + print(f"Warning: {len(buffer)} reads had no mate pairs and were skipped", file=sys.stderr)