From e6a0a166c057dbca2a3e9f07cdaa4adfdf730dd8 Mon Sep 17 00:00:00 2001 From: Sam W Date: Mon, 19 Aug 2024 12:40:29 +0100 Subject: [PATCH 01/11] 1.4.0 pre-release version (#135) * Deprecate old tools and black format * Support fragmented data * use new trim primer arg in minion * reverse the (revolting) primer position logic * Fix find primer logic * Improve the cursed report writer logic * explicitly named function args, what a novel idea * add temp debugging prints to find_primer * Add logic to handle segments outside of primer scheme * fix typo * General bugfixes, test fix for overlapping variant issue * add min mapq argument * properly provide min_mapq to handle_segment * Fix unmatched read group issue * Fix 8 bit int error * First attempt to integrate clair3 as optional variant caller * Slightly change env yaml format * Specify env name in mamba gh action * Checkout repo before installing env * python 3.6.15 * Run on ubuntu 20.04 * unpin python * Unpin python entirely * Separate out tests * correct unit test path * remove retvar hints for compatibility * Fix silly relative imports * Switch to cyvcf and fix unit tests * Fix pipeline parse unit test * Fix fstring * Large changes to make clair3 work, various unit test updates * Fix fuzzy primer site match logic * Fix minion_validator path * Fix vcf merge bug with empty VCF files * Increase fuzzy match default threshold to 20 * Fix CHROM bug in vcf merge properly * Report failed workflow when variants are missed * Fix vcf merge again * Separate out tests * No counter anymore * Add C241T to SP1 variants * Add limit on ref pileup * Single copy of CVR1 12733 var * re-enable other consensus tests * First attempt to make minion validator less fragile * Unify testvariants, improve test logic * medaka has to be different smh * Index ref fasta when using the clair3 workflow * Add mean amplicon depth TSV output * Update CLI and scheme fetcher * Update align trim unit test * Use proper version of V3 bedfile * Fix pipeline unit test * Fix align trim amp depth bug * Call variants on non primertrimmed bam * Don't test MT vs clair * properly disable test * Add docker build push action on release * Grab wget in dockerfile * Update docs and readme * docker push on release only * Remove debugging print statement * Make fuzzy primer match logic clearer * Install procps for nxf compatibility --- .github/workflows/docker-build-push.yml | 39 ++ .github/workflows/unittests.yml | 45 ++ .gitignore | 14 +- .travis.yml | 38 -- Dockerfile | 28 + README.md | 18 +- artic/.gitignore | 3 + artic/align_trim.py | 519 +++++++++++---- artic/artic_mqc.py | 109 ++- artic/{ => deprecated}/align_trim_n.py | 0 artic/{ => deprecated}/basecaller.py | 0 artic/{ => deprecated}/conftest.py | 0 artic/{ => deprecated}/demultiplex.py | 0 artic/{ => deprecated}/extract.py | 0 artic/{ => deprecated}/gather.py | 0 artic/{ => deprecated}/margin_cons_medaka.py | 97 +-- artic/deprecated/setup.py | 48 ++ artic/export.py | 86 ++- artic/fasta_header.py | 14 +- artic/filter_reads.py | 34 +- artic/guppyplex.py | 53 +- artic/make_depth_mask.py | 74 ++- artic/margin_cons.py | 91 ++- artic/mask.py | 48 +- artic/minion.py | 431 ++++++------ artic/minion_validator.py | 331 ---------- artic/pipeline.py | 349 ++++++---- artic/rampart.py | 124 ++-- artic/runs.py | 20 +- artic/utils.py | 625 ++++++++++++++++++ artic/vcf_filter.py | 77 +-- artic/vcf_merge.py | 103 ++- artic/vcfextract.py | 52 +- artic/vcftagprimersites.py | 157 ----- artic/version.py | 2 +- docs/commands.md | 70 +- docs/faq.md | 4 +- 
docs/installation.md | 16 +- environment.yml | 42 +- requirements.txt | 8 - setup.cfg | 39 ++ setup.py | 52 +- .../IturiEBOV/V1/IturiEBOV.scheme.bed | 244 +++---- .../nCoV-2019/V1/nCoV-2019.scheme.bed | 392 +++++------ test-runner.sh | 105 ++- {artic => tests}/align_trim_unit_test.py | 133 ++-- tests/get_scheme_unit_test.py | 34 + tests/minion_validator.py | 379 +++++++++++ {artic => tests}/pipeline_unit_test.py | 14 +- .../utils_unit_test.py | 40 +- 50 files changed, 3166 insertions(+), 2035 deletions(-) create mode 100644 .github/workflows/docker-build-push.yml create mode 100644 .github/workflows/unittests.yml delete mode 100644 .travis.yml create mode 100644 Dockerfile create mode 100644 artic/.gitignore rename artic/{ => deprecated}/align_trim_n.py (100%) rename artic/{ => deprecated}/basecaller.py (100%) rename artic/{ => deprecated}/conftest.py (100%) rename artic/{ => deprecated}/demultiplex.py (100%) rename artic/{ => deprecated}/extract.py (100%) rename artic/{ => deprecated}/gather.py (100%) rename artic/{ => deprecated}/margin_cons_medaka.py (58%) create mode 100644 artic/deprecated/setup.py delete mode 100644 artic/minion_validator.py create mode 100644 artic/utils.py delete mode 100755 artic/vcftagprimersites.py delete mode 100644 requirements.txt create mode 100644 setup.cfg rename {artic => tests}/align_trim_unit_test.py (60%) create mode 100644 tests/get_scheme_unit_test.py create mode 100644 tests/minion_validator.py rename {artic => tests}/pipeline_unit_test.py (78%) rename artic/vcftagprimersites_unit_test.py => tests/utils_unit_test.py (52%) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml new file mode 100644 index 00000000..6c9f8827 --- /dev/null +++ b/.github/workflows/docker-build-push.yml @@ -0,0 +1,39 @@ +name: quay_build_push + +on: + release: + types: [published] + +jobs: + docker: + runs-on: ubuntu-latest + steps: + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: quay.io/artic-network/fieldbioinformatics + flavor: | + latest=true + tags: | + type=semver,pattern={{version}} + - name: Checkout + uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_PASSWORD }} + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: . + platforms: linux/amd64 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} \ No newline at end of file diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml new file mode 100644 index 00000000..8a9c8abe --- /dev/null +++ b/.github/workflows/unittests.yml @@ -0,0 +1,45 @@ +name: fieldbioinformatics_unit_tests + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v3 + - uses: mamba-org/setup-micromamba@v1 + name: Setup dependencies + with: + environment-file: environment.yml + init-shell: >- + bash + cache-environment: true + post-cleanup: "all" + + - name: Pip install artic + run: | + python3 -m pip install -e . 
+ shell: micromamba-shell {0} + + - name: Run unit tests + run: | + pytest -s tests/*_unit_test.py + shell: micromamba-shell {0} + + - name: Run medaka test + run: | + ./test-runner.sh medaka + shell: micromamba-shell {0} + + - name: Run clair3 test + run: | + ./test-runner.sh clair3 + shell: micromamba-shell {0} + + - name: Run minion_validator tests + run: | + pytest -s tests/minion_validator.py + shell: micromamba-shell {0} + diff --git a/.gitignore b/.gitignore index f3259711..c61e3270 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,14 @@ dist/ # test data *.index* -CVR1/ -NRW01/ -SP1/ \ No newline at end of file +CVR1.* +NRW01.* +SP1.* +MT007544* +tmp/ +test-data/CVR1/ +test-data/NRW01/ +test-data/SP1/ +test-data/MT007544/ +rg_* +primer-schemes/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 8289448f..00000000 --- a/.travis.yml +++ /dev/null @@ -1,38 +0,0 @@ -dist: xenial -language: python # this works for Linux but is an error on macOS or Windows -jobs: - include: - - name: "Python 3.6 on Xenial Linux" - python: 3.6 # this works for Linux but is ignored on macOS or Windows - - name: "Python 3.6 on macOS" - os: osx - language: shell # 'language: python' is an error on Travis CI macOS - python: 3.6 - -install: - - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then MINICONDA_OS=Linux; else MINICONDA_OS=MacOSX; fi - - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-$MINICONDA_OS-x86_64.sh -O miniconda.sh - - bash miniconda.sh -b -p "$HOME"/miniconda - - source "$HOME/miniconda/etc/profile.d/conda.sh" - - hash -r - - conda config --set always_yes yes --set changeps1 no - - conda update -q conda - - conda info -a - - conda env create -f environment.yml - - conda activate artic - - python setup.py install - -script: - # run the unit tests - - pytest -s artic/*_unit_test.py - - # run the pipeline test for the medaka pipeline variant - - ./test-runner.sh medaka - - # run the pipeline test for the nanopolish pipeline variant - - ./test-runner.sh nanopolish - - # run some of the variant set validation functional tests (will download test data) - - pytest -s artic/minion_validator.py --workFlow nanopolish --numValidations 1 - - pytest -s artic/minion_validator.py --workFlow medaka --numValidations 2 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..272a109a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +FROM mambaorg/micromamba:1.5.8 + +COPY . ./fieldbioinformatics/ + +USER root + +RUN apt-get update && apt-get install -y --no-install-recommends build-essential wget procps + +USER $MAMBA_USER + +COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/env.yml + +RUN sed -i 's/name: artic/name: base/' /tmp/env.yml + +RUN micromamba install --yes --file /tmp/env.yml && \ + micromamba clean --all --yes + +ARG MAMBA_DOCKERFILE_ACTIVATE=1 + +USER root + +RUN python3 -m pip install ./fieldbioinformatics + +USER $MAMBA_USER + +ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/README.md b/README.md index 48223c83..a2d5cf9a 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Features include: There are **2 workflows** baked into this pipeline, one which uses signal data (via [nanopolish](https://github.com/jts/nanopolish)) and one that does not (via [medaka](https://github.com/nanoporetech/medaka)). -## Installation + ### Via source #### 1. 
downloading the source: -Download a [release](https://github.com/artic-network/fieldbioinformatics/releases) or use the latest master (which tracks the current release): + + +Clone the repository, then check out the 1.4.0-dev branch to test the 1.4.0 pre-release. ```sh git clone https://github.com/artic-network/fieldbioinformatics cd fieldbioinformatics +git checkout 1.4.0-dev ``` #### 2. installing dependencies: -The `artic pipeline` has several [software dependencies](https://github.com/artic-network/fieldbioinformatics/blob/master/environment.yml). You can solve these dependencies using the minimal conda environment we have provided: +The `artic pipeline` has several [software dependencies](https://github.com/artic-network/fieldbioinformatics/blob/master/environment.yml). You can solve these dependencies using the minimal conda environment we have provided. Please make sure you are using either mamba or conda version >= 23.10.0, where the libmamba solver was made the default conda solver: ```sh conda env create -f environment.yml @@ -64,7 +64,7 @@ conda activate artic #### 3. installing the pipeline: ```sh -python setup.py install +pip install . ``` #### 4. testing the pipeline: diff --git a/artic/.gitignore b/artic/.gitignore new file mode 100644 index 00000000..096b5eb5 --- /dev/null +++ b/artic/.gitignore @@ -0,0 +1,3 @@ +# pixi environments +.pixi +*.egg-info diff --git a/artic/align_trim.py b/artic/align_trim.py index 85074db2..2a1b3f69 100755 --- a/artic/align_trim.py +++ b/artic/align_trim.py @@ -1,12 +1,13 @@ #!/usr/bin/env python -# Written by Nick Loman - from copy import copy -from collections import defaultdict +import csv import pysam import sys -from .vcftagprimersites import read_bed_file +import numpy as np +import random +import argparse +from artic.utils import read_bed_file # consumesReference lookup for if a CIGAR operation consumes the reference sequence consumesReference = [True, False, True, True, False, False, False, True] @@ -15,7 +16,7 @@ consumesQuery = [True, True, False, False, True, False, False, True] -def find_primer(bed, pos, direction): +def find_primer(bed, pos, direction, threshold=20): """Given a reference position and a direction of travel, walk out and find the nearest primer site.
Parameters @@ -29,17 +30,33 @@ def find_primer(bed, pos, direction): Returns ------- - tuple - The offset, distance and bed entry for the closest primer to the query position + tuple[int, int, dict] | bool + A tuple containing the distance to the primer, the relative position of the primer, and the primer site, or False if no primer found """ from operator import itemgetter - if direction == '+': - closest = min([(abs(p['start'] - pos), p['start'] - pos, p) - for p in bed if p['direction'] == direction], key=itemgetter(0)) + if direction == "+": + primer_distances = [ + (abs(p["start"] - pos), p["start"] - pos, p) + for p in bed + if (p["direction"] == direction) and (pos >= (p["start"] - threshold)) + ] + else: - closest = min([(abs(p['end'] - pos), p['end'] - pos, p) - for p in bed if p['direction'] == direction], key=itemgetter(0)) + primer_distances = [ + (abs(p["end"] - pos), p["end"] - pos, p) + for p in bed + if (p["direction"] == direction) and (pos <= (p["end"] + threshold)) + ] + + if not primer_distances: + return False + + closest = min( + primer_distances, + key=itemgetter(0), + ) + return closest @@ -80,18 +97,20 @@ def trim(segment, primer_pos, end, debug): print("Chomped a %s, %s" % (flag, length), file=sys.stderr) except IndexError: print( - "Ran out of cigar during soft masking - completely masked read will be ignored", file=sys.stderr) + "Ran out of cigar during soft masking - completely masked read will be ignored", + file=sys.stderr, + ) break # if the CIGAR operation consumes the reference sequence, increment/decrement the position by the CIGAR operation length - if (consumesReference[flag]): + if consumesReference[flag]: if not end: pos += length else: pos -= length # if the CIGAR operation consumes the query sequence, increment the number of CIGAR operations eaten by the CIGAR operation length - if (consumesQuery[flag]): + if consumesQuery[flag]: eaten += length # stop processing the CIGAR if we've gone far enough to mask the primer @@ -125,7 +144,9 @@ def trim(segment, primer_pos, end, debug): if cigar[0][0] == 2: if debug: print( - "softmask created a leading deletion in the CIGAR, shuffling the alignment", file=sys.stderr) + "softmask created a leading deletion in the CIGAR, shuffling the alignment", + file=sys.stderr, + ) while 1: if cigar[0][0] != 2: break @@ -146,6 +167,271 @@ def trim(segment, primer_pos, end, debug): return +def handle_segment( + segment: pysam.AlignedSegment, + bed: dict, + args: argparse.Namespace, + min_mapq: int, + report_writer: csv.DictWriter = False, +): + """Handle the alignment segment including + + Args: + segment (pysam.AlignedSegment): The alignment segment to process + bed (dict): The primer scheme + reportfh (typing.IO): The report file handle + args (argparse.Namespace): The command line arguments + + Returns: + tuple [int, pysam.AlignedSegment] | bool: A tuple containing the amplicon number and the alignment segment, or False if the segment is to be skipped + """ + + # filter out unmapped and supplementary alignment segments + if segment.is_unmapped: + print("%s skipped as unmapped" % (segment.query_name), file=sys.stderr) + return False + if segment.is_supplementary: + print("%s skipped as supplementary" % (segment.query_name), file=sys.stderr) + return False + if segment.mapping_quality < min_mapq: + print( + "%s skipped as mapping quality below threshold" % (segment.query_name), + file=sys.stderr, + ) + return False + + # locate the nearest primers to this alignment segment + p1 = find_primer(bed, segment.reference_start, 
"+", args.primer_match_threshold) + p2 = find_primer(bed, segment.reference_end, "-", args.primer_match_threshold) + + if not p1 or not p2: + print( + "%s skipped as no primer found for segment" % (segment.query_name), + file=sys.stderr, + ) + return False + + # check if primers are correctly paired and then assign read group + # NOTE: removed this as a function as only called once + # TODO: will try improving this / moving it to the primer scheme processing code + correctly_paired = p1[2]["Primer_ID"].replace("_LEFT", "") == p2[2][ + "Primer_ID" + ].replace("_RIGHT", "") + + if not args.no_read_groups: + if correctly_paired: + segment.set_tag("RG", p1[2]["PoolName"]) + else: + segment.set_tag("RG", "unmatched") + + # get the amplicon number + amplicon = p1[2]["Primer_ID"].split("_")[1] + + if args.report: + # update the report with this alignment segment + primer details + report = { + "QueryName": segment.query_name, + "ReferenceStart": segment.reference_start, + "ReferenceEnd": segment.reference_end, + "PrimerPair": f"{p1[2]['Primer_ID']}_{p2[2]['Primer_ID']}", + "Primer1": p1[2]["Primer_ID"], + "Primer1Start": abs(p1[1]), + "Primer2": p2[2]["Primer_ID"], + "Primer2Start": abs(p2[1]), + "IsSecondary": segment.is_secondary, + "IsSupplementary": segment.is_supplementary, + "Start": p1[2]["start"], + "End": p2[2]["end"], + "CorrectlyPaired": correctly_paired, + } + + report_writer.writerow(report) + + if args.remove_incorrect_pairs and not correctly_paired: + print( + "%s skipped as not correctly paired" % (segment.query_name), + file=sys.stderr, + ) + return False + + if args.verbose: + # Dont screw with the order of the dict + report_str = "\t".join(str(x) for x in report.values()) + print(report_str, file=sys.stderr) + + # get the primer positions + if args.trim_primers: + p1_position = p1[2]["end"] + p2_position = p2[2]["start"] + else: + p1_position = p1[2]["start"] + p2_position = p2[2]["end"] + + # softmask the alignment if left primer start/end inside alignment + if segment.reference_start < p1_position: + try: + trim(segment, p1_position, False, args.verbose) + if args.verbose: + print( + "ref start %s >= primer_position %s" + % (segment.reference_start, p1_position), + file=sys.stderr, + ) + except Exception as e: + print( + "problem soft masking left primer in {} (error: {}), skipping".format( + segment.query_name, e + ), + file=sys.stderr, + ) + return False + + # softmask the alignment if right primer start/end inside alignment + if segment.reference_end > p2_position: + try: + trim(segment, p2_position, True, args.verbose) + if args.verbose: + print( + "ref start %s >= primer_position %s" + % (segment.reference_start, p2_position), + file=sys.stderr, + ) + except Exception as e: + print( + "problem soft masking right primer in {} (error: {}), skipping".format( + segment.query_name, e + ), + file=sys.stderr, + ) + return False + + # check the the alignment still contains bases matching the reference + if "M" not in segment.cigarstring: + print( + "%s dropped as does not match reference post masking" + % (segment.query_name), + file=sys.stderr, + ) + return False + + return (amplicon, segment) + + +def generate_amplicons(bed: list): + """Generate a dictionary of amplicons from a primer scheme list (generated by vcftagprimersites/read_bed_file) + + Args: + bed (list): A list of dictionaries, where each dictionary contains a row of bedfile data (generated by vcftagprimersites/read_bed_file), assumes that all redundant primers have been expanded + + Raises: + ValueError: Primer 
direction not recognised + + Returns: + dict: A dictionary of amplicons, where each key is the amplicon number and the value is a dictionary containing the primer start, primer end, insert start, insert end, length and circularity + """ + + amplicons = {} + for primer in bed: + + amplicon = primer["Primer_ID"].split("_")[1] + + amplicons.setdefault(amplicon, {}) + + if primer["direction"] == "+": + amplicons[amplicon]["p_start"] = primer["start"] + amplicons[amplicon]["start"] = primer["end"] + 1 + + elif primer["direction"] == "-": + amplicons[amplicon]["p_end"] = primer["end"] + amplicons[amplicon]["end"] = primer["start"] - 1 + + else: + raise ValueError("Primer direction not recognised") + + for amplicon in amplicons: + if not all([x in amplicons[amplicon] for x in ["p_start", "p_end"]]): + raise ValueError(f"Primer scheme for amplicon {amplicon} is incomplete") + + # Check if primer runs accross reference start / end -> circular virus + amplicons[amplicon]["circular"] = ( + amplicons[amplicon]["p_start"] > amplicons[amplicon]["p_end"] + ) + + # Calculate amplicon length considering that the "length" may be negative if the genome is circular + amplicons[amplicon]["length"] = abs( + amplicons[amplicon]["p_end"] - amplicons[amplicon]["p_start"] + ) + + return amplicons + + +def normalise(trimmed_segments: dict, normalise: int, bed: list): + """Normalise the depth of the trimmed segments to a given value. Perform per-amplicon normalisation using numpy vector maths to determine whether the segment in question would take the depth closer to the desired depth accross the amplicon. + + Args: + trimmed_segments (dict): Dict containing amplicon number as key and list of pysam.AlignedSegment as value + normalise (int): Desired normalised depth + bed (list): Primer scheme list (generated by vcftagprimersites/read_bed_file) + trim_primers (bool): Whether to trim primers from the reads + + Raises: + ValueError: Amplicon assigned to segment not found in primer scheme file + + Returns: + list : List of pysam.AlignedSegment to output + """ + + amplicons = generate_amplicons(bed) + + output_segments = [] + + mean_depths = {x: 0 for x in amplicons} + + for amplicon, segments in trimmed_segments.items(): + if amplicon not in amplicons: + raise ValueError(f"Segment {amplicon} not found in primer scheme file") + + desired_depth = np.full_like( + (amplicons[amplicon]["length"],), normalise, dtype=int + ) + + amplicon_depth = np.zeros((amplicons[amplicon]["length"],), dtype=int) + + if not segments: + print( + f"No segments assigned to amplicon {amplicon}, skipping", + file=sys.stderr, + ) + continue + + random.shuffle(segments) + + distance = np.mean(np.abs(amplicon_depth - desired_depth)) + + for segment in segments: + test_depths = np.copy(amplicon_depth) + + relative_start = segment.reference_start - amplicons[amplicon]["p_start"] + + if relative_start < 0: + relative_start = 0 + + relative_end = segment.reference_end - amplicons[amplicon]["p_start"] + + test_depths[relative_start:relative_end] += 1 + + test_distance = np.mean(np.abs(test_depths - desired_depth)) + + if test_distance < distance: + amplicon_depth = test_depths + distance = test_distance + output_segments.append(segment) + + mean_depths[amplicon] = np.mean(amplicon_depth) + + return output_segments, mean_depths + + def go(args): """Filter and soft mask an alignment file so that the alignment boundaries match the primer start and end sites. 
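The greedy per-amplicon normalisation introduced in the new `normalise()` function above can be hard to follow inside the diff, so here is a minimal standalone sketch of the same selection idea under simplified assumptions: reads are plain `(start, end)` intervals in amplicon coordinates rather than `pysam.AlignedSegment` objects, and the helper name `normalise_amplicon` plus the toy numbers are illustrative only, not part of the patch.

```python
# Minimal sketch of the greedy normalisation idea, assuming toy (start, end)
# read intervals in amplicon coordinates (illustrative only).
import random
import numpy as np


def normalise_amplicon(reads, amplicon_length, target_depth):
    """Keep a read only if adding it moves the depth profile closer to target_depth."""
    depth = np.zeros(amplicon_length, dtype=int)
    desired = np.full(amplicon_length, target_depth, dtype=int)
    kept = []
    random.shuffle(reads)
    # mean absolute deviation between current and desired depth
    distance = np.mean(np.abs(depth - desired))
    for start, end in reads:
        trial = depth.copy()
        trial[max(start, 0):end] += 1
        trial_distance = np.mean(np.abs(trial - desired))
        if trial_distance < distance:
            # accepting this read brings coverage closer to the target, so keep it
            depth, distance = trial, trial_distance
            kept.append((start, end))
    return kept, depth.mean()


# Toy example: 30 identical full-length reads over a 400 bp amplicon, target depth 5.
kept, mean_depth = normalise_amplicon([(0, 400)] * 30, 400, 5)
print(len(kept), mean_depth)  # expect 5 reads kept, mean depth ~5
```

Because a read is only accepted when it reduces the mean absolute deviation from the target depth, coverage is capped near the `--normalise` value on each amplicon without any re-sorting of the shuffled reads.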
@@ -154,119 +440,90 @@ def go(args): # prepare the report outfile if args.report: reportfh = open(args.report, "w") - print("QueryName\tReferenceStart\tReferenceEnd\tPrimerPair\tPrimer1\tPrimer1Start\tPrimer2\tPrimer2Start\tIsSecondary\tIsSupplementary\tStart\tEnd\tCorrectlyPaired", file=reportfh) - - # set up a counter to track amplicon abundance - counter = defaultdict(int) + report_headers = [ + "QueryName", + "ReferenceStart", + "ReferenceEnd", + "PrimerPair", + "Primer1", + "Primer1Start", + "Primer2", + "Primer2Start", + "IsSecondary", + "IsSupplementary", + "Start", + "End", + "CorrectlyPaired", + ] + report_writer = csv.DictWriter(reportfh, fieldnames=report_headers) + report_writer.writeheader() # open the primer scheme and get the pools bed = read_bed_file(args.bedfile) - pools = set([row['PoolName'] for row in bed]) - pools.add('unmatched') + pools = set([row["PoolName"] for row in bed]) + pools.add("unmatched") # open the input SAM file and process read groups infile = pysam.AlignmentFile("-", "rb") bam_header = infile.header.copy().to_dict() if not args.no_read_groups: - bam_header['RG'] = [] + bam_header["RG"] = [] for pool in pools: read_group = {} - read_group['ID'] = pool - bam_header['RG'].append(read_group) + read_group["ID"] = pool + bam_header["RG"].append(read_group) # prepare the alignment outfile outfile = pysam.AlignmentFile("-", "wh", header=bam_header) + trimmed_segments = {} + # iterate over the alignment segments in the input SAM file for segment in infile: - - # filter out unmapped and supplementary alignment segments - if segment.is_unmapped: - print("%s skipped as unmapped" % - (segment.query_name), file=sys.stderr) - continue - if segment.is_supplementary: - print("%s skipped as supplementary" % - (segment.query_name), file=sys.stderr) - continue - - # locate the nearest primers to this alignment segment - p1 = find_primer(bed, segment.reference_start, '+') - p2 = find_primer(bed, segment.reference_end, '-') - - # check if primers are correctly paired and then assign read group - # NOTE: removed this as a function as only called once - # TODO: will try improving this / moving it to the primer scheme processing code - correctly_paired = p1[2]['Primer_ID'].replace( - '_LEFT', '') == p2[2]['Primer_ID'].replace('_RIGHT', '') - if not args.no_read_groups: - if correctly_paired: - segment.set_tag('RG', p1[2]['PoolName']) - else: - segment.set_tag('RG', 'unmatched') - if args.remove_incorrect_pairs and not correctly_paired: - print("%s skipped as not correctly paired" % - (segment.query_name), file=sys.stderr) - continue - - # update the report with this alignment segment + primer details - report = "%s\t%s\t%s\t%s_%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d" % (segment.query_name, segment.reference_start, segment.reference_end, p1[2]['Primer_ID'], p2[2]['Primer_ID'], p1[2]['Primer_ID'], abs( - p1[1]), p2[2]['Primer_ID'], abs(p2[1]), segment.is_secondary, segment.is_supplementary, p1[2]['start'], p2[2]['end'], correctly_paired) if args.report: - print(report, file=reportfh) - if args.verbose: - print(report, file=sys.stderr) - - # get the primer positions - if args.start: - p1_position = p1[2]['start'] - p2_position = p2[2]['end'] + trimming_tuple = handle_segment( + segment=segment, + bed=bed, + args=args, + report_writer=report_writer, + min_mapq=args.min_mapq, + ) else: - p1_position = p1[2]['end'] - p2_position = p2[2]['start'] - - # softmask the alignment if left primer start/end inside alignment - if segment.reference_start < p1_position: - try: - trim(segment, 
p1_position, False, args.verbose) - if args.verbose: - print("ref start %s >= primer_position %s" % - (segment.reference_start, p1_position), file=sys.stderr) - except Exception as e: - print("problem soft masking left primer in {} (error: {}), skipping" .format( - segment.query_name, e), file=sys.stderr) - continue - - # softmask the alignment if right primer start/end inside alignment - if segment.reference_end > p2_position: - try: - trim(segment, p2_position, True, args.verbose) - if args.verbose: - print("ref start %s >= primer_position %s" % - (segment.reference_start, p2_position), file=sys.stderr) - except Exception as e: - print("problem soft masking right primer in {} (error: {}), skipping" .format( - segment.query_name, e), file=sys.stderr) - continue - - # normalise if requested - if args.normalise: - pair = "%s-%s-%d" % (p1[2]['Primer_ID'], - p2[2]['Primer_ID'], segment.is_reverse) - counter[pair] += 1 - if counter[pair] > args.normalise: - print("%s dropped as abundance theshold reached" % - (segment.query_name), file=sys.stderr) - continue - - # check the the alignment still contains bases matching the reference - if 'M' not in segment.cigarstring: - print("%s dropped as does not match reference post masking" % - (segment.query_name), file=sys.stderr) + trimming_tuple = handle_segment( + segment=segment, + bed=bed, + args=args, + min_mapq=args.min_mapq, + ) + if not trimming_tuple: continue - # current alignment segment has passed filters, send it to the outfile - outfile.write(segment) + # unpack the trimming tuple since segment passed trimming + amplicon, trimmed_segment = trimming_tuple + trimmed_segments.setdefault(amplicon, []) + + if trimmed_segment: + trimmed_segments[amplicon].append(trimmed_segment) + + # normalise if requested + if args.normalise: + output_segments, mean_amp_depths = normalise( + trimmed_segments, args.normalise, bed + ) + + # write mean amplicon depths to file + if args.amp_depth_report: + with open(args.amp_depth_report, "w") as amp_depth_report_fh: + amp_depth_report_fh.write("amplicon\tmean_depth\n") + for amplicon, depth in mean_amp_depths.items(): + amp_depth_report_fh.write(f"{amplicon}\t{depth}\n") + + for output_segment in output_segments: + outfile.write(output_segment) + else: + for amplicon, segments in trimmed_segments.items(): + for segment in segments: + outfile.write(segment) # close up the file handles infile.close() @@ -276,21 +533,39 @@ def go(args): def main(): - import argparse - parser = argparse.ArgumentParser( - description='Trim alignments from an amplicon scheme.') + description="Trim alignments from an amplicon scheme." 
+ ) + parser.add_argument("bedfile", help="BED file containing the amplicon scheme") + parser.add_argument( + "--normalise", type=int, help="Subsample to n coverage per strand" + ) + parser.add_argument( + "--min-mapq", type=int, default=20, help="Minimum mapping quality to keep" + ) + parser.add_argument( + "--primer-match-threshold", + type=int, + default=5, + help="Fuzzy match primer positions within this threshold", + ) + parser.add_argument("--report", type=str, help="Output report to file") + parser.add_argument( + "--amp-depth-report", type=str, help="Output amplicon depth tsv to path" + ) + parser.add_argument( + "--trim-primers", + action="store_true", + help="Trims primers from reads", + ) parser.add_argument( - 'bedfile', help='BED file containing the amplicon scheme') - parser.add_argument('--normalise', type=int, - help='Subsample to n coverage per strand') - parser.add_argument('--report', type=str, help='Output report to file') - parser.add_argument('--start', action='store_true', - help='Trim to start of primers instead of ends') - parser.add_argument('--no-read-groups', dest='no_read_groups', - action='store_true', help='Do not divide reads into groups in SAM output') - parser.add_argument('--verbose', action='store_true', help='Debug mode') - parser.add_argument('--remove-incorrect-pairs', action='store_true') + "--no-read-groups", + dest="no_read_groups", + action="store_true", + help="Do not divide reads into groups in SAM output", + ) + parser.add_argument("--verbose", action="store_true", help="Debug mode") + parser.add_argument("--remove-incorrect-pairs", action="store_true") args = parser.parse_args() go(args) diff --git a/artic/artic_mqc.py b/artic/artic_mqc.py index 763316fc..08ef102a 100644 --- a/artic/artic_mqc.py +++ b/artic/artic_mqc.py @@ -2,9 +2,8 @@ import json import re import sys -from collections import OrderedDict -from .vcftagprimersites import read_bed_file +from artic.utils import read_bed_file # Alignment_Length_Threshold drops binned reads that are len(REF): - print("Skipping insertion at position: %s" % (record.POS), file=sys.stderr) + print( + "Skipping insertion at position: %s" % (record.POS), file=sys.stderr + ) continue - if depths[record.CHROM][record.POS] >= args.depth and record.QUAL >= args.quality: + if ( + depths[record.CHROM][record.POS] >= args.depth + and record.QUAL >= args.quality + ): if len(REF) > len(ALT): - print("N-masking confident deletion at %s" % (record.POS), file=sys.stderr) + print( + "N-masking confident deletion at %s" % (record.POS), + file=sys.stderr, + ) for n in range(len(REF)): - cons[record.CHROM][record.POS-1+n] = 'N' + cons[record.CHROM][record.POS - 1 + n] = "N" continue reporter.report(record, "variant", ALT) @@ -117,7 +130,7 @@ def go(args): if len(ALT) > len(REF): print("insertion", file=sys.stderr) continue - cons[record.CHROM][record.POS-1] = str(ALT) + cons[record.CHROM][record.POS - 1] = str(ALT) elif len(REF) > len(ALT): continue else: @@ -125,25 +138,33 @@ def go(args): reporter.report(record, "low_depth_variant", "n") else: reporter.report(record, "low_qual_variant", "n") - #cons[record.CHROM][record.POS-1] = 'N' - continue + # cons[record.CHROM][record.POS-1] = 'N' + continue - #print >>sys.stderr, str(sett) + # print >>sys.stderr, str(sett) for k in seqs.keys(): - print(">%s-%s" % (args.bamfile, k)) - print("".join(cons[k])) + print(">%s-%s" % (args.bamfile, k)) + print("".join(cons[k])) + def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--depth', type=int, default=5, 
help='minimum depth to call a variant') - parser.add_argument('--quality', type=int, default=0, help='minimum quality to call a variant') - parser.add_argument('--masked', help='Regions to mask (contig:start-end,contig:start-end)') - parser.add_argument('reference') - parser.add_argument('vcffile') - parser.add_argument('bamfile') - args = parser.parse_args() - go(args) + parser = argparse.ArgumentParser() + parser.add_argument( + "--depth", type=int, default=5, help="minimum depth to call a variant" + ) + parser.add_argument( + "--quality", type=int, default=0, help="minimum quality to call a variant" + ) + parser.add_argument( + "--masked", help="Regions to mask (contig:start-end,contig:start-end)" + ) + parser.add_argument("reference") + parser.add_argument("vcffile") + parser.add_argument("bamfile") + args = parser.parse_args() + go(args) + if __name__ == "__main__": main() diff --git a/artic/deprecated/setup.py b/artic/deprecated/setup.py new file mode 100644 index 00000000..84e1575a --- /dev/null +++ b/artic/deprecated/setup.py @@ -0,0 +1,48 @@ +import os +from setuptools import setup + +version_py = os.path.join(os.path.dirname(__file__), "artic", "version.py") +version = open(version_py).read().strip().split("=")[-1].replace('"', "").strip() +long_description = """ +``artic`` is a pipeline for working with virus sequencing data sequenced with nanopore +""" + +HERE = os.path.dirname(__file__) + +setup( + name="artic", + version=version, + requires=["python (>=3.5)"], + packages=["artic"], + author="Nick Loman", + description="A toolset for working with nanopore sequencing data", + long_description=long_description, + url="https://artic.network/ncov-2019/ncov2019-bioinformatics-sop.html", + package_dir={"artic": "artic"}, + package_data={"artic": []}, + zip_safe=False, + include_package_data=True, + entry_points={ + "console_scripts": [ + "artic=artic.pipeline:main", + "align_trim=artic.align_trim:main", + "align_trim_n=artic.align_trim_n:main", + "margin_cons=artic.margin_cons:main", + "margin_cons_medaka=artic.margin_cons_medaka:main", + "vcfextract=artic.vcfextract:main", + "artic_vcf_merge=artic.vcf_merge:main", + "artic_vcf_filter=artic.vcf_filter:main", + "artic_make_depth_mask=artic.make_depth_mask:main", + "artic_fasta_header=artic.fasta_header:main", + "artic_mask=artic.mask:main", + "artic_get_stats=artic.artic_mqc:main", + ], + }, + author_email="n.j.loman@bham.ac.uk", + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT", + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], +) diff --git a/artic/export.py b/artic/export.py index 77d6c01b..0d32484a 100755 --- a/artic/export.py +++ b/artic/export.py @@ -1,38 +1,60 @@ -#Written by Nick Loman (@pathogenomenick) +# Written by Nick Loman (@pathogenomenick) import os -import sys import pandas as pd from Bio import SeqIO -def run(parser, args): - if not os.path.exists(args.output_directory): - os.mkdir(args.output_directory) - - query_set = set() - - fqfn = "%s/%s.fastq" % (args.output_directory, args.prefix,) - cmd = "samtools view -h -F 4 %s | samtools fastq - -0 %s" % (args.bamfile, fqfn) - print (cmd) - os.system(cmd) - - fn = "%s/%s.reads.list" % (args.output_directory, args.prefix,) - fh = open(fn, "w") - for rec in SeqIO.parse(open(fqfn), "fastq"): - fh.write("%s\n" % (rec.id)) - fh.close() - - cmd = "fast5_subset -r -i \"%s\" -l \"%s/%s.reads.list\" -f \"%s_\" -n 4000 -s \"%s/fast5\"" % ( - args.fast5_directory, args.output_directory, 
args.prefix, args.prefix, args.output_directory) - os.system(cmd) - mappingfn = "%s/fast5/filename_mapping.txt" % (args.output_directory) - mapping_file = pd.read_csv(mappingfn, names=('read_id', 'filename'), sep='\t', index_col='read_id', header=None) - - summary_file = pd.read_csv(args.sequencing_summary, sep='\t', low_memory=False) - filtered_summary_file = summary_file[summary_file.read_id.isin(query_set)].copy() - - filtered_summary_file['filename'] = filtered_summary_file.apply(lambda row: mapping_file.loc[row['read_id']]['filename'] , axis=1) - - summaryfn = "%s/%s_sequencing_summary.txt" % (args.output_directory, args.prefix) - filtered_summary_file.to_csv(summaryfn, sep="\t", index=False) +def run(parser, args): + if not os.path.exists(args.output_directory): + os.mkdir(args.output_directory) + + query_set = set() + + fqfn = "%s/%s.fastq" % ( + args.output_directory, + args.prefix, + ) + cmd = "samtools view -h -F 4 %s | samtools fastq - -0 %s" % (args.bamfile, fqfn) + print(cmd) + os.system(cmd) + + fn = "%s/%s.reads.list" % ( + args.output_directory, + args.prefix, + ) + fh = open(fn, "w") + for rec in SeqIO.parse(open(fqfn), "fastq"): + fh.write("%s\n" % (rec.id)) + fh.close() + + cmd = ( + 'fast5_subset -r -i "%s" -l "%s/%s.reads.list" -f "%s_" -n 4000 -s "%s/fast5"' + % ( + args.fast5_directory, + args.output_directory, + args.prefix, + args.prefix, + args.output_directory, + ) + ) + os.system(cmd) + + mappingfn = "%s/fast5/filename_mapping.txt" % (args.output_directory) + mapping_file = pd.read_csv( + mappingfn, + names=("read_id", "filename"), + sep="\t", + index_col="read_id", + header=None, + ) + + summary_file = pd.read_csv(args.sequencing_summary, sep="\t", low_memory=False) + filtered_summary_file = summary_file[summary_file.read_id.isin(query_set)].copy() + + filtered_summary_file["filename"] = filtered_summary_file.apply( + lambda row: mapping_file.loc[row["read_id"]]["filename"], axis=1 + ) + + summaryfn = "%s/%s_sequencing_summary.txt" % (args.output_directory, args.prefix) + filtered_summary_file.to_csv(summaryfn, sep="\t", index=False) diff --git a/artic/fasta_header.py b/artic/fasta_header.py index 16c89432..b470ae22 100644 --- a/artic/fasta_header.py +++ b/artic/fasta_header.py @@ -1,11 +1,11 @@ from Bio import SeqIO -import sys + def fasta_header(fn, header): fh = open(fn) rec = list(SeqIO.parse(fh, "fasta")) if len(rec) > 1: - print ("Sorry, this script doesn't support multi-FASTA files!") + print("Sorry, this script doesn't support multi-FASTA files!") raise SystemExit fh.close() @@ -14,15 +14,19 @@ def fasta_header(fn, header): SeqIO.write([rec[0]], fh, "fasta") fh.close() + def main(): import argparse - parser = argparse.ArgumentParser(description='Trim alignments from an amplicon scheme.') - parser.add_argument('filename') - parser.add_argument('header') + parser = argparse.ArgumentParser( + description="Trim alignments from an amplicon scheme." 
+ ) + parser.add_argument("filename") + parser.add_argument("header") args = parser.parse_args() fasta_header(args.filename, args.header) + if __name__ == "__main__": main() diff --git a/artic/filter_reads.py b/artic/filter_reads.py index 571d990b..fa39baef 100644 --- a/artic/filter_reads.py +++ b/artic/filter_reads.py @@ -1,31 +1,27 @@ import sys from Bio import SeqIO -import tempfile -import os -import glob -import shutil # extract with constraints: # -- only one group ever # -- only one flowcell ID ever # -- always unique read ID -def run(parser, args): - dups = set() - uniq = 0 - total = 0 - for rec in SeqIO.parse(open(args.filename), "fastq"): - if args.max_length and len(rec) > args.max_length: - continue - if args.min_length and len(rec) < args.min_length: - continue - total += 1 - if rec.id not in dups: - SeqIO.write([rec], sys.stdout, "fastq") +def run(parser, args): + dups = set() + uniq = 0 + total = 0 + for rec in SeqIO.parse(open(args.filename), "fastq"): + if args.max_length and len(rec) > args.max_length: + continue + if args.min_length and len(rec) < args.min_length: + continue - dups.add(rec.id) - uniq += 1 + total += 1 + if rec.id not in dups: + SeqIO.write([rec], sys.stdout, "fastq") - print("%s\t%s\t%s" % (total, uniq), file=sys.stderr) + dups.add(rec.id) + uniq += 1 + print("%s\t%s\t%s" % (total, uniq), file=sys.stderr) diff --git a/artic/guppyplex.py b/artic/guppyplex.py index 7889d52e..d6dbf6f0 100644 --- a/artic/guppyplex.py +++ b/artic/guppyplex.py @@ -1,13 +1,9 @@ import sys from Bio import SeqIO -import tempfile import os -import glob import gzip import fnmatch -import shutil import pandas as pd -from collections import defaultdict from mimetypes import guess_type from functools import partial from math import log10 @@ -24,21 +20,33 @@ def get_read_mean_quality(record): - return -10 * log10((10 ** (pd.Series(record.letter_annotations["phred_quality"]) / -10)).mean()) + return -10 * log10( + (10 ** (pd.Series(record.letter_annotations["phred_quality"]) / -10)).mean() + ) def run(parser, args): files = os.listdir(args.directory) - fastq_files = [os.path.join(args.directory, f) for f in files if fnmatch.fnmatch(f, '*.fastq*') and not f.endswith('.temp')] + fastq_files = [ + os.path.join(args.directory, f) + for f in files + if fnmatch.fnmatch(f, "*.fastq*") and not f.endswith(".temp") + ] if fastq_files: if not args.output: - fastq_outfn = "%s_%s.fastq" % (args.prefix, os.path.basename(args.directory)) + fastq_outfn = "%s_%s.fastq" % ( + args.prefix, + os.path.basename(args.directory), + ) else: fastq_outfn = args.output outfh = open(fastq_outfn, "w") - print("Processing %s files in %s" % (len(fastq_files), args.directory), file=sys.stderr) + print( + "Processing %s files in %s" % (len(fastq_files), args.directory), + file=sys.stderr, + ) dups = set() @@ -51,20 +59,23 @@ def run(parser, args): with _open(fn) as f: try: for rec in SeqIO.parse(f, "fastq"): - if args.max_length and len(rec) > args.max_length: - continue - if args.min_length and len(rec) < args.min_length: - continue - if not args.skip_quality_check and get_read_mean_quality(rec) < args.quality: - continue - if args.sample < 1: - r = random() - if r >= args.sample: - continue + if args.max_length and len(rec) > args.max_length: + continue + if args.min_length and len(rec) < args.min_length: + continue + if ( + not args.skip_quality_check + and get_read_mean_quality(rec) < args.quality + ): + continue + if args.sample < 1: + r = random() + if r >= args.sample: + continue - if rec.id not in dups: - 
SeqIO.write([rec], outfh, "fastq") - dups.add(rec.id) + if rec.id not in dups: + SeqIO.write([rec], outfh, "fastq") + dups.add(rec.id) except ValueError: pass diff --git a/artic/make_depth_mask.py b/artic/make_depth_mask.py index 14bfd1a4..579eca18 100755 --- a/artic/make_depth_mask.py +++ b/artic/make_depth_mask.py @@ -38,13 +38,12 @@ def collect_depths(bamfile, refName, minDepth, ignoreDeletions, warnRGcov): raise Exception("bamfile doesn't exist (%s)" % bamfile) # open the BAM file - bamFile = pysam.AlignmentFile(bamfile, 'rb') + bamFile = pysam.AlignmentFile(bamfile, "rb") # get the TID for the reference tid = bamFile.get_tid(refName) if tid == -1: - raise Exception( - "bamfile does not contain specified reference (%s)" % refName) + raise Exception("bamfile does not contain specified reference (%s)" % refName) # create a depth vector to hold the depths at each reference position depths = [0] * bamFile.get_reference_length(refName) @@ -53,10 +52,10 @@ def collect_depths(bamfile, refName, minDepth, ignoreDeletions, warnRGcov): rgDepths = {} # get the read groups and init the depth vectors - for rg in bamFile.header['RG']: - if rg['ID'] == 'unmatched': + for rg in bamFile.header["RG"]: + if rg["ID"] == "unmatched": continue - rgDepths[rg['ID']] = [0] * bamFile.get_reference_length(refName) + rgDepths[rg["ID"]] = [0] * bamFile.get_reference_length(refName) # flag to state if BAM file has low readgroup coverage lowRGcov = False @@ -65,13 +64,15 @@ def collect_depths(bamfile, refName, minDepth, ignoreDeletions, warnRGcov): lowRGvec = [] # generate the pileup - for pileupcolumn in bamFile.pileup(refName, max_depth=10000, truncate=False, min_base_quality=0): + for pileupcolumn in bamFile.pileup( + refName, start=0, stop=bamFile.get_reference_length(refName), max_depth=10000, truncate=False, min_base_quality=0 + ): # process the pileup column for pileupread in pileupcolumn.pileups: # get the read group for this pileup read and check it's in the BAM header - rg = pileupread.alignment.get_tag('RG') + rg = pileupread.alignment.get_tag("RG") assert rg in rgDepths, "alignment readgroup not in BAM header: %s" % rg # process the pileup read @@ -112,9 +113,12 @@ def collect_depths(bamfile, refName, minDepth, ignoreDeletions, warnRGcov): # get the regions and print warning regions = list(intervals_extract(lowRGvec)) sys.stderr.write( - "alignment has unmasked regions where individual readgroup depth < {}: {}\n" .format(minDepth, bamfile)) + "alignment has unmasked regions where individual readgroup depth < {}: {}\n".format( + minDepth, bamfile + ) + ) for region in regions: - sys.stderr.write("region: %s\n" % str(region).strip('[]')) + sys.stderr.write("region: %s\n" % str(region).strip("[]")) # close file and return depth vector bamFile.close() @@ -138,8 +142,13 @@ def go(args): # collect the depths from the pileup, replacing any depth len(REF): - print("Skipping insertion at position: %s" % (record.POS), file=sys.stderr) + print( + "Skipping insertion at position: %s" % (record.POS), file=sys.stderr + ) continue if qual >= 200 and total_reads >= 20: if len(REF) > len(ALT): - print("N-masking confident deletion at %s" % (record.POS), file=sys.stderr) + print( + "N-masking confident deletion at %s" % (record.POS), + file=sys.stderr, + ) for n in range(len(REF)): - cons[record.POS-1+n] = 'N' + cons[record.POS - 1 + n] = "N" continue reporter.report(record, "variant", ALT) @@ -107,26 +123,29 @@ def go(args): print("insertion", file=sys.stderr) continue - cons[record.POS-1] = str(ALT) + cons[record.POS - 
1] = str(ALT) elif len(REF) > len(ALT): continue else: reporter.report(record, "low_qual_variant", "n") - cons[record.POS-1] = 'N' - continue + cons[record.POS - 1] = "N" + continue print(">%s" % (sys.argv[3])) print("".join(cons)) def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--depth', type=int, default=20, help='minimum depth to call a variant') - parser.add_argument('reference') - parser.add_argument('vcffile') - parser.add_argument('bamfile') - args = parser.parse_args() - go(args) + parser = argparse.ArgumentParser() + parser.add_argument( + "--depth", type=int, default=20, help="minimum depth to call a variant" + ) + parser.add_argument("reference") + parser.add_argument("vcffile") + parser.add_argument("bamfile") + args = parser.parse_args() + go(args) + if __name__ == "__main__": main() diff --git a/artic/mask.py b/artic/mask.py index e00c0f6e..97134715 100755 --- a/artic/mask.py +++ b/artic/mask.py @@ -1,25 +1,25 @@ #!/usr/bin/env python from Bio import SeqIO -import sys -import vcf -import subprocess -from collections import defaultdict -import os.path -import operator -from .vcftagprimersites import read_bed_file +from cyvcf2 import VCF import argparse import pandas as pd + def read_3col_bed(fn): # read the primer scheme into a pandas dataframe and run type, length and null checks - bedfile = pd.read_csv(fn, sep='\t', header=None, - names=['chrom', 'start', 'end'], - dtype={'chrom': str, 'start': int, 'end': int}, - usecols=(0, 1, 2), - skiprows=0) + bedfile = pd.read_csv( + fn, + sep="\t", + header=None, + names=["chrom", "start", "end"], + dtype={"chrom": str, "start": int, "end": int}, + usecols=(0, 1, 2), + skiprows=0, + ) return bedfile + def go(args): seqs = dict([(rec.id, rec) for rec in SeqIO.parse(open(args.reference), "fasta")]) cons = {} @@ -28,30 +28,30 @@ def go(args): bedfile = read_3col_bed(args.maskfile) for _, region in bedfile.iterrows(): - for n in range(region['start'], region['end']): - cons[region['chrom']][n] = 'N' + for n in range(region["start"], region["end"]): + cons[region["chrom"]][n] = "N" - sett = set() - vcf_reader = vcf.Reader(open(args.maskvcf, 'r')) + vcf_reader = VCF(args.maskvcf) for record in vcf_reader: for n in range(0, len(record.REF)): - cons[record.CHROM][record.POS-1+n] = 'N' + cons[record.CHROM][record.POS - 1 + n] = "N" - fh = open(args.output, 'w') + fh = open(args.output, "w") for k in seqs.keys(): - fh.write(">%s\n" % (k)) - fh.write(("".join(cons[k]))+'\n') + fh.write(">%s\n" % (k)) + fh.write(("".join(cons[k])) + "\n") fh.close() def main(): parser = argparse.ArgumentParser() - parser.add_argument('reference') - parser.add_argument('maskfile') - parser.add_argument('maskvcf') - parser.add_argument('output') + parser.add_argument("reference") + parser.add_argument("maskfile") + parser.add_argument("maskvcf") + parser.add_argument("output") args = parser.parse_args() go(args) + if __name__ == "__main__": main() diff --git a/artic/minion.py b/artic/minion.py index 017549f7..60e065e1 100755 --- a/artic/minion.py +++ b/artic/minion.py @@ -1,161 +1,80 @@ -#Written by Nick Loman (@pathogenomenick) +# Written by Nick Loman (@pathogenomenick) -from clint.textui import colored, puts, indent -from Bio import SeqIO +from clint.textui import colored import os import sys import time -import requests -import hashlib -from .vcftagprimersites import read_bed_file - -def get_nanopolish_header(ref): - """Checks the reference sequence for a single fasta entry. 
- - Parameters - ---------- - ref : str - The fasta file containing the reference sequence - - Returns - ------- - str - A formatted header string for nanopolish - """ - recs = list(SeqIO.parse(open(ref), "fasta")) - if len (recs) != 1: - print("FASTA has more than one sequence", file=sys.stderr) - raise SystemExit(1) - return "%s:%d-%d" % (recs[0].id, 1, len(recs[0])+1) +from artic.utils import read_bed_file, get_scheme + + +def run(parser, args): -def check_scheme_hashes(filepath, manifest_hash): - with open(filepath, "rb") as fh: - data = fh.read() - hash_sha256 = hashlib.sha256(data).hexdigest() - if hash_sha256 != manifest_hash: + if any([args.bed, args.ref]) and any( + [ + args.scheme_name, + args.scheme_version, + args.scheme_length, + ] + ): print( - colored.yellow( - f"sha256 hash for {str(filepath)} does not match manifest" + colored.red( + "You cannot mix 'Remote Scheme Options' with 'Local Scheme Options', for more information run 'artic minion --help'" ), file=sys.stderr, ) raise SystemExit(1) -def get_scheme(scheme_name, scheme_directory, scheme_version="1"): - """Get and check the ARTIC primer scheme. - When determining a version, the original behaviour (parsing the scheme_name and - separating on /V ) is used over a specified scheme_version. If neither are - provided, the version defaults to 1. - If 0 is provided as version, the latest scheme will be downloaded. - - Parameters - ---------- - scheme_name : str - The primer scheme name - scheme_directory : str - The directory containing the primer scheme and reference sequence - scheme_version : str - The primer scheme version (optional) - Returns - ------- - str - The location of the checked primer scheme - str - The location of the checked reference sequence - str - The version being used - """ - # try getting the version from the scheme name (old behaviour) - if scheme_name.find('/V') != -1: - scheme_name, scheme_version = scheme_name.split('/V') - - # create the filenames and check they exist - bed = "%s/%s/V%s/%s.scheme.bed" % (scheme_directory, scheme_name, scheme_version, scheme_name) - ref = "%s/%s/V%s/%s.reference.fasta" % (scheme_directory, scheme_name, scheme_version, scheme_name) - if os.path.exists(bed) and os.path.exists(ref): - return bed, ref, scheme_version - - # if they don't exist, try downloading them to the current directory - print( - colored.yellow( - "could not find primer scheme and reference sequence, downloading" - ), - file=sys.stderr, - ) - - try: - manifest = requests.get("https://raw.githubusercontent.com/artic-network/primer-schemes/master/schemes_manifest.json").json() - except requests.exceptions.RequestException as error: - print("Manifest Exception:", error) - raise SystemExit(2) - - for scheme, scheme_contents in dict(manifest["schemes"]).items(): - if scheme == scheme_name.lower() or scheme_name.lower() in scheme_contents["aliases"]: - print( - colored.yellow( - f"\tfound requested scheme:\t{scheme} (using alias {scheme_name})" - ), - file=sys.stderr, - ) - if scheme_version == 0: - print( - colored.yellow( - f"Latest version for scheme {scheme} is -> {scheme_contents['latest_version']}" - ), - file=sys.stderr, - ) - scheme_version = scheme_contents["latest_version"] - elif scheme_version not in dict(scheme_contents["primer_urls"]).keys(): - print( - colored.yellow( - f"Requested scheme version {scheme_version} not found; using latest version ({scheme_contents['latest_version']}) instead" - ), - file=sys.stderr, - ) - scheme_version = scheme_contents["latest_version"] - bed = 
"%s/%s/V%s/%s.scheme.bed" % (scheme_directory, scheme_name, scheme_version, scheme_name) - ref = "%s/%s/V%s/%s.reference.fasta" % (scheme_directory, scheme_name, scheme_version, scheme_name) - - - os.makedirs(os.path.dirname(bed), exist_ok=True) - with requests.get(scheme_contents["primer_urls"][scheme_version]) as fh: - open(bed, 'wt').write(fh.text) - - os.makedirs(os.path.dirname(ref), exist_ok=True) - with requests.get(scheme_contents["reference_urls"][scheme_version]) as fh: - open(ref, 'wt').write(fh.text) - - check_scheme_hashes(bed, scheme_contents["primer_sha256_checksums"][scheme_version]) - check_scheme_hashes(ref, scheme_contents["reference_sha256_checksums"][scheme_version]) - - return bed, ref, scheme_version - - print( - colored.yellow( - f"\tRequested scheme:\t{scheme_name} could not be found, exiting" - ), - file=sys.stderr, - ) - raise SystemExit(1) + if any([args.bed, args.ref]) and not all([args.bed, args.ref]): + print( + colored.red( + "You must provide both a BED file and a reference sequence for local scheme options" + ), + file=sys.stderr, + ) + raise SystemExit(1) -def run(parser, args): + if any([args.scheme_name, args.scheme_version, args.scheme_length]) and not all( + [args.scheme_name, args.scheme_version] + ): + print( + colored.red( + "You must provide a scheme name and a scheme version at minimum for remote scheme options" + ), + file=sys.stderr, + ) + raise SystemExit(1) - # check for medaka-model - if args.medaka and (args.medaka_model is None): - print(colored.red('Must specify --medaka-model if using the --medaka workflow.')) + # check for model + if not args.model: + print( + colored.red("Must specify --model for clair3 or medaka variant calling"), + ) raise SystemExit(1) - # 1) check the parameters and set up the filenames + # check the parameters and set up the filenames ## find the primer scheme, reference sequence and confirm scheme version - bed, ref, _ = get_scheme(args.scheme, args.scheme_directory, args.scheme_version) - ## if in strict mode, validate the primer scheme - if args.strict: - checkScheme = "artic-tools validate_scheme %s" % (bed) - print(colored.green("Running: "), checkScheme, file=sys.stderr) - if (os.system(checkScheme) != 0): - print(colored.red("primer scheme failed strict checking"), file=sys.stderr) - raise SystemExit(1) + if args.bed and args.ref: + bed = args.bed + ref = args.ref + else: + bed, ref, _ = get_scheme( + scheme_name=args.scheme_name, + scheme_version=args.scheme_version, + scheme_directory=args.scheme_directory, + scheme_length=args.scheme_length, + ) + + if not os.path.exists(bed) or not os.path.exists(ref): + print( + colored.red( + "Failed to find primer scheme or reference sequence: {} {}".format( + bed, ref + ) + ), + file=sys.stderr, + ) + raise SystemExit(1) ## set up the read file if args.read_file: @@ -163,135 +82,197 @@ def run(parser, args): else: read_file = "%s.fasta" % (args.sample) if not os.path.exists(read_file): - print(colored.red("failed to find read-file: {}" .format(read_file)), file=sys.stderr) + print( + colored.red("failed to find read-file: {}".format(read_file)), + file=sys.stderr, + ) raise SystemExit(1) ## collect the primer pools - pools = set([row['PoolName'] for row in read_bed_file(bed)]) + pools = set([row["PoolName"] for row in read_bed_file(bed)]) ## create a holder to keep the pipeline commands in cmds = [] - # 2) if using nanopolish, set up the reference header and run the nanopolish indexing - nanopolish_header = get_nanopolish_header(ref) - if not args.medaka and not 
args.skip_nanopolish: - if not args.fast5_directory or not args.sequencing_summary: - print(colored.red('Must specify FAST5 directory and sequencing summary for nanopolish mode.')) - raise SystemExit(1) - cmds.append("nanopolish index -s %s -d %s %s" % (args.sequencing_summary, args.fast5_directory, args.read_file,)) - - # 3) index the ref & align with minimap or bwa - if not args.bwa: - cmds.append("minimap2 -a -x map-ont -t %s %s %s | samtools view -bS -F 4 - | samtools sort -o %s.sorted.bam -" % (args.threads, ref, read_file, args.sample)) - else: - cmds.append("bwa index %s" % (ref,)) - cmds.append("bwa mem -t %s -x ont2d %s %s | samtools view -bS -F 4 - | samtools sort -o %s.sorted.bam -" % (args.threads, ref, read_file, args.sample)) - cmds.append("samtools index %s.sorted.bam" % (args.sample,)) + # 2) check the reference fasta has an index and create one if not + if not os.path.exists("%s.fai" % (ref)) and args.clair3: + cmds.append("samtools faidx %s" % (ref)) + + # 3) index the ref & align with minimap + cmds.append( + f"minimap2 -a -x map-ont -t {args.threads} {ref} {read_file} | samtools view -bS -F 4 - | samtools sort -o {args.sample}.sorted.bam -" + ) + + cmds.append(f"samtools index {args.sample}.sorted.bam") # 4) trim the alignments to the primer start sites and normalise the coverage to save time if args.normalise: - normalise_string = '--normalise %d' % (args.normalise) + normalise_string = f"--normalise {args.normalise}" else: - normalise_string = '' - cmds.append("align_trim %s %s --start --remove-incorrect-pairs --report %s.alignreport.txt < %s.sorted.bam 2> %s.alignreport.er | samtools sort -T %s - -o %s.trimmed.rg.sorted.bam" % (normalise_string, bed, args.sample, args.sample, args.sample, args.sample, args.sample)) - cmds.append("align_trim %s %s --remove-incorrect-pairs --report %s.alignreport.txt < %s.sorted.bam 2> %s.alignreport.er | samtools sort -T %s - -o %s.primertrimmed.rg.sorted.bam" % (normalise_string, bed, args.sample, args.sample, args.sample, args.sample, args.sample)) - cmds.append("samtools index %s.trimmed.rg.sorted.bam" % (args.sample)) - cmds.append("samtools index %s.primertrimmed.rg.sorted.bam" % (args.sample)) - - # 6) do variant calling on each read group, either using the medaka or nanopolish workflow - if args.medaka: - for p in pools: - if os.path.exists("%s.%s.hdf" % (args.sample, p)): - os.remove("%s.%s.hdf" % (args.sample, p)) - cmds.append("medaka consensus --model %s --threads %s --chunk_len 800 --chunk_ovlp 400 --RG %s %s.trimmed.rg.sorted.bam %s.%s.hdf" % (args.medaka_model, args.threads, p, args.sample, args.sample, p)) - if args.no_indels: - cmds.append("medaka snp %s %s.%s.hdf %s.%s.vcf" % (ref, args.sample, p, args.sample, p)) + normalise_string = "" + + cmds.append( + f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --remove-incorrect-pairs --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam 2> {args.sample}.alignreport.er | samtools sort -T {args.sample} - -o {args.sample}.trimmed.rg.sorted.bam" + ) + + cmds.append( + f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} --remove-incorrect-pairs --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam 2> {args.sample}.alignreport.er | samtools sort -T {args.sample} - -o {args.sample}.primertrimmed.rg.sorted.bam" + ) + + cmds.append(f"samtools index 
{args.sample}.trimmed.rg.sorted.bam") + cmds.append(f"samtools index {args.sample}.primertrimmed.rg.sorted.bam") + + # 6) do variant calling on each read group + for p in pools: + if os.path.exists("%s.%s.hdf" % (args.sample, p)): + os.remove("%s.%s.hdf" % (args.sample, p)) + + if args.clair3: + # Use a specific model path if provided else use the default conda path + if args.model_path: + model_path = f"'{args.model_path}/{args.model}'" + else: - cmds.append("medaka variant %s %s.%s.hdf %s.%s.vcf" % (ref, args.sample, p, args.sample, p)) - - ## if not using longshot, annotate VCF with read depth info etc. so we can filter it - if args.no_longshot: - cmds.append("medaka tools annotate --pad 25 --RG %s %s.%s.vcf %s %s.trimmed.rg.sorted.bam tmp.medaka-annotate.vcf" % (p, args.sample, p, ref, args.sample)) - cmds.append("mv tmp.medaka-annotate.vcf %s.%s.vcf" % (args.sample, p)) + model_path = f"'{os.getenv('CONDA_PREFIX')}/bin/models/{args.model}'" - else: - if not args.skip_nanopolish: - indexed_nanopolish_file = read_file + # Split the BAM by read group + for p in pools: + cmds.append( + f"samtools view -b -r {p} {args.sample}.trimmed.rg.sorted.bam -o {args.sample}.{p}.trimmed.rg.sorted.bam" + ) + + cmds.append(f"samtools index {args.sample}.{p}.trimmed.rg.sorted.bam") + + cmds.append( + f"run_clair3.sh --chunk_size=10000 --no_phasing_for_fa --bam_fn='{args.sample}.{p}.trimmed.rg.sorted.bam' --ref_fn='{ref}' --output='{args.sample}_rg_{p}' --threads='{args.threads}' --platform='ont' --model_path={model_path} --include_all_ctgs" + ) + + cmds.append( + f"bgzip -dc {args.sample}_rg_{p}/merge_output.vcf.gz > {args.sample}.{p}.vcf" + ) + + else: + cmds.append( + f"medaka consensus --model {args.model} --threads {args.threads} --chunk_len 800 --chunk_ovlp 400 --RG {p} {args.sample}.trimmed.rg.sorted.bam {args.sample}.{p}.hdf" + ) if args.no_indels: - nanopolish_extra_args = " --snps" + cmds.append( + "medaka snp %s %s.%s.hdf %s.%s.vcf" + % (ref, args.sample, p, args.sample, p) + ) else: - nanopolish_extra_args = "" - for p in pools: - cmds.append("nanopolish variants --min-flanking-sequence 10 -x %s --progress -t %s --reads %s -o %s.%s.vcf -b %s.trimmed.rg.sorted.bam -g %s -w \"%s\" --ploidy 1 -m 0.15 --read-group %s %s" % (args.max_haplotypes, args.threads, indexed_nanopolish_file, args.sample, p, args.sample, ref, nanopolish_header, p, nanopolish_extra_args)) + cmds.append( + "medaka variant %s %s.%s.hdf %s.%s.vcf" + % (ref, args.sample, p, args.sample, p) + ) + + if args.no_longshot: + cmds.append( + f"medaka tools annotate --pad 25 --RG {p} {args.sample}.{p}.vcf {ref} {args.sample}.primertrimmed.rg.sorted.bam tmp.medaka-annotate.vcf" + ) + cmds.append(f"mv tmp.medaka-annotate.vcf {args.sample}.{p}.vcf") # 7) merge the called variants for each read group - merge_vcf_cmd = "artic_vcf_merge %s %s 2> %s.primersitereport.txt" % (args.sample, bed, args.sample) + merge_vcf_cmd = "artic_vcf_merge %s %s 2> %s.primersitereport.txt" % ( + args.sample, + bed, + args.sample, + ) for p in pools: merge_vcf_cmd += " %s:%s.%s.vcf" % (p, args.sample, p) + cmds.append(merge_vcf_cmd) # 8) check and filter the VCFs ## if using strict, run the vcf checker to remove vars present only once in overlap regions (this replaces the original merged vcf from the previous step) - if args.strict: - cmds.append("bgzip -f %s.merged.vcf" % (args.sample)) - cmds.append("tabix -p vcf %s.merged.vcf.gz" % (args.sample)) - cmds.append("artic-tools check_vcf --dropPrimerVars --dropOverlapFails --vcfOut %s.merged.filtered.vcf 
%s.merged.vcf.gz %s 2> %s.vcfreport.txt" % (args.sample, args.sample, bed, args.sample)) - cmds.append("mv %s.merged.filtered.vcf %s.merged.vcf" % (args.sample, args.sample)) - - ## if doing the medaka workflow and longshot required, do it on the merged VCF - if args.medaka and not args.no_longshot: - cmds.append("bgzip -f %s.merged.vcf" % (args.sample)) - cmds.append("tabix -f -p vcf %s.merged.vcf.gz" % (args.sample)) - cmds.append("longshot -P 0 -F -A --no_haps --bam %s.primertrimmed.rg.sorted.bam --ref %s --out %s.merged.vcf --potential_variants %s.merged.vcf.gz" % (args.sample, ref, args.sample, args.sample)) - - ## set up some name holder vars for ease - if args.medaka: - method = 'medaka' - else: - method = 'nanopolish' - vcf_file = "%s.pass.vcf" % (args.sample) + # if args.strict: + # cmds.append("bgzip -f %s.merged.vcf" % (args.sample)) + # cmds.append("tabix -p vcf %s.merged.vcf.gz" % (args.sample)) + # cmds.append( + # "artic-tools check_vcf --dropPrimerVars --dropOverlapFails --vcfOut %s.merged.filtered.vcf %s.merged.vcf.gz %s 2> %s.vcfreport.txt" + # % (args.sample, args.sample, bed, args.sample) + # ) + # cmds.append( + # "mv %s.merged.filtered.vcf %s.merged.vcf" % (args.sample, args.sample) + # ) + pre_filter_vcf = f"{args.sample}.merged.vcf" + cmds.append(f"bgzip -kf {pre_filter_vcf}") + cmds.append(f"tabix -f -p vcf {pre_filter_vcf}.gz") + + # if doing the medaka workflow and longshot required, do it on the merged VCF + if not args.no_longshot: + pre_filter_vcf = f"{args.sample}.longshot.vcf" + cmds.append( + f"longshot --min_mapq {args.min_mapq} -P 0 -F --max_cov 200000 --no_haps --bam {args.sample}.primertrimmed.rg.sorted.bam --ref {ref} --out {pre_filter_vcf} --potential_variants {args.sample}.merged.vcf.gz" + ) + cmds.append(f"bgzip -kf {pre_filter_vcf}") + cmds.append(f"tabix -f -p vcf {pre_filter_vcf}.gz") ## filter the variants to produce PASS and FAIL lists, then index them - if args.no_frameshifts and not args.no_indels: - cmds.append("artic_vcf_filter --%s --no-frameshifts %s.merged.vcf %s.pass.vcf %s.fail.vcf" % (method, args.sample, args.sample, args.sample)) - else: - cmds.append("artic_vcf_filter --%s %s.merged.vcf %s.pass.vcf %s.fail.vcf" % (method, args.sample, args.sample, args.sample)) - cmds.append("bgzip -f %s" % (vcf_file)) - cmds.append("tabix -p vcf %s.gz" % (vcf_file)) + fs_str = "--no-frameshifts" if args.no_frameshifts else "" + indel_str = "--no-indels" if args.no_indels else "" + cmds.append( + f"artic_vcf_filter {fs_str} {indel_str} {pre_filter_vcf}.gz {args.sample}.pass.vcf {args.sample}.fail.vcf" + ) # 9) get the depth of coverage for each readgroup, create a coverage mask and plots, and add failed variants to the coverage mask (artic_mask must be run before bcftools consensus) - cmds.append("artic_make_depth_mask --store-rg-depths %s %s.primertrimmed.rg.sorted.bam %s.coverage_mask.txt" % (ref, args.sample, args.sample)) - cmds.append("artic_mask %s %s.coverage_mask.txt %s.fail.vcf %s.preconsensus.fasta" % (ref, args.sample, args.sample, args.sample)) + cmds.append( + f"artic_make_depth_mask --store-rg-depths {ref} {args.sample}.primertrimmed.rg.sorted.bam {args.sample}.coverage_mask.txt" + ) + + cmds.append( + f"artic_mask {ref} {args.sample}.coverage_mask.txt {args.sample}.fail.vcf {args.sample}.preconsensus.fasta" + ) + + post_filter_vcf_file = f"{args.sample}.pass.vcf" # 10) generate the consensus sequence - cmds.append("bcftools consensus -f %s.preconsensus.fasta %s.gz -m %s.coverage_mask.txt -o %s.consensus.fasta" % (args.sample, 
vcf_file, args.sample, args.sample)) + cmds.append(f"bgzip -kf {post_filter_vcf_file}") + cmds.append(f"tabix -f -p vcf {post_filter_vcf_file}.gz") + + # Normalise variants in the pass/fail VCF files + post_normalisation_vcf_file = f"{args.sample}.normalised.vcf" + cmds.append( + f"bcftools norm --check-ref x --fasta-ref {args.sample}.preconsensus.fasta -O z -o {post_normalisation_vcf_file} {post_filter_vcf_file}.gz" + ) + cmds.append(f"bgzip -kf {post_normalisation_vcf_file}") + cmds.append(f"tabix -f -p vcf {post_normalisation_vcf_file}.gz") + cmds.append( + f"bcftools consensus -f {args.sample}.preconsensus.fasta {post_normalisation_vcf_file}.gz -m {args.sample}.coverage_mask.txt -o {args.sample}.consensus.fasta" + ) # 11) apply the header to the consensus sequence and run alignment against the reference sequence - fasta_header = "%s/ARTIC/%s" % (args.sample, method) - cmds.append("artic_fasta_header %s.consensus.fasta \"%s\"" % (args.sample, fasta_header)) - cmds.append("cat %s.consensus.fasta %s > %s.muscle.in.fasta" % (args.sample, ref, args.sample)) - cmds.append("muscle -in %s.muscle.in.fasta -out %s.muscle.out.fasta" % (args.sample, args.sample)) + fasta_header = "%s/ARTIC/medaka" % (args.sample) + cmds.append( + 'artic_fasta_header %s.consensus.fasta "%s"' % (args.sample, fasta_header) + ) - # 12) get some QC stats - if args.strict: - cmds.append("artic_get_stats --scheme {} --align-report {}.alignreport.txt --vcf-report {}.vcfreport.txt {}" .format(bed, args.sample, args.sample, args.sample)) + if args.use_muscle: + cmds.append( + "cat %s.consensus.fasta %s > %s.muscle.in.fasta" + % (args.sample, ref, args.sample) + ) + cmds.append( + "muscle -in %s.muscle.in.fasta -out %s.muscle.out.fasta" + % (args.sample, args.sample) + ) # 13) setup the log file and run the pipeline commands log = "%s.minion.log.txt" % (args.sample) - logfh = open(log, 'w') + logfh = open(log, "w") for cmd in cmds: print(colored.green("Running: ") + cmd, file=sys.stderr) if not args.dry_run: timerStart = time.perf_counter() retval = os.system(cmd) if retval != 0: - print(colored.red('Command failed:' ) + cmd, file=sys.stderr) + print(colored.red("Command failed:") + cmd, file=sys.stderr) raise SystemExit(20) timerStop = time.perf_counter() ## print the executed command and the runtime to the log file - print("{}\t{}" .format(cmd, timerStop-timerStart), file=logfh) - + print("{}\t{}".format(cmd, timerStop - timerStart), file=logfh) + ## if it's a dry run, print just the command else: print(cmd, file=logfh) diff --git a/artic/minion_validator.py b/artic/minion_validator.py deleted file mode 100644 index f43cd949..00000000 --- a/artic/minion_validator.py +++ /dev/null @@ -1,331 +0,0 @@ -"""minion_validator.py runs the functional tests of the minion pipeline, for both nanopolish and medaka workflows. - -For each test dataset, it will make the following checks for each workflow: - - * check for existence of the final consensus sequence - * check the final consensus matches the expected consensus - * check for existence of the final VCF file (generated by artic_vcf_filter) - * check all expected variants are reported - * check no unexpected variants are reported - * if there is an expected deletion, the consensus sequence will be checked for it's absence - -NOTE: - - * only a basic grep is used for checking the deletion in the consensus - test will fail if deletion sequence is present multiple times (I should improve this part of the test...) 
- -""" -from Bio import SeqIO -from tqdm import tqdm -import argparse -import errno -import glob -import os -import pathlib -import pytest -import requests -import sys -import tarfile -import vcf - - -from . import pipeline - -# help pytest resolve where test data is kept -dataDir = str(pathlib.Path(__file__).parent.parent) + "/test-data/" - -# testData is a lookup of sampleIDs to download urls -testData = { - "MT007544": "https://raw.githubusercontent.com/artic-network/fieldbioinformatics/master/test-data/MT007544/MT007544.fastq", - "CVR1": "https://artic.s3.climb.ac.uk/validation-sets/CVR1.tgz", - "NRW01": "http://artic.s3.climb.ac.uk/validation-teams/NRW01.tgz", - "SP1": "http://artic.s3.climb.ac.uk/validation-teams/SP1.tgz" -} - -# refConsensuses is a nested dict of sample IDs and their expected consensus sequences from the nanopolish workflow -refConsensuses = { - "CVR1": dataDir + "consensus-sequences/CVR1.consensus.fasta", - "NRW01": dataDir + "consensus-sequences/NRW01.consensus.fasta", - "SP1": dataDir + "consensus-sequences/SP1.consensus.fasta" -} - -# refMedakaConsensuses is a nested dict of sample IDs and their expected consensus sequences from the medaka workflow -refMedakaConsensuses = { - "MT007544": dataDir + "consensus-sequences/MT007544.consensus.medaka.fasta", - "CVR1": dataDir + "consensus-sequences/CVR1.consensus.medaka.fasta", - "NRW01": dataDir + "consensus-sequences/NRW01.consensus.medaka.fasta", - "SP1": dataDir + "consensus-sequences/SP1.consensus.medaka.fasta" -} - -# nanopolishTestVariants is a nested dict of sample IDs and their expected variants when using the nanopolish workflow -nanopolishTestVariants = { - "CVR1": - { - # pos: (ref, alt, type, count) - 241: ['C', 'T', "snp", 1], - 3037: ['C', 'T', "snp", 1], - 12733: ['C', 'T', "snp", 2], # count 2 indicates this is reported twice in the VCF (due to pool groups) - 14408: ['C', 'T', "snp", 1], - 23403: ['A', 'G', "snp", 1], - 27752: ['C', 'T', "snp", 1], - 28881: ['G', 'A', "snp", 1], - 28882: ['G', 'A', "snp", 1], - 28883: ['G', 'C', "snp", 1] - }, - "NRW01": - { - 1440: ['G', 'A', "snp", 1], - 2891: ['G', 'A', "snp", 1], - 4655: ['C', 'T', "snp", 1], - 8422: ['G', 'A', "snp", 1], - 22323: ['C', 'T', "snp", 2], - 29546: ['C', 'A', "snp", 2] - }, - "SP1": - { - 241: ['C', 'T', "snp", 1], - 3037: ['C', 'T', "snp", 1], - 14408: ['C', 'T', "snp", 1], - 23403: ['A', 'G', "snp", 1] - }, -} - -# medakaTestVariants is a nested dict of sample IDs and their expected variants when using the medaka workflow -medakaTestVariants = { - "MT007544": - { - # pos: (ref, alt, type, count) - 29749: ["ACGATCGAGTG", 'A', "del", 1] - }, - "CVR1": - { - 241: ['C', 'T', "snp", 1], - 3037: ['C', 'T', "snp", 1], - 12733: ['C', 'T', "snp", 2], - 14408: ['C', 'T', "snp", 1], - 23403: ['A', 'G', "snp", 1], - 27752: ['C', 'T', "snp", 1], - 28881: ['GGG', 'AAC', "indel", 1], - }, - "NRW01": - { - 1440: ['G', 'A', "snp", 1], - 2891: ['G', 'A', "snp", 1], - 4655: ['C', 'T', "snp", 1], - 8422: ['G', 'A', "snp", 1], - 22323: ['C', 'T', "snp", 2], - 29546: ['C', 'A', "snp", 2] - }, - "SP1": - { - 241: ['C', 'T', "snp", 1], - 3037: ['C', 'T', "snp", 1], - 14408: ['C', 'T', "snp", 1], - 23403: ['A', 'G', "snp", 1] - }, -} - -# extraFlags is a way to add extra sample-specific commands to the validation test cmd -extraFlags = { - "medaka": - { - "SP1": ["--no-frameshifts"], - }, - "nanopolish": - { - }, -} - -# dataChecker will run before the tests and download all the test datasets if not present -@pytest.fixture(scope="session", autouse=True) 
-def dataChecker(numValidations): - print("checking for validation datasets...") - for sampleID, url in testData.items(): - targetPath = dataDir + sampleID - if os.path.exists(targetPath) == False: - print("\tno data for {}" .format(sampleID)) - print("\tmaking dir at {}" .format(targetPath)) - os.mkdir(targetPath) - print("\tdownloading from {}" .format(url)) - try: - download(url, dataDir, sampleID) - except Exception as e: - print("download failed: ", e) - sys.exit(1) - else: - print("\tfound data dir for {}" .format(sampleID)) - print("validation datasets ready\n") - -# download will download and untar a test dataset -def download(url, dataDir, sampleID): - filename = url.rsplit('/', 1)[1] - with open(f'{dataDir}/{filename}', 'wb+') as f: - response = requests.get(url, stream=True) - total = int(response.headers.get('content-length')) - if total is None: - f.write(response.content) - else: - with tqdm(total=total, unit='B', unit_scale=True, desc=filename) as pbar: - for data in tqdm(response.iter_content(chunk_size=1024)): - f.write(data) - pbar.update(1024) - - tar = tarfile.open(dataDir + "/" + filename, "r:gz") - tar.extractall(dataDir) - tar.close() - os.remove(dataDir + "/" + filename) - -# genCommand will create the minion command for the requested workflow (nanopolish or medaka) -def genCommand(sampleID, workflow): - cmd = [ - "minion", - "--threads", - "2", - "--read-file", - dataDir + sampleID + "/" + sampleID + ".fastq", - "--scheme-directory", - dataDir + "primer-schemes", - "--fast5-directory", - dataDir + sampleID + "/fast5", - "--sequencing-summary", - dataDir + sampleID + "/" + sampleID + "_sequencing_summary.txt", - ] - if workflow=="medaka": - cmd.append("--medaka") - cmd.append("--medaka-model") - cmd.append("r941_min_high_g351") - if sampleID in extraFlags[workflow]: - for flag in extraFlags[workflow][sampleID]: - cmd.append(flag) - cmd.append("nCoV-2019/V1") - cmd.append(sampleID) - return cmd - -# cleanUp will remove all files generated by the pipeline for a given sampleID -def cleanUp(sampleID): - fileList = glob.glob("{}*" .format(sampleID)) - for filePath in fileList: - try: - os.remove(filePath) - except: - print("Error while deleting file : ", filePath) - -# checkConsensus will return 1 if a subsequence is present in a consensus file, or 0 if absent -def checkConsensus(consensusFile, subSeq): - for record in SeqIO.parse(open(consensusFile, 'r'), 'fasta'): - if subSeq in record.seq: - return 1 - return 0 - -# runner is the test runner -def runner(workflow, numValidations): - - # get the workflow data - if workflow == "nanopolish": - data = nanopolishTestVariants - elif workflow == "medaka": - data = medakaTestVariants - else: - print("error setting up test runner") - assert False - - # check the number of validation datasets requested - counter = numValidations - if (numValidations < 0) or (numValidations > len(testData)): - counter = len(testData) - - # run the data through the requested workflow, then validate the output - print("running the {} workflow with {} of the validation datasets...\n" .format(workflow, counter)) - for sampleID, expVariants in data.items(): - - # break when the requested number of datasets have been run - if counter == 0: - break - - # generate the command - cmd = genCommand(sampleID, workflow) - - # setup and run the minion parser - parser = pipeline.init_pipeline_parser() - - # parse the arguments - try: - args = parser.parse_args(cmd) - except SystemExit: - print("failed to parse valid command for `artic minion`") - assert False - 
- # run the minion pipeline - try: - args.func(parser, args) - except SystemExit: - print("artic minion exited early with an error") - assert False - - # check the ARTIC consensus was created - consensusFile = "%s.consensus.fasta" % sampleID - assert os.path.exists(consensusFile) == True, "no consensus produced for {}" .format(sampleID) - testSeqs = SeqIO.parse(open(consensusFile, 'r'), 'fasta') - testConsensus = next(testSeqs) - - # check the ARTIC consensus sequence matches the one on record - if workflow == "medaka": - refSeqs = SeqIO.parse(open(refMedakaConsensuses[sampleID], 'r'), 'fasta') - else: - refSeqs = SeqIO.parse(open(refConsensuses[sampleID], 'r'), 'fasta') - refConsensus = next(refSeqs) - assert testConsensus.seq == refConsensus.seq, "produced ARTIC consensus does not match expected consensus for {}" .format(sampleID) - - # check the ARTIC VCF was created - vcfFile = "%s.pass.vcf.gz" % sampleID - assert os.path.exists( - vcfFile) == True, "no VCF produced for {}" .format(sampleID) - - # open the VCF and check the reported variants match the expected - for record in vcf.Reader(filename=vcfFile): - if record.POS in expVariants: - assert record.REF == expVariants[record.POS][0], "incorrect REF reported in VCF for {} at position {}" .format(sampleID, record.POS) - assert str(record.ALT[0]) == expVariants[record.POS][1], "incorrect ALT reported in VCF for {} at position {}" .format(sampleID, record.POS) - - # if this is an expected deletion, check the consensus sequence for it's absence - if expVariants[record.POS][2] == "del": - assert checkConsensus(consensusFile, record.REF) == 0, "expected deletion for {} was reported but was left in consensus" .format(sampleID) - - # also check that the VCF record is correctly labelled as DEL - assert record.is_deletion, "deletion for {} not formatted correctly in VCF" .format(sampleID) - - # if this is an expected indel, check that the VCF record is correctly labelled as INDEL - if expVariants[record.POS][2] == "indel": - assert record.is_indel, "indel for {} not formatted correctly in VCF" .format(sampleID) - - # else, check that the VCF record is correctly labelled as SNP - if expVariants[record.POS][2] == "snp": - assert record.is_snp, "snp for {} not formatted correctly in VCF" .format(sampleID) - - # decrement/remove the variant from the expected list, so we can keep track of checked variants - expVariants[record.POS][3] -= 1 - if (expVariants[record.POS][3] == 0): - del expVariants[record.POS] - else: - print("unexpected variant found for {}: {} at {}" .format(sampleID, str(record.ALT[0]), record.POS)) - assert False - - # check we've confirmed all the expected variants - if len(expVariants) != 0: - print("variants missed during test for {}" .format(sampleID)) - for key, val in expVariants.items(): - print("\t{}: {}" .format(key, val)) - assert False - - # clean up pipeline files - cleanUp(sampleID) - counter -= 1 - -# test_NanopolishMinion is the unit test runner to test the minion pipeline with the nanopolish workflow -@pytest.mark.env("nanopolish") -def test_NanopolishMinion(numValidations): - runner("nanopolish", numValidations) - -# test_MedakaMinion is the unit test runner to test the minion pipeline with the medaka workflow -@pytest.mark.env("medaka") -def test_MedakaMinion(numValidations): - runner("medaka", numValidations) diff --git a/artic/pipeline.py b/artic/pipeline.py index f8a82506..76f56656 100755 --- a/artic/pipeline.py +++ b/artic/pipeline.py @@ -5,30 +5,23 @@ import argparse import sys +import os from . 
import version def run_subtool(parser, args): - if args.command == 'extract': - from . import extract as submodule - if args.command == 'basecaller': - from . import basecaller as submodule - if args.command == 'demultiplex': - from . import demultiplex as submodule - if args.command == 'minion': + if args.command == "minion": from . import minion as submodule - if args.command == 'gather': - from . import gather as submodule - if args.command == 'guppyplex': + if args.command == "guppyplex": from . import guppyplex as submodule - if args.command == 'rampart': + if args.command == "rampart": from . import rampart as submodule - if args.command == 'filter': + if args.command == "filter": from . import filter_reads as submodule - if args.command == 'run': + if args.command == "run": from . import run as submodule - if args.command == 'export': + if args.command == "export": from . import export as submodule # run the chosen submodule. @@ -38,9 +31,13 @@ def run_subtool(parser, args): class ArgumentParserWithDefaults(argparse.ArgumentParser): def __init__(self, *args, **kwargs): super(ArgumentParserWithDefaults, self).__init__(*args, **kwargs) - self.add_argument("-q", "--quiet", help="Do not output warnings to stderr", - action="store_true", - dest="quiet") + self.add_argument( + "-q", + "--quiet", + help="Do not output warnings to stderr", + action="store_true", + dest="quiet", + ) def init_pipeline_parser(): @@ -52,154 +49,244 @@ def init_pipeline_parser(): The initialised argparse Argument Parser for the pipeline """ parser = argparse.ArgumentParser( - prog='artic', formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("-v", "--version", help="Installed Artic version", - action="version", - version="%(prog)s " + str(version.__version__)) + prog="artic", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "-v", + "--version", + help="Installed Artic version", + action="version", + version="%(prog)s " + str(version.__version__), + ) subparsers = parser.add_subparsers( - title='[sub-commands]', dest='command', parser_class=ArgumentParserWithDefaults) - - # extract - parser_extract = subparsers.add_parser('extract', - help='Create an empty poredb database') - parser_extract.add_argument('directory', metavar='directory', - help='The name of the database.') - parser_extract.add_argument('--basecaller', metavar='basecaller', - default='ONT Albacore Sequencing Software', - help='The name of the basecaller') - parser_extract.set_defaults(func=run_subtool) - - # callers - parser_extract = subparsers.add_parser( - 'basecaller', help='Display basecallers in files') - parser_extract.add_argument( - 'directory', metavar='directory', help='Directory of FAST5 files.') - parser_extract.set_defaults(func=run_subtool) - - # demultiplex - parser_demultiplex = subparsers.add_parser( - 'demultiplex', help='Run demultiplex') - parser_demultiplex.add_argument( - 'fasta', metavar='fasta', help='Undemultiplexed FASTA file.') - parser_demultiplex.add_argument( - '--threads', type=int, default=8, help='Number of threads') - parser_demultiplex.add_argument( - '--prefix', help='Prefix for demultiplexed files') - parser_demultiplex.add_argument( - '--no-remove-directory', dest='no_remove_directory', action='store_true') - parser_demultiplex.set_defaults(func=run_subtool) + title="[sub-commands]", dest="command", parser_class=ArgumentParserWithDefaults + ) # minion - parser_minion = subparsers.add_parser('minion', help='Run the alignment/variant-call/consensus 
pipeline') + parser_minion = subparsers.add_parser( + "minion", help="Run the alignment/variant-call/consensus pipeline" + ) + # parser_minion.add_argument( + # "scheme", metavar="scheme", help="The name of the scheme" + # ) parser_minion.add_argument( - 'scheme', metavar='scheme', help='The name of the scheme') + "sample", metavar="sample", help="The name of the sample" + ) parser_minion.add_argument( - 'sample', metavar='sample', help='The name of the sample') - parser_minion.add_argument('--medaka', dest='medaka', action='store_true', - help='Use medaka instead of nanopolish for variants') - parser_minion.add_argument('--medaka-model', metavar='medaka_model', help='The model to use for medaka (required if using --medaka)') - parser_minion.add_argument('--no-longshot', dest='no_longshot', action='store_true', help='Do not use Longshot for variant filtering after medaka') - parser_minion.add_argument('--minimap2', dest='minimap2', default=True, - action='store_true', help='Use minimap2 (default)') + "--model", + help="The model to use for medaka or clair3 (must be the specific model for the variant caller in question)", + required=True, + ) parser_minion.add_argument( - '--bwa', dest='bwa', action='store_true', help='Use bwa instead of minimap2') - parser_minion.add_argument('--normalise', dest='normalise', type=int, - default=100, help='Normalise down to moderate coverage to save runtime (default: %(default)d, deactivate with `--normalise 0`)') + "--clair3", + action="store_true", + help="Use Clair3 for variant calling, currently experimental (default: medaka)", + ) parser_minion.add_argument( - '--threads', type=int, default=8, help='Number of threads (default: %(default)d)') - parser_minion.add_argument('--scheme-directory', metavar='scheme_directory', - default='./primer-schemes', help='Default scheme directory') - parser_minion.add_argument('--scheme-version', metavar='scheme_version', - default=1, help='Primer scheme version (default: %(default)d)') - parser_minion.add_argument('--max-haplotypes', type=int, default=1000000, - metavar='max_haplotypes', help='max-haplotypes value for nanopolish') - parser_minion.add_argument('--read-file', metavar='read_file', - help='Use alternative FASTA/FASTQ file to .fasta') - parser_minion.add_argument('--fast5-directory', help='FAST5 Directory') + "--model-path", + metavar="model_path", + help="Path containing clair3 models, defaults to models packaged with conda installation by default", + type=str, + ) parser_minion.add_argument( - '--sequencing-summary', help='Path to Guppy sequencing summary') - parser_minion.add_argument('--skip-nanopolish', action='store_true') - parser_minion.add_argument('--no-indels', action='store_true', help='Do not report InDels (uses SNP-only mode of nanopolish/medaka)') - parser_minion.add_argument('--no-frameshifts', action='store_true', help='Remove variants which induce frameshifts (ignored when --no-indels set)') - parser_minion.add_argument('--dry-run', action='store_true') - parser_minion.add_argument('--strict', action='store_true', help='Run with strict filtering of variants against primer scheme') - parser_minion.set_defaults(func=run_subtool) + "--no-longshot", + dest="no_longshot", + action="store_true", + help="Do not use Longshot for variant filtering after medaka", + ) + parser_minion.add_argument( + "--min-mapq", + type=int, + default=20, + help="Minimum mapping quality to consider (default: %(default)d)", + ) + parser_minion.add_argument( + "--normalise", + dest="normalise", + type=int, + 
default=100, + help="Normalise down to moderate coverage to save runtime (default: %(default)d, deactivate with `--normalise 0`)", + ) + parser_minion.add_argument( + "--primer-match-threshold", + type=int, + default=35, + help="Allow fuzzy primer matching within this threshold (default: %(default)d)", + ) + parser_minion.add_argument( + "--threads", + type=int, + default=8, + help="Number of threads (default: %(default)d)", + ) + + remote_scheme_options = parser_minion.add_argument_group( + "Remote Scheme Options", + "Options for automagically fetching primer schemes from the scheme repository (https://github.com/quick-lab/primerschemes)", + ) + remote_scheme_options.add_argument( + "--scheme-directory", + metavar="scheme_directory", + default=f"{os.getcwd()}/primer-schemes", + help="Default scheme directory (default: %(default)s)", + ) + remote_scheme_options.add_argument( + "--scheme-name", + metavar="scheme_name", + type=str, + help="Name of the scheme, e.g. sars-cov-2, mpxv to fetch from the scheme repository (https://github.com/quick-lab/primerschemes)", + ) + remote_scheme_options.add_argument( + "--scheme-length", + type=int, + metavar="scheme_length", + default=False, + help="Length of the scheme to fetch from the scheme repository (https://github.com/quick-lab/primerschemes). This is only required if the --scheme-name has multiple possible lengths.", + ) + remote_scheme_options.add_argument( + "--scheme-version", + metavar="vX.X.X", + type=str, + help="Primer scheme version", + ) - # gather - parser_gather = subparsers.add_parser( - 'gather', help='Gather up demultiplexed files') - parser_gather.add_argument('--directory', nargs='+', metavar='directory', - help='Basecalled (guppy) results directory or directories.') - parser_gather.add_argument( - '--max-length', type=int, metavar='max_length', help='remove reads greater than read length') - parser_gather.add_argument( - '--min-length', type=int, metavar='min_length', help='remove reads less than read length') - parser_gather.add_argument('--prefix', help='Prefix for gathered files') - parser_gather.add_argument('--prompt-directory', metavar='run_directory', - help='The run directory for interactive prompts', default='/var/lib/minknown/data') - parser_gather.add_argument( - '--fast5-directory', metavar='fast5_directory', help='The directory with fast5 files') - parser_gather.add_argument('--no-fast5s', action='store_true', - help='Do not use fast5s and nanopolish', default=0) - parser_gather.add_argument('--limit', type=int, help='Only gather n reads') - parser_gather.set_defaults(func=run_subtool) + local_scheme_options = parser_minion.add_argument_group( + "Local Scheme Options", + "Options for providing a local primer scheme and reference sequence (BED file and FASTA file) directly to the pipeline", + ) + local_scheme_options.add_argument( + "--bed", + metavar="bed", + help="BED file containing primer scheme", + ) + local_scheme_options.add_argument( + "--ref", + help="Reference sequence for the scheme", + ) + + parser_minion.add_argument( + "--read-file", + metavar="read_file", + help="FASTQ file containing reads", + required=True, + ) + parser_minion.add_argument( + "--no-indels", + action="store_true", + help="Do not report InDels (uses SNP-only mode of nanopolish/medaka)", + ) + parser_minion.add_argument( + "--no-frameshifts", + action="store_true", + help="Remove variants which induce frameshifts (ignored when --no-indels set)", + ) + parser_minion.add_argument( + "--use-muscle", + action="store_true", + help="Run muscle 
alignment of consensus to reference", + ) + parser_minion.add_argument("--dry-run", action="store_true") + parser_minion.set_defaults(func=run_subtool) # guppyplex # This is a workflow that aggregates the previous gather and demultiplex steps into a single task. # This is making an assumption that the results from MinKnow demultiplex are good-enough. parser_guppyplex = subparsers.add_parser( - 'guppyplex', help='Aggregate pre-demultiplexed reads from MinKNOW/Guppy') - parser_guppyplex.add_argument('--directory', metavar='directory', - help='Basecalled and demultiplexed (guppy) results directory', required=True) + "guppyplex", help="Aggregate pre-demultiplexed reads from MinKNOW/Guppy" + ) parser_guppyplex.add_argument( - '--max-length', type=int, metavar='max_length', help='remove reads greater than read length') + "--directory", + metavar="directory", + help="Basecalled and demultiplexed (guppy) results directory", + required=True, + ) parser_guppyplex.add_argument( - '--min-length', type=int, metavar='min_length', help='remove reads less than read length') - parser_guppyplex.add_argument('--quality', type=float, metavar='quality', - default=7, help='remove reads against this quality filter') - parser_guppyplex.add_argument('--sample', type=float, metavar='sample', default=1, - help='sampling frequency for random sample of sequence to reduce excess') + "--max-length", + type=int, + metavar="max_length", + help="remove reads greater than read length", + ) parser_guppyplex.add_argument( - '--skip-quality-check', action='store_true', help='Do not filter on quality score (speeds up)') + "--min-length", + type=int, + metavar="min_length", + help="remove reads less than read length", + ) parser_guppyplex.add_argument( - '--prefix', help='Prefix for guppyplex files') - parser_guppyplex.add_argument('--output', metavar='output', - help='FASTQ file to write') + "--quality", + type=float, + metavar="quality", + default=7, + help="remove reads against this quality filter", + ) + parser_guppyplex.add_argument( + "--sample", + type=float, + metavar="sample", + default=1, + help="sampling frequency for random sample of sequence to reduce excess", + ) + parser_guppyplex.add_argument( + "--skip-quality-check", + action="store_true", + help="Do not filter on quality score (speeds up)", + ) + parser_guppyplex.add_argument("--prefix", help="Prefix for guppyplex files") + parser_guppyplex.add_argument( + "--output", metavar="output", help="FASTQ file to write" + ) parser_guppyplex.set_defaults(func=run_subtool) # filter - parser_filter = subparsers.add_parser( - 'filter', help='Filter FASTQ files by length') - parser_filter.add_argument( - 'filename', metavar='filename', help='FASTQ file.') + parser_filter = subparsers.add_parser("filter", help="Filter FASTQ files by length") + parser_filter.add_argument("filename", metavar="filename", help="FASTQ file.") parser_filter.add_argument( - '--max-length', type=int, metavar='max_length', help='remove reads greater than read length') + "--max-length", + type=int, + metavar="max_length", + help="remove reads greater than read length", + ) parser_filter.add_argument( - '--min-length', type=int, metavar='min_length', help='remove reads less than read length') + "--min-length", + type=int, + metavar="min_length", + help="remove reads less than read length", + ) parser_filter.set_defaults(func=run_subtool) # rampart parser_rampart = subparsers.add_parser( - 'rampart', help='Interactive prompts to start RAMPART') - parser_rampart.add_argument('--protocol-directory', 
metavar='protocol_directory', - help='The RAMPART protocols directory.', default='/home/artic/artic/artic-ebov/rampart') - parser_rampart.add_argument('--run-directory', metavar='run_directory', - help='The run directory', default='/var/lib/MinKNOW/data') + "rampart", help="Interactive prompts to start RAMPART" + ) + parser_rampart.add_argument( + "--protocol-directory", + metavar="protocol_directory", + help="The RAMPART protocols directory.", + default="/home/artic/artic/artic-ebov/rampart", + ) + parser_rampart.add_argument( + "--run-directory", + metavar="run_directory", + help="The run directory", + default="/var/lib/MinKNOW/data", + ) parser_rampart.set_defaults(func=run_subtool) # export parser_export = subparsers.add_parser( - 'export', help='Export reads and fAST5 into a neat archive') - parser_export.add_argument('prefix') - parser_export.add_argument('bamfile') - parser_export.add_argument('sequencing_summary') - parser_export.add_argument('fast5_directory') - parser_export.add_argument('output_directory') + "export", help="Export reads and fAST5 into a neat archive" + ) + parser_export.add_argument("prefix") + parser_export.add_argument("bamfile") + parser_export.add_argument("sequencing_summary") + parser_export.add_argument("fast5_directory") + parser_export.add_argument("output_directory") parser_export.set_defaults(func=run_subtool) # run parser_run = subparsers.add_parser( - 'run', help='Process an entire run folder interactively') + "run", help="Process an entire run folder interactively" + ) parser_run.set_defaults(func=run_subtool) # return the parser @@ -212,7 +299,7 @@ def main(): parser = init_pipeline_parser() # collect the args - args = parser.parse_args(sys.argv[1:]) + args = parser.parse_args() # if args.quiet: # logger.setLevel(logging.ERROR) diff --git a/artic/rampart.py b/artic/rampart.py index b851db22..754bb40d 100755 --- a/artic/rampart.py +++ b/artic/rampart.py @@ -1,79 +1,91 @@ -#Written by Nick Loman (@pathogenomenick) +# Written by Nick Loman (@pathogenomenick) import os import sys from Bio import SeqIO from clint.textui import colored, puts, indent + def chooser(directories): - print ("Found the following directories:") - for n, d in enumerate(directories): - print (" [%d]: %s" % (n+1, d)) - print ("Choose 0 to quit") - - while True: - choice = input("=> ") - try: - choice_number = int(choice) - if choice_number == 0: - raise SystemExit(1) - path = directories[choice_number-1] - break - except Exception: - print ("Invalid choice, please select from the list above.") - - return (path) + print("Found the following directories:") + for n, d in enumerate(directories): + print(" [%d]: %s" % (n + 1, d)) + print("Choose 0 to quit") + + while True: + choice = input("=> ") + try: + choice_number = int(choice) + if choice_number == 0: + raise SystemExit(1) + path = directories[choice_number - 1] + break + except Exception: + print("Invalid choice, please select from the list above.") + + return path + def run(parser, args): - directories = [] + directories = [] - for root, dirs, files in os.walk(args.run_directory, topdown=False): - for d in dirs: - if d == 'fastq_pass': - directories.append(root+'/'+d) + for root, dirs, files in os.walk(args.run_directory, topdown=False): + for d in dirs: + if d == "fastq_pass": + directories.append(root + "/" + d) - basecalledPath = chooser(directories) + basecalledPath = chooser(directories) - directories = os.listdir(args.protocol_directory) - directories = [d for d in directories if 
os.path.isdir(args.protocol_directory+'/'+d)] + directories = os.listdir(args.protocol_directory) + directories = [ + d for d in directories if os.path.isdir(args.protocol_directory + "/" + d) + ] - protocolPath = chooser(directories) + protocolPath = chooser(directories) - print (basecalledPath, protocolPath) + print(basecalledPath, protocolPath) - skip_barcoding = False - if os.path.exists('run_configuration.json'): - reenter = input ("Do you want to enter sample names again? (Y/N): ") - if reenter.lower().startswith('n'): - skip_barcoding = True + skip_barcoding = False + if os.path.exists("run_configuration.json"): + reenter = input("Do you want to enter sample names again? (Y/N): ") + if reenter.lower().startswith("n"): + skip_barcoding = True - if not skip_barcoding: - barcodes = [] - print ("Enter sample names:") - for barcode in range(1,25): - print (" NB%02d: " % (barcode,), end = "") - barcodes.append(input("> ")) + if not skip_barcoding: + barcodes = [] + print("Enter sample names:") + for barcode in range(1, 25): + print(" NB%02d: " % (barcode,), end="") + barcodes.append(input("> ")) - fh = open("run_configuration.json", "w") - fh.write("""{ + fh = open("run_configuration.json", "w") + fh.write( + """{ \"samples\": [ - """) - first = True - for n, b in enumerate(barcodes): - if b: - if not first: - fh.write(",\n") - first = False - fh.write("""{ + """ + ) + first = True + for n, b in enumerate(barcodes): + if b: + if not first: + fh.write(",\n") + first = False + fh.write( + """{ "name": "%s", "description": "", "barcodes": [ "NB%02d" ] - }""" % (barcodes[n].replace("\"", "\\\""), n+1)) - fh.write("] }") - fh.close() - - cmd = "rampart --basecalledPath %s --protocol %s/%s --clearAnnotated" % (basecalledPath, args.protocol_directory, protocolPath) - print (cmd) + }""" + % (barcodes[n].replace('"', '\\"'), n + 1) + ) + fh.write("] }") + fh.close() - os.system(cmd) + cmd = "rampart --basecalledPath %s --protocol %s/%s --clearAnnotated" % ( + basecalledPath, + args.protocol_directory, + protocolPath, + ) + print(cmd) + os.system(cmd) diff --git a/artic/runs.py b/artic/runs.py index cab54e44..5e733570 100644 --- a/artic/runs.py +++ b/artic/runs.py @@ -1,13 +1,13 @@ -import sys import csv + def load_runs(fn): - rows = [] - skipcomments = (row for row in open(fn) if not row.startswith('#')) - for row in csv.DictReader(skipcomments, dialect='excel-tab'): - if 'Included' not in row: - rows.append(row) - elif int(row['Included']): - rows.append(row) - - return rows + rows = [] + skipcomments = (row for row in open(fn) if not row.startswith("#")) + for row in csv.DictReader(skipcomments, dialect="excel-tab"): + if "Included" not in row: + rows.append(row) + elif int(row["Included"]): + rows.append(row) + + return rows diff --git a/artic/utils.py b/artic/utils.py new file mode 100644 index 00000000..86dc5ffe --- /dev/null +++ b/artic/utils.py @@ -0,0 +1,625 @@ +#!/usr/bin/env python + +import sys +import os +import requests +import hashlib +import re +import pandas as pd +from clint.textui import colored + + +def getPrimerDirection(primerID): + """Infer the primer direction based on it's ID containing LEFT/RIGHT + + Parameters + ---------- + primerID : string + The primer ID from the 4th field of the primer scheme + """ + if "LEFT" in primerID: + return "+" + elif "RIGHT" in primerID: + return "-" + else: + print("LEFT/RIGHT must be specified in Primer ID", file=sys.stderr) + raise SystemExit(1) + + +def merge_sites(canonical, alt): + """Merges a canonical primer site with an alt site, 
producing an interval that encompasses both + + Parameters + ---------- + canonical : dict + The canonical primer site, provided as a dictionary of the bed file row + alt : dict + The alt primer site, provided as a dictionary of the bed file row + + Returns + ------- + dict + A dictionary of the merged site, where the dict represents a bed file row + """ + # base the merged site on the canonical + mergedSite = canonical + + # check the both the canonical and alt are the same direction + if canonical["direction"] != alt["direction"]: + print( + "could not merge alt with different orientation to canonical", + file=sys.stderr, + ) + raise SystemExit(1) + + # merge the start/ends of the alt with the canonical to get the largest window possible + if alt["start"] < canonical["start"]: + mergedSite["start"] = alt["start"] + if alt["end"] > canonical["end"]: + mergedSite["end"] = alt["end"] + return mergedSite + + +def identify_bed_file(bed_file): + """Identifies the version of the primer ID format used in the bed file + + Parameters + ---------- + bed_file : str + The bed file to identify the primer ID format + + Returns + ------- + int + The version of the primer ID format used in the bed file + """ + + V1_pattern = re.compile( + r"[a-zA-Z0-9\-]+_[0-9]+_(LEFT|RIGHT)(_ALT[0-9]*|_alt[0-9]*)*" + ) + + V2_pattern = re.compile(r"[a-zA-Z0-9\-]+_[0-9]+_(LEFT|RIGHT)_[0-9]+") + + version = False + + with open(bed_file, "r") as bed_fh: + bed = bed_fh.readlines() + + for line in bed: + if line.startswith("#"): + continue + + splits = line.strip().split("\t") + + if len(splits) < 6: + print( + colored.red( + f"Invalid bed file format, only found {len(splits)} columns. For valid formats please see https://github.com/quick-lab/primerschemes" + ) + ) + raise SystemExit(1) + + if not V1_pattern.match(splits[3]) and not V2_pattern.match(splits[3]): + print( + colored.red( + f"Invalid primer ID format, {splits[3]} does not match the expected format" + ) + ) + raise SystemExit(1) + + if V2_pattern.match(splits[3]): + if len(splits) < 7: + print( + colored.red( + f"Invalid bed file format, only found {len(splits)} columns. For valid formats please see https://github.com/ChrisgKent/primal-page" + ) + ) + + if not version: + version = 3 + + if version != 3: + print( + colored.red( + f"Scheme BED does not appear to be a consistent scheme version, for primer scheme formats please see https://github.com/ChrisgKent/primal-page" + ) + ) + raise SystemExit(1) + + elif V1_pattern.match(splits[3]): + + if len(splits) == 7: + if not version: + version = 2 + elif len(splits) == 6: + if not version: + version = 1 + else: + print( + colored.red( + f"Invalid bed file format, found {len(splits)} columns with V1 primer names. For valid formats please see https://github.com/ChrisgKent/primal-page" + ) + ) + + if version == 3: + print( + colored.red( + f"Scheme BED mixed primer ID formats, please ensure your BED file is consistent" + ) + ) + raise SystemExit(1) + + return version + + +def read_bed_file(fn): + """Parses a V2/V3 bed file and collapses primers into canonical primer sites + + Parameters + ---------- + fn : str + The bedfile to parse + + Returns + ------- + list + A list of dictionaries, where each dictionary contains a row of the parsed bedfile. 
+ The available dictionary keys are - Primer_ID, direction, start, end + """ + + # read the primer scheme into a pandas dataframe and run type, length and null checks + version = identify_bed_file(fn) + + if version in (1, 2): + return read_bed_file_legacy(fn) + + primers = pd.read_csv( + fn, + sep="\t", + comment="#", + header=None, + names=["chrom", "start", "end", "Primer_ID", "PoolName", "direction"], + dtype={ + "chrom": str, + "start": int, + "end": int, + "Primer_ID": str, + "PoolName": str, + }, + usecols=(0, 1, 2, 3, 4, 5), + skiprows=0, + ) + if len(primers.index) < 1: + print("primer scheme file is empty", file=sys.stderr) + raise SystemExit(1) + if primers.isnull().sum().sum(): + print("malformed primer scheme file", file=sys.stderr) + raise SystemExit(1) + + canonical_primers = {} + for _, row in primers.iterrows(): + scheme_name, primer_id, direction, primer_n = row["Primer_ID"].split("_") + + if (primer_id, direction) not in canonical_primers: + canonical_primers[(primer_id, direction)] = row.to_dict() + continue + + canonical_primers[(primer_id, direction)] = merge_sites( + canonical_primers[(primer_id, direction)], row + ) + + # return the bedFile as a list + return [value for value in canonical_primers.values()] + + +def read_bed_file_legacy(fn): + """Parses a bed file and collapses alts into canonical primer sites + + Parameters + ---------- + fn : str + The bedfile to parse + + Returns + ------- + list + A list of dictionaries, where each dictionary contains a row of the parsed bedfile. + The available dictionary keys are - Primer_ID, direction, start, end + """ + + # read the primer scheme into a pandas dataframe and run type, length and null checks + primers = pd.read_csv( + fn, + sep="\t", + header=None, + names=["chrom", "start", "end", "Primer_ID", "PoolName", "direction"], + dtype={ + "chrom": str, + "start": int, + "end": int, + "Primer_ID": str, + "PoolName": str, + }, + usecols=(0, 1, 2, 3, 4, 5), + skiprows=0, + ) + if len(primers.index) < 1: + print("primer scheme file is empty", file=sys.stderr) + raise SystemExit(1) + if primers.isnull().sum().sum(): + print("malformed primer scheme file", file=sys.stderr) + raise SystemExit(1) + + # separate alt primers into a new dataframe + altFilter = primers["Primer_ID"].str.contains("_alt") + alts = pd.DataFrame( + columns=("chrom", "start", "end", "Primer_ID", "PoolName", "direction") + ) + alts = pd.concat([alts, primers[altFilter]]) + primers = primers.drop(primers[altFilter].index.values) + + # convert the primers dataframe to dictionary, indexed by Primer_ID + # - verify_integrity is used to prevent duplicate Primer_IDs being processed + bedFile = primers.set_index( + "Primer_ID", drop=False, verify_integrity=True + ).T.to_dict() + + # if there were no alts, return the bedfile as a list of dicts + if len(alts.index) == 0: + return list(bedFile.values()) + + # merge alts + for _, row in alts.iterrows(): + primerID = row["Primer_ID"].split("_alt")[0] + + # check the bedFile if another version of this primer exists + if primerID not in bedFile: + + # add to the bed file and continue + bedFile[primerID] = row.to_dict() + continue + + # otherwise, we've got a primer ID we've already seen so merge the alt + mergedSite = merge_sites(bedFile[primerID], row) + + # update the bedFile + bedFile[primerID] = mergedSite + + # return the bedFile as a list + return [value for value in bedFile.values()] + + +def overlaps(coords, pos): + for v in coords: + if pos >= v["start"] and pos <= v["end"]: + return v + return False + + 
+def check_hash(filepath, manifest_hash): + with open(filepath, "rb") as fh: + data = fh.read() + hash_md5 = hashlib.md5(data).hexdigest() + if hash_md5 != manifest_hash: + print( + colored.yellow( + f"md5 hash for {str(filepath)} does not match manifest, if this scheme has been downloaded previously, please ensure the file is not corrupted or has been tampered with. If this is a new download, please raise this as an issue here: https://github.com/quick-lab/primerschemes/issues" + ), + file=sys.stderr, + ) + raise SystemExit(1) + + +def get_scheme( + scheme_name: str, + scheme_version: str, + scheme_directory: str, + scheme_length: int = False, +): + """Get the primer scheme and reference fasta file from the manifest + + Args: + scheme_name (str): Name of the scheme + scheme_version (str): Version of the scheme + scheme_directory (str): Directory where schemes are stored + scheme_length (int, optional): Length of the scheme. Defaults to False. + + Returns: + str: Path to the primer bed file + str: Path to the reference fasta file + str: Version of the scheme + """ + + try: + response = requests.get( + "https://raw.githubusercontent.com/quick-lab/primerschemes/main/index.json" + ) + except requests.exceptions.RequestException as error: + print(colored.red(f"Failed to fetch manifest with Exception: {error}")) + raise SystemExit(1) + + if response.status_code != 200: + print( + colored.red( + f"Failed to fetch primer schemes manifest with status code: {response.status_code}" + ) + ) + raise SystemExit(1) + + manifest = response.json() + + try: + response = requests.get( + "https://raw.githubusercontent.com/quick-lab/primerschemes/main/aliases.json" + ) + except requests.exceptions.RequestException as error: + print(colored.red(f"Failed to fetch scheme aliases with Exception: {error}")) + raise SystemExit(1) + + if response.status_code != 200: + print( + colored.red( + f"Failed to fetch primer schemes alias file with status code: {response.status_code}" + ) + ) + raise SystemExit(1) + + aliases = response.json() + + scheme_name = scheme_name.lower().rstrip() + + if not manifest["primerschemes"].get(scheme_name): + if not aliases.get(scheme_name): + print( + colored.red( + f"Requested scheme: {scheme_name} could not be found, please check https://github.com/quick-lab/primerschemes for available schemes or provide a scheme bed and fasta reference directly using --bed and --ref" + ) + ) + raise SystemExit(1) + + scheme_name = aliases[scheme_name] + + if not manifest["primerschemes"].get(scheme_name): + print( + colored.red( + f"Scheme name alias does not exist in manifest, this should never happen, please create an issue on https://github.com/quick-lab/primerschemes/issues or https://github.com/artic-network/fieldbioinformatics/issues if you see this message" + ) + ) + raise SystemExit(1) + + scheme = manifest["primerschemes"][scheme_name] + + if len(scheme.keys()) > 1: + if not scheme_length: + print( + colored.red( + f"Multiple lengths of the scheme {scheme_name} found, please provide a scheme length using --scheme-length, available lengths are: {', '.join(scheme.keys())}" + ) + ) + raise SystemExit(1) + + else: + scheme_length = list(scheme.keys())[0] + + if not scheme.get(scheme_length): + print( + colored.red( + f"Provided length: {scheme_length} of the scheme {scheme_name} not found, please provide one of the following lengths using --scheme-length: {', '.join(scheme.keys())}" + ) + ) + raise SystemExit(1) + + scheme_version = scheme_version.lower().rstrip() + + version_pattern = 
re.compile(r"v\d+\.\d+\.\d+") + + if not version_pattern.match(scheme_version): + print( + colored.red( + f"Invalid scheme version format, please provide a version in the format 'vX.X.X', e.g. v1.0.0" + ) + ) + raise SystemExit(1) + + if not scheme[scheme_length].get(scheme_version): + print( + colored.red( + f"Requested scheme version: {scheme_version} not found, available versions are: {', '.join(scheme[scheme_length].keys())}" + ) + ) + raise SystemExit(1) + + scheme = scheme[scheme_length][scheme_version] + + scheme_path = os.path.join( + scheme_directory, scheme_name, scheme_length, scheme_version + ) + + if not os.path.exists(scheme_path): + os.makedirs(scheme_path, exist_ok=True) + + bed_url = scheme["primer_bed_url"] + + bed_name = bed_url.split("/")[-1] + + bed_path = os.path.join(scheme_path, bed_name) + + if not os.path.exists(bed_path): + try: + response = requests.get(bed_url) + except requests.exceptions.RequestException as error: + print( + colored.red(f"Failed to fetch primer bed file with Exception: {error}") + ) + raise SystemExit(1) + + if response.status_code != 200: + print( + colored.red( + f"Failed to fetch primer bed file with status code: {response.status_code}" + ) + ) + raise SystemExit(1) + + with open(bed_path, "w") as bed_file: + bed_file.write(response.text) + + check_hash(bed_path, scheme["primer_bed_md5"]) + + ref_url = scheme["reference_fasta_url"] + + ref_name = ref_url.split("/")[-1] + + ref_path = os.path.join(scheme_path, ref_name) + + if not os.path.exists(ref_path): + try: + response = requests.get(ref_url) + except requests.exceptions.RequestException as error: + print( + colored.red( + f"Failed to fetch reference fasta file with Exception: {error}" + ) + ) + raise SystemExit(1) + + if response.status_code != 200: + print( + colored.red( + f"Failed to fetch reference fasta file with status code: {response.status_code}" + ) + ) + raise SystemExit(1) + + with open(ref_path, "w") as ref_file: + ref_file.write(response.text) + + check_hash(ref_path, scheme["reference_fasta_md5"]) + + return bed_path, ref_path, scheme_version + + +def get_scheme_legacy(scheme_name, scheme_directory, scheme_version="1"): + """Get and check the ARTIC primer scheme. + When determining a version, the original behaviour (parsing the scheme_name and + separating on /V ) is used over a specified scheme_version. If neither are + provided, the version defaults to 1. + If 0 is provided as version, the latest scheme will be downloaded. 
+ + Parameters + ---------- + scheme_name : str + The primer scheme name + scheme_directory : str + The directory containing the primer scheme and reference sequence + scheme_version : str + The primer scheme version (optional) + Returns + ------- + str + The location of the checked primer scheme + str + The location of the checked reference sequence + str + The version being used + """ + # try getting the version from the scheme name (old behaviour) + if scheme_name.find("/V") != -1: + scheme_name, scheme_version = scheme_name.split("/V") + + # create the filenames and check they exist + bed = "%s/%s/V%s/%s.scheme.bed" % ( + scheme_directory, + scheme_name, + scheme_version, + scheme_name, + ) + ref = "%s/%s/V%s/%s.reference.fasta" % ( + scheme_directory, + scheme_name, + scheme_version, + scheme_name, + ) + if os.path.exists(bed) and os.path.exists(ref): + return bed, ref, scheme_version + + # if they don't exist, try downloading them to the current directory + print( + colored.yellow( + "could not find primer scheme and reference sequence, downloading" + ), + file=sys.stderr, + ) + + try: + manifest = requests.get( + "https://raw.githubusercontent.com/artic-network/primer-schemes/master/schemes_manifest.json" + ).json() + except requests.exceptions.RequestException as error: + print("Manifest Exception:", error) + raise SystemExit(2) + + for scheme, scheme_contents in dict(manifest["schemes"]).items(): + if ( + scheme == scheme_name.lower() + or scheme_name.lower() in scheme_contents["aliases"] + ): + print( + colored.yellow( + f"\tfound requested scheme:\t{scheme} (using alias {scheme_name})" + ), + file=sys.stderr, + ) + if scheme_version == 0: + print( + colored.yellow( + f"Latest version for scheme {scheme} is -> {scheme_contents['latest_version']}" + ), + file=sys.stderr, + ) + scheme_version = scheme_contents["latest_version"] + elif scheme_version not in dict(scheme_contents["primer_urls"]).keys(): + print( + colored.yellow( + f"Requested scheme version {scheme_version} not found; using latest version ({scheme_contents['latest_version']}) instead" + ), + file=sys.stderr, + ) + scheme_version = scheme_contents["latest_version"] + bed = "%s/%s/V%s/%s.scheme.bed" % ( + scheme_directory, + scheme_name, + scheme_version, + scheme_name, + ) + ref = "%s/%s/V%s/%s.reference.fasta" % ( + scheme_directory, + scheme_name, + scheme_version, + scheme_name, + ) + + os.makedirs(os.path.dirname(bed), exist_ok=True) + with requests.get(scheme_contents["primer_urls"][scheme_version]) as fh: + open(bed, "wt").write(fh.text) + + os.makedirs(os.path.dirname(ref), exist_ok=True) + with requests.get(scheme_contents["reference_urls"][scheme_version]) as fh: + open(ref, "wt").write(fh.text) + + # check_scheme_hashes( + # bed, scheme_contents["primer_sha256_checksums"][scheme_version] + # ) + # check_scheme_hashes( + # ref, scheme_contents["reference_sha256_checksums"][scheme_version] + # ) + + return bed, ref, scheme_version + + print( + colored.yellow( + f"\tRequested scheme:\t{scheme_name} could not be found, exiting" + ), + file=sys.stderr, + ) + raise SystemExit(1) diff --git a/artic/vcf_filter.py b/artic/vcf_filter.py index 1ea2d474..91e2a648 100644 --- a/artic/vcf_filter.py +++ b/artic/vcf_filter.py @@ -1,31 +1,30 @@ -import vcf -import sys -from operator import attrgetter +from cyvcf2 import VCF, Writer from collections import defaultdict -from .vcftagprimersites import read_bed_file + def in_frame(v): if len(v.ALT) > 1: - print ("This code does not support multiple genotypes!") - raise SystemExit 
+ print("This code does not support multiple genotypes!") + raise SystemExit ref = v.REF alt = v.ALT[0] bases = len(alt) - len(ref) if not bases: - return True + return True if bases % 3 == 0: - return True + return True return False + class NanoporeFilter: def __init__(self, no_frameshifts): self.no_frameshifts = no_frameshifts pass def check_filter(self, v): - total_reads = float(v.INFO['TotalReads']) + total_reads = float(v.INFO["TotalReads"]) qual = v.QUAL - strandbias = float(v.INFO['StrandFisherTest']) + # strandbias = float(v.INFO["StrandFisherTest"]) if qual / total_reads < 3: return False @@ -34,8 +33,8 @@ def check_filter(self, v): return False if v.is_indel: - strand_fraction_by_strand = v.INFO['SupportFractionByStrand'] - if float(strand_fraction_by_strand[0]) < 0.5: + strand_fraction_by_strand = v.INFO["SupportFractionByStrand"] + if float(strand_fraction_by_strand[0]) < 0.5: return False if float(strand_fraction_by_strand[1]) < 0.5: @@ -46,12 +45,13 @@ def check_filter(self, v): return True + class MedakaFilter: def __init__(self, no_frameshifts): self.no_frameshifts = no_frameshifts def check_filter(self, v): - depth = v.INFO['DP'] + depth = v.INFO["DP"] if depth < 20: return False @@ -62,17 +62,14 @@ def check_filter(self, v): return False return True + def go(args): - vcf_reader = vcf.Reader(filename=args.inputvcf) - vcf_writer = vcf.Writer(open(args.output_pass_vcf, 'w'), vcf_reader) - vcf_writer_filtered = vcf.Writer(open(args.output_fail_vcf, 'w'), vcf_reader) - if args.nanopolish: - filter = NanoporeFilter(args.no_frameshifts) - elif args.medaka: - filter = MedakaFilter(args.no_frameshifts) - else: - print("Please specify a VCF type, i.e. --nanopolish or --medaka\n") - raise SystemExit + vcf_reader = VCF(args.inputvcf) + vcf_writer = Writer(args.output_pass_vcf, vcf_reader, "w") + vcf_writer.write_header() + vcf_writer_filtered = Writer(args.output_fail_vcf, vcf_reader, "w") + vcf_writer_filtered.write_header() + filter = MedakaFilter(args.no_frameshifts) variants = [v for v in vcf_reader] @@ -80,15 +77,15 @@ def go(args): for v in variants: indx = "%s-%s" % (v.CHROM, v.POS) group_variants[indx].append(v) - + for v in variants: - # if using medaka, we need to do a quick pre-filter to remove rubbish that we don't want adding to the mask - if args.medaka: - if v.INFO['DP'] <= 1: - continue - if v.QUAL < 20: - continue + # quick pre-filter to remove rubbish that we don't want adding to the mask + # if v.INFO["DP"] <= 1: + # continue + + if v.QUAL < 20: + continue # now apply the filter to send variants to PASS or FAIL file if filter.check_filter(v): @@ -100,29 +97,27 @@ def go(args): if len(group_variants[indx]) > 1: for check_variant in group_variants[indx]: if filter.check_filter(check_variant): - variant_passes = True - + variant_passes = True + if not variant_passes: vcf_writer_filtered.write_record(v) else: - print ("Suppress variant %s\n" % (v.POS)) + print("Suppress variant %s\n" % (v.POS)) + def main(): import argparse parser = argparse.ArgumentParser() - parser.add_argument('--nanopolish', action='store_true') - parser.add_argument('--medaka', action='store_true') - parser.add_argument('--no-frameshifts', action='store_true') - parser.add_argument('inputvcf') - parser.add_argument('output_pass_vcf') - parser.add_argument('output_fail_vcf') + parser.add_argument("--no-frameshifts", action="store_true") + parser.add_argument("inputvcf") + parser.add_argument("output_pass_vcf") + parser.add_argument("output_fail_vcf") args = parser.parse_args() go(args) + if 
__name__ == "__main__": main() - - diff --git a/artic/vcf_merge.py b/artic/vcf_merge.py index a900d697..57346a3d 100644 --- a/artic/vcf_merge.py +++ b/artic/vcf_merge.py @@ -1,59 +1,90 @@ -import vcf +from cyvcf2 import VCF, Writer import sys from operator import attrgetter from collections import defaultdict -from .vcftagprimersites import read_bed_file +from artic.utils import read_bed_file + def vcf_merge(args): - bed = read_bed_file(args.bedfile) + bed = read_bed_file(args.bedfile) + + primer_map = defaultdict(dict) + + for p in bed: + for n in range(p["start"], p["end"] + 1): + primer_map[p["PoolName"]][n] = p["Primer_ID"] - primer_map = defaultdict(dict) + template_vcf = None - for p in bed: - for n in range(p['start'], p['end']+1): - primer_map[p['PoolName']][n] = p['Primer_ID'] + pool_map = {} + for param in args.vcflist: + pool_name, file_name = param.split(":") + pool_map[file_name] = pool_name + if not template_vcf: + vcf_reader = VCF(file_name) + if vcf_reader.seqnames: + template_vcf = vcf_reader + else: + print( + f"Not using {file_name} as VCF template since it has no variants", + file=sys.stderr, + ) - first_vcf = None + # vcf_reader.infos["Pool"] = vcf.parser._Format("Pool", 1, "String", "The pool name") + vcf_reader.add_info_to_header( + {"ID": "Pool", "Number": 1, "Type": "String", "Description": "The pool name"} + ) + vcf_writer = Writer(f"{args.prefix}.merged.vcf", template_vcf, "w") + vcf_writer.write_header() + vcf_writer_primers = Writer(f"{args.prefix}.primers.vcf", template_vcf, "w") + vcf_writer_primers.write_header() - pool_map = {} - for param in args.vcflist: - pool_name, file_name = param.split(":") - pool_map[file_name] = pool_name - if not first_vcf: - first_vcf = file_name + variants = [] + for file_name, pool_name in pool_map.items(): + vcf_reader = VCF(file_name) + if not vcf_reader.seqnames: + print(f"Skipping {file_name} as it has no variants", file=sys.stderr) + continue - vcf_reader = vcf.Reader(filename=first_vcf) - vcf_reader.infos["Pool"] = vcf.parser._Format("Pool", 1, "String", "The pool name") - vcf_writer = vcf.Writer(open(args.prefix+'.merged.vcf', 'w'), vcf_reader) - vcf_writer_primers = vcf.Writer(open(args.prefix+'.primers.vcf', 'w'), vcf_reader) + vcf_reader.add_info_to_header( + { + "ID": "Pool", + "Number": 1, + "Type": "String", + "Description": "The pool name", + } + ) + for v in vcf_reader: + v.INFO["Pool"] = pool_name + variants.append(v) - variants = [] - for file_name, pool_name in pool_map.items(): - vcf_reader = vcf.Reader(filename=file_name) - for v in vcf_reader: - v.INFO['Pool'] = pool_name - variants.append(v) + variants.sort(key=attrgetter("CHROM", "POS")) - variants.sort(key=attrgetter('CHROM', 'POS')) + for v in variants: + if v.POS in primer_map[v.INFO["Pool"]]: + vcf_writer_primers.write_record(v) + print( + "found primer binding site mismatch: %s" + % (primer_map[v.INFO["Pool"]][v.POS]), + file=sys.stderr, + ) + else: + vcf_writer.write_record(v) - for v in variants: - if v.POS in primer_map[v.INFO['Pool']]: - vcf_writer_primers.write_record(v) - print("found primer binding site mismatch: %s" % (primer_map[v.INFO['Pool']][v.POS]), file=sys.stderr) - else: - vcf_writer.write_record(v) def main(): import argparse - parser = argparse.ArgumentParser(description='Trim alignments from an amplicon scheme.') - parser.add_argument('prefix') - parser.add_argument('bedfile') - parser.add_argument('vcflist', nargs='+') + parser = argparse.ArgumentParser( + description="Trim alignments from an amplicon scheme." 
+ ) + parser.add_argument("prefix") + parser.add_argument("bedfile") + parser.add_argument("vcflist", nargs="+") args = parser.parse_args() vcf_merge(args) + if __name__ == "__main__": main() - diff --git a/artic/vcfextract.py b/artic/vcfextract.py index a10140da..0c18ef26 100755 --- a/artic/vcfextract.py +++ b/artic/vcfextract.py @@ -1,58 +1,58 @@ #!/usr/bin/env python -import vcf import sys import subprocess -import csv import os from collections import defaultdict -from operator import attrgetter +from cyvcf2 import VCF + def read_vcf(fn): vcfinfo = {} - vcf_reader = vcf.Reader(open(fn, 'r')) + vcf_reader = VCF(fn) for record in vcf_reader: vcfinfo[record.POS] = record return vcfinfo + def collect_depths(bamfile): if not os.path.exists(bamfile): raise SystemExit("bamfile %s doesn't exist" % (bamfile,)) - p = subprocess.Popen(['samtools', 'depth', bamfile], stdout=subprocess.PIPE) + p = subprocess.Popen(["samtools", "depth", bamfile], stdout=subprocess.PIPE) out, err = p.communicate() depths = defaultdict(dict) - for ln in out.decode('utf-8').split("\n"): + for ln in out.decode("utf-8").split("\n"): if ln: contig, pos, depth = ln.split("\t") depths[int(pos)] = int(depth) return depths + def main(): positions = {} for sample_tag in sys.argv[1:]: - for vcfset in ['', '.primertrimmed']: + for vcfset in ["", ".primertrimmed"]: vcffn = "%s%s.vcf" % (sample_tag, vcfset) if not os.path.exists(vcffn): continue print(vcffn, file=sys.stderr) - vcf_reader = vcf.Reader(filename=vcffn) + vcf_reader = VCF(vcffn) for record in vcf_reader: if len(record.ALT[0]) == 1 and len(record.REF) == 1: - positions[record.POS] = 'snp' + positions[record.POS] = "snp" else: - positions[record.POS] = 'indel' - + positions[record.POS] = "indel" print("pos\tset\tsample\tvartype\tdepth\tsupportfraction\tbasecalledfrequency") - #for run, samples in runs.iteritems(): + # for run, samples in runs.iteritems(): # for sample_tag in samples.keys(): for sample_tag in sys.argv[1:]: - for vcfset in ['', '.primertrimmed']: + for vcfset in ["", ".primertrimmed"]: vcffn = "%s%s.vcf" % (sample_tag, vcfset) if not os.path.exists(vcffn): print("%s does not exist" % (vcffn)) @@ -62,19 +62,33 @@ def main(): bamfn = "%s.primertrimmed.sorted.bam" % (sample_tag) depths = collect_depths(bamfn) - #1-based pyvcf + # 1-based pyvcf for pos, variant_type in positions.items(): - if pos-1 in depths: - depth = depths[pos-1] + if pos - 1 in depths: + depth = depths[pos - 1] else: depth = 0 if pos in vcffile: info = vcffile[pos].INFO - print("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (pos, vcfset, sample_tag, variant_type, depth, info['SupportFraction'], info['BaseCalledFraction'])) + print( + "%s\t%s\t%s\t%s\t%s\t%s\t%s" + % ( + pos, + vcfset, + sample_tag, + variant_type, + depth, + info["SupportFraction"], + info["BaseCalledFraction"], + ) + ) else: - print("%s\t%s\t%s\tinvariant\t%s\t0\t0" % (pos, vcfset, sample_tag, depth)) + print( + "%s\t%s\t%s\tinvariant\t%s\t0\t0" + % (pos, vcfset, sample_tag, depth) + ) + if __name__ == "__main__": main() - diff --git a/artic/vcftagprimersites.py b/artic/vcftagprimersites.py deleted file mode 100755 index 914afb2d..00000000 --- a/artic/vcftagprimersites.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python - -import pandas as pd -import vcf -import sys -import subprocess -import csv -from collections import defaultdict - - -def getPrimerDirection(primerID): - """Infer the primer direction based on it's ID containing LEFT/RIGHT - - Parameters - ---------- - primerID : string - The primer ID from the 4th field of the 
primer scheme - """ - if 'LEFT' in primerID: - return '+' - elif 'RIGHT' in primerID: - return '-' - else: - print("LEFT/RIGHT must be specified in Primer ID", file=sys.stderr) - raise SystemExit(1) - - -def merge_sites(canonical, alt): - """Merges a canonical primer site with an alt site, producing an interval that encompasses both - - Parameters - ---------- - canonical : dict - The canonical primer site, provided as a dictionary of the bed file row - alt : dict - The alt primer site, provided as a dictionary of the bed file row - - Returns - ------- - dict - A dictionary of the merged site, where the dict represents a bed file row - """ - # base the merged site on the canonical - mergedSite = canonical - - # check the both the canonical and alt are the same direction - if canonical['direction'] != alt['direction']: - print( - "could not merge alt with different orientation to canonical", file=sys.stderr) - raise SystemExit(1) - - # merge the start/ends of the alt with the canonical to get the largest window possible - if alt['start'] < canonical['start']: - mergedSite['start'] = alt['start'] - if alt['end'] > canonical['end']: - mergedSite['end'] = alt['end'] - return mergedSite - - -def read_bed_file(fn): - """Parses a bed file and collapses alts into canonical primer sites - - Parameters - ---------- - fn : str - The bedfile to parse - - Returns - ------- - list - A list of dictionaries, where each dictionary contains a row of the parsed bedfile. - The available dictionary keys are - Primer_ID, direction, start, end - """ - - # read the primer scheme into a pandas dataframe and run type, length and null checks - primers = pd.read_csv(fn, sep='\t', header=None, - names=['chrom', 'start', 'end', - 'Primer_ID', 'PoolName'], - dtype={'chrom': str, 'start': int, 'end': int, - 'Primer_ID': str, 'PoolName': str}, - usecols=(0, 1, 2, 3, 4), - skiprows=0) - if len(primers.index) < 1: - print("primer scheme file is empty", file=sys.stderr) - raise SystemExit(1) - if primers.isnull().sum().sum(): - print("malformed primer scheme file", file=sys.stderr) - raise SystemExit(1) - - # compute the direction - primers['direction'] = primers.apply( - lambda row: getPrimerDirection(row.Primer_ID), axis=1) - - # separate alt primers into a new dataframe - altFilter = primers['Primer_ID'].str.contains('_alt') - alts = pd.DataFrame( - columns=('chrom', 'start', 'end', 'Primer_ID', 'PoolName', 'direction')) - alts = pd.concat([alts, primers[altFilter]]) - primers = primers.drop(primers[altFilter].index.values) - - # convert the primers dataframe to dictionary, indexed by Primer_ID - # - verify_integrity is used to prevent duplicate Primer_IDs being processed - bedFile = primers.set_index('Primer_ID', drop=False, - verify_integrity=True).T.to_dict() - - # if there were no alts, return the bedfile as a list of dicts - if len(alts.index) == 0: - return list(bedFile.values()) - - # merge alts - for _, row in alts.iterrows(): - primerID = row['Primer_ID'].split('_alt')[0] - - # check the bedFile if another version of this primer exists - if primerID not in bedFile: - - # add to the bed file and continue - bedFile[primerID] = row.to_dict() - continue - - # otherwise, we've got a primer ID we've already seen so merge the alt - mergedSite = merge_sites(bedFile[primerID], row) - - # update the bedFile - bedFile[primerID] = mergedSite - - # return the bedFile as a list - return [value for value in bedFile.values()] - - -def overlaps(coords, pos): - for v in coords: - if pos >= v['start'] and pos <= v['end']: - return 
v - return False - - -if __name__ == "__main__": - if sys.argv[1] not in sets: - print("Invalid set") - raise SystemExit(1) - - bedfile = read_bed_file(sys.argv[1]) - - vcf_reader = vcf.Reader(filename=sys.argv[2]) - vcf_writer = vcf.Writer(sys.stdout, vcf_reader) - for record in vcf_reader: - v = overlaps(bedfile, record.POS) - if v: - record.INFO['PRIMER'] = v["Sequence_(5-3')"] - -# PP = list(record.INFO) -# record.INFO = {} -# record.INFO['PP'] = PP -# record.INFO['DEPTH'] = depths[record.CHROM][record.POS] - - vcf_writer.write_record(record) diff --git a/artic/version.py b/artic/version.py index b3f9ac7f..3e8d9f94 100644 --- a/artic/version.py +++ b/artic/version.py @@ -1 +1 @@ -__version__ = "1.2.4" +__version__ = "1.4.0" diff --git a/docs/commands.md b/docs/commands.md index 68c5a5bc..ef1338c7 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -2,35 +2,16 @@ title: commands summary: The available artic pipeline commands. authors: + - Sam Wilkinson - Will Rowe - Nick Loman -date: 2020-03-30 +date: 2024-08-16 --- # Commands This page documents the available commands via the `artic` command line interface. ---- - -## basecaller - -### Overview - -Display basecallers in files - -### Input - -### Output - -### Usage example - -```bash -artic basecaller -``` - ---- - ## demultiplex ### Overview @@ -145,38 +126,6 @@ artic filter --max-length 500 --min-length 50 --- -## gather - -### Overview - -Gather up demultiplexed files - -### Input - -- director[y/ies] to gather from - -### Output - -- directory of gathered files - -### Usage example - -```bash -artic gather --directory ./ -``` - -| Argument name(s) | Required | Default value | Description | -| :----------------- | :------- | :------------ | :---------------------------------------- | -| --directory | Y | NA | The director[y/ies] to gather files from | -| prefix | Y | NA | Prefix for gathered files | -| --max-length | N | NA | Remove reads greater than max-length | -| --min-length | N | NA | Remove reads less than than min-length | -| --prompt-directory | N | NA | The run directory for interactive prompts | -| --fast5-directory | N | NA | The directory with fast5 files | -| --no-fast5s | N | NA | Do not use fast5s and nanopolish | - ---- - ## guppyplex ### Overview @@ -233,23 +182,20 @@ artic minion | :------------------- | :------- | :------------- | :------------------------------------------------------------------------------------------- | | scheme | Y | NA | The name of the primer scheme | | sample | Y | NA | The name of the sample | -| --medaka | N | False | Use medaka instead of nanopolish for variants | -| --medaka-model | * | NA | Medaka model to use (required if --medaka set) | -| --minimap2 | N | True | Use minimap2 | -| --bwa | N | False | Use bwa instead of minimap2 | +| --clair3 | N | False | Use clair3 instead of medaka for variants (experimental feature from v1.4.0) | +| --model | Y | NA | Medaka or Clair3 model to use | | --normalise | N | 100 | Normalise down to moderate coverage to save runtime | | --threads | N | 8 | Number of threads | | --scheme-directory | N | /artic/schemes | Default scheme directory | | --max-haplotypes | N | 1000000 | Max-haplotypes value for nanopolish | | --read-file | N | NA | Use alternative FASTA/FASTQ file to .fasta | -| --fast5-directory | N | NA | FAST5 Directory | -| --sequencing-summary | N | NA | Path to Guppy sequencing summary | -| --skip-nanopolish | N | False | Skip nanopolish | | --no-longshot | N | False | Use medaka variant instead of longshot (experimental feautre 
from v1.2.0) |
-| --strict             | N        | False          | Enables experimental features (from v1.2.0), including VFC overlap checks and stats          |
+| --min-mapq           | N        | 20             | Remove reads which map to the reference with a lower mapping quality than this               |
+| --no-indels          | N        | False          | Ignore insertions and deletions during variant calling, maintaining the co-ordinates of the reference |
+| --no-frameshifts     | N        | False          | Do not allow frameshift variants (indels whose length is not divisible by 3) to be added to the consensus |
+| --use-muscle         | N        | False          | Use muscle to align the produced consensus sequence against the reference                    |
 | --dry-run            | N        | False          | Perform a dry run of the minion pipeline, outputing commands to a log but not executing them |
 
-* `--medaka-model` is required if `--medaka` is set.
 
 ---
 
diff --git a/docs/faq.md b/docs/faq.md
index 7cf5f954..25cc14a9 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -13,9 +13,9 @@ date: 2020-03-30
 
 The standard operating proceedure for the ARTIC Network SARS-SoV-2 bioinformatics can be found [here](https://artic.network/ncov-2019/ncov2019-bioinformatics-sop.html).
 
-## Should I use the nanopolish or medaka workflow
+## Should I use the medaka or clair3 workflow
 
-We currently recommend the nanopolish workflow (if you have signal data available) as we have spent more time validating and supporting this workflow. That being said, both tend to give consistent results with our test datasets so the choice is yours.
+We currently recommend the medaka workflow as we have spent more time validating and supporting this workflow. That being said, both tend to give consistent results with our test datasets so the choice is yours.
 
 ## Lab-on-an-SSD
 
diff --git a/docs/installation.md b/docs/installation.md
index f5c96200..ce41f45d 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -9,7 +9,7 @@ date: 2020-03-30
 
 # Installation
 
-As of [release 1.1.0](https://github.com/artic-network/fieldbioinformatics/releases/tag/1.1.0), it is probably easiest to install the `artic pipeline` using conda. Alternatively, you can install the pipeline itself via source/pip but you will have to also satisfy the pipeline dependencies.
+As of [release 1.4.0](https://github.com/artic-network/fieldbioinformatics/releases/tag/1.4.0), conda installation of fieldbioinformatics will become difficult due to the mutually exclusive requirements of medaka and clair3. For this reason we recommend either using the docker image [available here](https://quay.io/repository/artic/fieldbioinformatics) or building the package from source after installing the dependencies via conda.
 
 ## Via conda
 
@@ -19,21 +19,21 @@ conda install -c bioconda artic
 
 ## Via source
 
-### 1. installing the pipeline
+### 1. installing dependencies
+
+The `artic pipeline` has several [software dependencies](https://github.com/artic-network/fieldbioinformatics/blob/master/environment.yml). You can solve these dependencies using the minimal conda environment we have provided; we strongly recommend that you use either the mamba solver or conda version >= 23.10.0, since the libmamba solver is now the default conda solver:
 
 ```sh
 git clone https://github.com/artic-network/fieldbioinformatics
 cd fieldbioinformatics
-python setup.py install
+conda env create -f environment.yml
 ```
 
-### 2. installing dependencies
-
-The `artic pipeline` has several [software dependencies](https://github.com/artic-network/fieldbioinformatics/blob/master/environment.yml). 
You can solve these dependencies using the minimal conda environment we have provided: +### 2. installing the pipeline ```sh -conda env create -f environment.yml conda activate artic +pip install . ``` ### 3. testing the pipeline @@ -47,7 +47,7 @@ artic -v To check that you have all the required dependencies, you can try the pipeline tests with both workflows: ``` -./test-runner.sh nanopolish +./test-runner.sh clair3 ./test-runner.sh medaka ``` diff --git a/environment.yml b/environment.yml index 1b4b891f..3f0c52d1 100644 --- a/environment.yml +++ b/environment.yml @@ -4,27 +4,25 @@ channels: - bioconda - defaults dependencies: - - artic-porechop =0.3.2pre - - artic-tools =0.2.6 - - longshot =0.4.5 - - bcftools =1.17 - - biopython =1.81 - - bwa = 0.7.17 - - clint =0.5.1 - - htslib =1.17 - - medaka >=1.8.0 - - minimap2 =2.26 - - multiqc =1.15 - - muscle =3.8 - - nanopolish =0.14.0 - - pandas =2.0.3 + - longshot + - bcftools + - biopython + - bwa + - clint + - htslib + - minimap2 + - clair3 + - multiqc + - muscle<5.1 + - pandas - pip - - pysam =0.21.0 + - pysam - pytest - - python =3.9.15 - - pyvcf =0.6.8 - - pyfaidx =0.6.0 - - requests =2.31.0 - - samtools =1.17 - - tqdm =4.66.1 - - keras-preprocessing =1.1.2 + - python + - cyvcf2 + - pyfaidx + - requests + - samtools + - tqdm + - pip: + - medaka \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9dc60131..00000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -biopython -clint -pandas -pysam -pytest -pyvcf -requests -tqdm \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..9afe6d2e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,39 @@ +[metadata] +name = artic +version = 1.4.0 +author = Nick Loman +author_email = n.j.loman@bham.ac.uk +maintainer = Sam Wilkinson +maintainer_email = s.a.j.wilkinson@bham.ac.uk +description = ``artic`` is a pipeline for working with viral amplicon sequencing data +long_description = file: README.md +long_description_content_type = text/markdown +license_files = LICENSE +classifiers = + Programming Language :: Python :: 3 + Development Status :: 4 - Beta + Intended Audience :: Science/Research + License :: OSI Approved :: MIT + Topic :: Scientific/Engineering :: Bio-Informatics + +[options] +zip_safe = False +include_package_data = True +packages = find: +python_requires = <3.11 + +[options.entry_points] +console_scripts = + artic=artic.pipeline:main + align_trim=artic.align_trim:main + align_trim_n=artic.align_trim_n:main + margin_cons=artic.margin_cons:main + margin_cons_medaka=artic.margin_cons_medaka:main + vcfextract=artic.vcfextract:main + artic_vcf_merge=artic.vcf_merge:main + artic_vcf_filter=artic.vcf_filter:main + artic_make_depth_mask=artic.make_depth_mask:main + artic_fasta_header=artic.fasta_header:main + artic_mask=artic.mask:main + artic_get_stats=artic.artic_mqc:main + diff --git a/setup.py b/setup.py index 236f0679..60684932 100644 --- a/setup.py +++ b/setup.py @@ -1,53 +1,3 @@ -import os from setuptools import setup -version_py = os.path.join(os.path.dirname(__file__), 'artic', 'version.py') -version = open(version_py).read().strip().split( - '=')[-1].replace('"', '').strip() -long_description = """ -``artic`` is a pipeline for working with virus sequencing data sequenced with nanopore -""" - -HERE = os.path.dirname(__file__) - -with open(os.path.join(HERE, "requirements.txt"), "r") as f: - install_requires = [x.strip() for x in f.readlines()] - -setup( - name="artic", - version=version, - 
install_requires=install_requires, - requires=['python (>=3.5)'], - packages=['artic'], - author="Nick Loman", - description='A toolset for working with nanopore sequencing data', - long_description=long_description, - url="https://artic.network/ncov-2019/ncov2019-bioinformatics-sop.html", - package_dir={'artic': "artic"}, - package_data={'artic': []}, - zip_safe=False, - include_package_data=True, - entry_points={ - 'console_scripts': [ - 'artic=artic.pipeline:main', - 'align_trim=artic.align_trim:main', - 'align_trim_n=artic.align_trim_n:main', - 'margin_cons=artic.margin_cons:main', - 'margin_cons_medaka=artic.margin_cons_medaka:main', - 'vcfextract=artic.vcfextract:main', - 'artic_vcf_merge=artic.vcf_merge:main', - 'artic_vcf_filter=artic.vcf_filter:main', - 'artic_make_depth_mask=artic.make_depth_mask:main', - 'artic_fasta_header=artic.fasta_header:main', - 'artic_mask=artic.mask:main', - 'artic_get_stats=artic.artic_mqc:main', - ], - }, - author_email="n.j.loman@bham.ac.uk", - classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT', - 'Topic :: Scientific/Engineering :: Bio-Informatics' - ] -) +setup() diff --git a/test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.scheme.bed b/test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.scheme.bed index 612bdee4..0127cb9e 100644 --- a/test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.scheme.bed +++ b/test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.scheme.bed @@ -1,122 +1,122 @@ -BTB20484 27 55 Ebov-DRC_1_LEFT Ebov-DRC_1 -BTB20484 383 407 Ebov-DRC_1_RIGHT Ebov-DRC_1 -BTB20484 350 376 Ebov-DRC_2_LEFT Ebov-DRC_2 -BTB20484 707 730 Ebov-DRC_2_RIGHT Ebov-DRC_2 -BTB20484 665 687 Ebov-DRC_3_LEFT Ebov-DRC_1 -BTB20484 1033 1055 Ebov-DRC_3_RIGHT Ebov-DRC_1 -BTB20484 994 1017 Ebov-DRC_4_LEFT Ebov-DRC_2 -BTB20484 1354 1376 Ebov-DRC_4_RIGHT Ebov-DRC_2 -BTB20484 1294 1316 Ebov-DRC_5_LEFT Ebov-DRC_1 -BTB20484 1680 1702 Ebov-DRC_5_RIGHT Ebov-DRC_1 -BTB20484 1625 1647 Ebov-DRC_6_LEFT Ebov-DRC_2 -BTB20484 1995 2017 Ebov-DRC_6_RIGHT Ebov-DRC_2 -BTB20484 1926 1948 Ebov-DRC_7_LEFT Ebov-DRC_1 -BTB20484 2317 2339 Ebov-DRC_7_RIGHT Ebov-DRC_1 -BTB20484 2238 2260 Ebov-DRC_8_LEFT Ebov-DRC_2 -BTB20484 2630 2652 Ebov-DRC_8_RIGHT Ebov-DRC_2 -BTB20484 2540 2565 Ebov-DRC_9_LEFT Ebov-DRC_1 -BTB20484 2895 2920 Ebov-DRC_9_RIGHT Ebov-DRC_1 -BTB20484 2843 2872 Ebov-DRC_10_LEFT Ebov-DRC_2 -BTB20484 3215 3237 Ebov-DRC_10_RIGHT Ebov-DRC_2 -BTB20484 3155 3177 Ebov-DRC_11_LEFT Ebov-DRC_1 -BTB20484 3524 3546 Ebov-DRC_11_RIGHT Ebov-DRC_1 -BTB20484 3460 3485 Ebov-DRC_12_LEFT Ebov-DRC_2 -BTB20484 3821 3843 Ebov-DRC_12_RIGHT Ebov-DRC_2 -BTB20484 3769 3792 Ebov-DRC_13_LEFT Ebov-DRC_1 -BTB20484 4121 4149 Ebov-DRC_13_RIGHT Ebov-DRC_1 -BTB20484 4014 4036 Ebov-DRC_14_LEFT Ebov-DRC_2 -BTB20484 4369 4395 Ebov-DRC_14_RIGHT Ebov-DRC_2 -BTB20484 4287 4311 Ebov-DRC_15_LEFT Ebov-DRC_1 -BTB20484 4674 4696 Ebov-DRC_15_RIGHT Ebov-DRC_1 -BTB20484 4606 4628 Ebov-DRC_16_LEFT Ebov-DRC_2 -BTB20484 4990 5014 Ebov-DRC_16_RIGHT Ebov-DRC_2 -BTB20484 4945 4967 Ebov-DRC_17_LEFT Ebov-DRC_1 -BTB20484 5315 5338 Ebov-DRC_17_RIGHT Ebov-DRC_1 -BTB20484 5235 5260 Ebov-DRC_18_LEFT Ebov-DRC_2 -BTB20484 5585 5615 Ebov-DRC_18_RIGHT Ebov-DRC_2 -BTB20484 5528 5556 Ebov-DRC_19_LEFT Ebov-DRC_1 -BTB20484 5895 5918 Ebov-DRC_19_RIGHT Ebov-DRC_1 -BTB20484 5827 5849 Ebov-DRC_20_LEFT Ebov-DRC_2 -BTB20484 6211 6237 Ebov-DRC_20_RIGHT Ebov-DRC_2 -BTB20484 6164 6188 Ebov-DRC_21_LEFT Ebov-DRC_1 -BTB20484 6535 6558 Ebov-DRC_21_RIGHT Ebov-DRC_1 -BTB20484 6479 6501 
Ebov-DRC_22_LEFT Ebov-DRC_2 -BTB20484 6859 6881 Ebov-DRC_22_RIGHT Ebov-DRC_2 -BTB20484 6791 6813 Ebov-DRC_23_LEFT Ebov-DRC_1 -BTB20484 7184 7207 Ebov-DRC_23_RIGHT Ebov-DRC_1 -BTB20484 7140 7162 Ebov-DRC_24_LEFT Ebov-DRC_2 -BTB20484 7510 7532 Ebov-DRC_24_RIGHT Ebov-DRC_2 -BTB20484 7437 7459 Ebov-DRC_25_LEFT Ebov-DRC_1 -BTB20484 7805 7827 Ebov-DRC_25_RIGHT Ebov-DRC_1 -BTB20484 7741 7763 Ebov-DRC_26_LEFT Ebov-DRC_2 -BTB20484 8097 8121 Ebov-DRC_26_RIGHT Ebov-DRC_2 -BTB20484 8056 8086 Ebov-DRC_27_LEFT Ebov-DRC_1 -BTB20484 8417 8440 Ebov-DRC_27_RIGHT Ebov-DRC_1 -BTB20484 8364 8387 Ebov-DRC_28_LEFT Ebov-DRC_2 -BTB20484 8737 8764 Ebov-DRC_28_RIGHT Ebov-DRC_2 -BTB20484 8694 8716 Ebov-DRC_29_LEFT Ebov-DRC_1 -BTB20484 9056 9081 Ebov-DRC_29_RIGHT Ebov-DRC_1 -BTB20484 9024 9046 Ebov-DRC_30_LEFT Ebov-DRC_2 -BTB20484 9375 9404 Ebov-DRC_30_RIGHT Ebov-DRC_2 -BTB20484 9307 9332 Ebov-DRC_31_LEFT Ebov-DRC_1 -BTB20484 9666 9688 Ebov-DRC_31_RIGHT Ebov-DRC_1 -BTB20484 9602 9624 Ebov-DRC_32_LEFT Ebov-DRC_2 -BTB20484 9982 10006 Ebov-DRC_32_RIGHT Ebov-DRC_2 -BTB20484 9937 9961 Ebov-DRC_33_LEFT Ebov-DRC_1 -BTB20484 10305 10330 Ebov-DRC_33_RIGHT Ebov-DRC_1 -BTB20484 10237 10259 Ebov-DRC_34_LEFT Ebov-DRC_2 -BTB20484 10617 10639 Ebov-DRC_34_RIGHT Ebov-DRC_2 -BTB20484 10539 10561 Ebov-DRC_35_LEFT Ebov-DRC_1 -BTB20484 10923 10945 Ebov-DRC_35_RIGHT Ebov-DRC_1 -BTB20484 10860 10887 Ebov-DRC_36_LEFT Ebov-DRC_2 -BTB20484 11247 11270 Ebov-DRC_36_RIGHT Ebov-DRC_2 -BTB20484 11152 11180 Ebov-DRC_37_LEFT Ebov-DRC_1 -BTB20484 11517 11539 Ebov-DRC_37_RIGHT Ebov-DRC_1 -BTB20484 11385 11409 Ebov-DRC_38_LEFT Ebov-DRC_2 -BTB20484 11770 11792 Ebov-DRC_38_RIGHT Ebov-DRC_2 -BTB20484 11700 11722 Ebov-DRC_39_LEFT Ebov-DRC_1 -BTB20484 12073 12098 Ebov-DRC_39_RIGHT Ebov-DRC_1 -BTB20484 12040 12063 Ebov-DRC_40_LEFT Ebov-DRC_2 -BTB20484 12397 12420 Ebov-DRC_40_RIGHT Ebov-DRC_2 -BTB20484 12346 12370 Ebov-DRC_41_LEFT Ebov-DRC_1 -BTB20484 12709 12733 Ebov-DRC_41_RIGHT Ebov-DRC_1 -BTB20484 12645 12667 Ebov-DRC_42_LEFT Ebov-DRC_2 -BTB20484 13032 13054 Ebov-DRC_42_RIGHT Ebov-DRC_2 -BTB20484 12961 12983 Ebov-DRC_43_LEFT Ebov-DRC_1 -BTB20484 13358 13380 Ebov-DRC_43_RIGHT Ebov-DRC_1 -BTB20484 13313 13336 Ebov-DRC_44_LEFT Ebov-DRC_2 -BTB20484 13681 13704 Ebov-DRC_44_RIGHT Ebov-DRC_2 -BTB20484 13639 13661 Ebov-DRC_45_LEFT Ebov-DRC_1 -BTB20484 14004 14026 Ebov-DRC_45_RIGHT Ebov-DRC_1 -BTB20484 13941 13969 Ebov-DRC_46_LEFT Ebov-DRC_2 -BTB20484 14325 14352 Ebov-DRC_46_RIGHT Ebov-DRC_2 -BTB20484 14274 14296 Ebov-DRC_47_LEFT Ebov-DRC_1 -BTB20484 14646 14671 Ebov-DRC_47_RIGHT Ebov-DRC_1 -BTB20484 14590 14612 Ebov-DRC_48_LEFT Ebov-DRC_2 -BTB20484 14951 14974 Ebov-DRC_48_RIGHT Ebov-DRC_2 -BTB20484 14913 14935 Ebov-DRC_49_LEFT Ebov-DRC_1 -BTB20484 15272 15294 Ebov-DRC_49_RIGHT Ebov-DRC_1 -BTB20484 15202 15224 Ebov-DRC_50_LEFT Ebov-DRC_2 -BTB20484 15575 15597 Ebov-DRC_50_RIGHT Ebov-DRC_2 -BTB20484 15504 15529 Ebov-DRC_51_LEFT Ebov-DRC_1 -BTB20484 15899 15922 Ebov-DRC_51_RIGHT Ebov-DRC_1 -BTB20484 15844 15866 Ebov-DRC_52_LEFT Ebov-DRC_2 -BTB20484 16207 16229 Ebov-DRC_52_RIGHT Ebov-DRC_2 -BTB20484 16171 16193 Ebov-DRC_53_LEFT Ebov-DRC_1 -BTB20484 16530 16553 Ebov-DRC_53_RIGHT Ebov-DRC_1 -BTB20484 16470 16492 Ebov-DRC_54_LEFT Ebov-DRC_2 -BTB20484 16848 16870 Ebov-DRC_54_RIGHT Ebov-DRC_2 -BTB20484 16777 16800 Ebov-DRC_55_LEFT Ebov-DRC_1 -BTB20484 17144 17166 Ebov-DRC_55_RIGHT Ebov-DRC_1 -BTB20484 17085 17107 Ebov-DRC_56_LEFT Ebov-DRC_2 -BTB20484 17447 17472 Ebov-DRC_56_RIGHT Ebov-DRC_2 -BTB20484 17381 17406 Ebov-DRC_57_LEFT Ebov-DRC_1 -BTB20484 17748 17772 
Ebov-DRC_57_RIGHT Ebov-DRC_1 -BTB20484 17649 17675 Ebov-DRC_58_LEFT Ebov-DRC_2 -BTB20484 18010 18032 Ebov-DRC_58_RIGHT Ebov-DRC_2 -BTB20484 17940 17967 Ebov-DRC_59_LEFT Ebov-DRC_1 -BTB20484 18324 18353 Ebov-DRC_59_RIGHT Ebov-DRC_1 -BTB20484 18209 18237 Ebov-DRC_60_LEFT Ebov-DRC_2 -BTB20484 18561 18589 Ebov-DRC_60_RIGHT Ebov-DRC_2 -BTB20484 18539 18565 Ebov-DRC_61_LEFT Ebov-DRC_1 -BTB20484 18898 18920 Ebov-DRC_61_RIGHT Ebov-DRC_1 +BTB20484 27 55 Ebov-DRC_1_LEFT Ebov-DRC_1 + +BTB20484 383 407 Ebov-DRC_1_RIGHT Ebov-DRC_1 - +BTB20484 350 376 Ebov-DRC_2_LEFT Ebov-DRC_2 + +BTB20484 707 730 Ebov-DRC_2_RIGHT Ebov-DRC_2 - +BTB20484 665 687 Ebov-DRC_3_LEFT Ebov-DRC_1 + +BTB20484 1033 1055 Ebov-DRC_3_RIGHT Ebov-DRC_1 - +BTB20484 994 1017 Ebov-DRC_4_LEFT Ebov-DRC_2 + +BTB20484 1354 1376 Ebov-DRC_4_RIGHT Ebov-DRC_2 - +BTB20484 1294 1316 Ebov-DRC_5_LEFT Ebov-DRC_1 + +BTB20484 1680 1702 Ebov-DRC_5_RIGHT Ebov-DRC_1 - +BTB20484 1625 1647 Ebov-DRC_6_LEFT Ebov-DRC_2 + +BTB20484 1995 2017 Ebov-DRC_6_RIGHT Ebov-DRC_2 - +BTB20484 1926 1948 Ebov-DRC_7_LEFT Ebov-DRC_1 + +BTB20484 2317 2339 Ebov-DRC_7_RIGHT Ebov-DRC_1 - +BTB20484 2238 2260 Ebov-DRC_8_LEFT Ebov-DRC_2 + +BTB20484 2630 2652 Ebov-DRC_8_RIGHT Ebov-DRC_2 - +BTB20484 2540 2565 Ebov-DRC_9_LEFT Ebov-DRC_1 + +BTB20484 2895 2920 Ebov-DRC_9_RIGHT Ebov-DRC_1 - +BTB20484 2843 2872 Ebov-DRC_10_LEFT Ebov-DRC_2 + +BTB20484 3215 3237 Ebov-DRC_10_RIGHT Ebov-DRC_2 - +BTB20484 3155 3177 Ebov-DRC_11_LEFT Ebov-DRC_1 + +BTB20484 3524 3546 Ebov-DRC_11_RIGHT Ebov-DRC_1 - +BTB20484 3460 3485 Ebov-DRC_12_LEFT Ebov-DRC_2 + +BTB20484 3821 3843 Ebov-DRC_12_RIGHT Ebov-DRC_2 - +BTB20484 3769 3792 Ebov-DRC_13_LEFT Ebov-DRC_1 + +BTB20484 4121 4149 Ebov-DRC_13_RIGHT Ebov-DRC_1 - +BTB20484 4014 4036 Ebov-DRC_14_LEFT Ebov-DRC_2 + +BTB20484 4369 4395 Ebov-DRC_14_RIGHT Ebov-DRC_2 - +BTB20484 4287 4311 Ebov-DRC_15_LEFT Ebov-DRC_1 + +BTB20484 4674 4696 Ebov-DRC_15_RIGHT Ebov-DRC_1 - +BTB20484 4606 4628 Ebov-DRC_16_LEFT Ebov-DRC_2 + +BTB20484 4990 5014 Ebov-DRC_16_RIGHT Ebov-DRC_2 - +BTB20484 4945 4967 Ebov-DRC_17_LEFT Ebov-DRC_1 + +BTB20484 5315 5338 Ebov-DRC_17_RIGHT Ebov-DRC_1 - +BTB20484 5235 5260 Ebov-DRC_18_LEFT Ebov-DRC_2 + +BTB20484 5585 5615 Ebov-DRC_18_RIGHT Ebov-DRC_2 - +BTB20484 5528 5556 Ebov-DRC_19_LEFT Ebov-DRC_1 + +BTB20484 5895 5918 Ebov-DRC_19_RIGHT Ebov-DRC_1 - +BTB20484 5827 5849 Ebov-DRC_20_LEFT Ebov-DRC_2 + +BTB20484 6211 6237 Ebov-DRC_20_RIGHT Ebov-DRC_2 - +BTB20484 6164 6188 Ebov-DRC_21_LEFT Ebov-DRC_1 + +BTB20484 6535 6558 Ebov-DRC_21_RIGHT Ebov-DRC_1 - +BTB20484 6479 6501 Ebov-DRC_22_LEFT Ebov-DRC_2 + +BTB20484 6859 6881 Ebov-DRC_22_RIGHT Ebov-DRC_2 - +BTB20484 6791 6813 Ebov-DRC_23_LEFT Ebov-DRC_1 + +BTB20484 7184 7207 Ebov-DRC_23_RIGHT Ebov-DRC_1 - +BTB20484 7140 7162 Ebov-DRC_24_LEFT Ebov-DRC_2 + +BTB20484 7510 7532 Ebov-DRC_24_RIGHT Ebov-DRC_2 - +BTB20484 7437 7459 Ebov-DRC_25_LEFT Ebov-DRC_1 + +BTB20484 7805 7827 Ebov-DRC_25_RIGHT Ebov-DRC_1 - +BTB20484 7741 7763 Ebov-DRC_26_LEFT Ebov-DRC_2 + +BTB20484 8097 8121 Ebov-DRC_26_RIGHT Ebov-DRC_2 - +BTB20484 8056 8086 Ebov-DRC_27_LEFT Ebov-DRC_1 + +BTB20484 8417 8440 Ebov-DRC_27_RIGHT Ebov-DRC_1 - +BTB20484 8364 8387 Ebov-DRC_28_LEFT Ebov-DRC_2 + +BTB20484 8737 8764 Ebov-DRC_28_RIGHT Ebov-DRC_2 - +BTB20484 8694 8716 Ebov-DRC_29_LEFT Ebov-DRC_1 + +BTB20484 9056 9081 Ebov-DRC_29_RIGHT Ebov-DRC_1 - +BTB20484 9024 9046 Ebov-DRC_30_LEFT Ebov-DRC_2 + +BTB20484 9375 9404 Ebov-DRC_30_RIGHT Ebov-DRC_2 - +BTB20484 9307 9332 Ebov-DRC_31_LEFT Ebov-DRC_1 + +BTB20484 9666 9688 Ebov-DRC_31_RIGHT Ebov-DRC_1 - +BTB20484 9602 9624 
Ebov-DRC_32_LEFT Ebov-DRC_2 + +BTB20484 9982 10006 Ebov-DRC_32_RIGHT Ebov-DRC_2 - +BTB20484 9937 9961 Ebov-DRC_33_LEFT Ebov-DRC_1 + +BTB20484 10305 10330 Ebov-DRC_33_RIGHT Ebov-DRC_1 - +BTB20484 10237 10259 Ebov-DRC_34_LEFT Ebov-DRC_2 + +BTB20484 10617 10639 Ebov-DRC_34_RIGHT Ebov-DRC_2 - +BTB20484 10539 10561 Ebov-DRC_35_LEFT Ebov-DRC_1 + +BTB20484 10923 10945 Ebov-DRC_35_RIGHT Ebov-DRC_1 - +BTB20484 10860 10887 Ebov-DRC_36_LEFT Ebov-DRC_2 + +BTB20484 11247 11270 Ebov-DRC_36_RIGHT Ebov-DRC_2 - +BTB20484 11152 11180 Ebov-DRC_37_LEFT Ebov-DRC_1 + +BTB20484 11517 11539 Ebov-DRC_37_RIGHT Ebov-DRC_1 - +BTB20484 11385 11409 Ebov-DRC_38_LEFT Ebov-DRC_2 + +BTB20484 11770 11792 Ebov-DRC_38_RIGHT Ebov-DRC_2 - +BTB20484 11700 11722 Ebov-DRC_39_LEFT Ebov-DRC_1 + +BTB20484 12073 12098 Ebov-DRC_39_RIGHT Ebov-DRC_1 - +BTB20484 12040 12063 Ebov-DRC_40_LEFT Ebov-DRC_2 + +BTB20484 12397 12420 Ebov-DRC_40_RIGHT Ebov-DRC_2 - +BTB20484 12346 12370 Ebov-DRC_41_LEFT Ebov-DRC_1 + +BTB20484 12709 12733 Ebov-DRC_41_RIGHT Ebov-DRC_1 - +BTB20484 12645 12667 Ebov-DRC_42_LEFT Ebov-DRC_2 + +BTB20484 13032 13054 Ebov-DRC_42_RIGHT Ebov-DRC_2 - +BTB20484 12961 12983 Ebov-DRC_43_LEFT Ebov-DRC_1 + +BTB20484 13358 13380 Ebov-DRC_43_RIGHT Ebov-DRC_1 - +BTB20484 13313 13336 Ebov-DRC_44_LEFT Ebov-DRC_2 + +BTB20484 13681 13704 Ebov-DRC_44_RIGHT Ebov-DRC_2 - +BTB20484 13639 13661 Ebov-DRC_45_LEFT Ebov-DRC_1 + +BTB20484 14004 14026 Ebov-DRC_45_RIGHT Ebov-DRC_1 - +BTB20484 13941 13969 Ebov-DRC_46_LEFT Ebov-DRC_2 + +BTB20484 14325 14352 Ebov-DRC_46_RIGHT Ebov-DRC_2 - +BTB20484 14274 14296 Ebov-DRC_47_LEFT Ebov-DRC_1 + +BTB20484 14646 14671 Ebov-DRC_47_RIGHT Ebov-DRC_1 - +BTB20484 14590 14612 Ebov-DRC_48_LEFT Ebov-DRC_2 + +BTB20484 14951 14974 Ebov-DRC_48_RIGHT Ebov-DRC_2 - +BTB20484 14913 14935 Ebov-DRC_49_LEFT Ebov-DRC_1 + +BTB20484 15272 15294 Ebov-DRC_49_RIGHT Ebov-DRC_1 - +BTB20484 15202 15224 Ebov-DRC_50_LEFT Ebov-DRC_2 + +BTB20484 15575 15597 Ebov-DRC_50_RIGHT Ebov-DRC_2 - +BTB20484 15504 15529 Ebov-DRC_51_LEFT Ebov-DRC_1 + +BTB20484 15899 15922 Ebov-DRC_51_RIGHT Ebov-DRC_1 - +BTB20484 15844 15866 Ebov-DRC_52_LEFT Ebov-DRC_2 + +BTB20484 16207 16229 Ebov-DRC_52_RIGHT Ebov-DRC_2 - +BTB20484 16171 16193 Ebov-DRC_53_LEFT Ebov-DRC_1 + +BTB20484 16530 16553 Ebov-DRC_53_RIGHT Ebov-DRC_1 - +BTB20484 16470 16492 Ebov-DRC_54_LEFT Ebov-DRC_2 + +BTB20484 16848 16870 Ebov-DRC_54_RIGHT Ebov-DRC_2 - +BTB20484 16777 16800 Ebov-DRC_55_LEFT Ebov-DRC_1 + +BTB20484 17144 17166 Ebov-DRC_55_RIGHT Ebov-DRC_1 - +BTB20484 17085 17107 Ebov-DRC_56_LEFT Ebov-DRC_2 + +BTB20484 17447 17472 Ebov-DRC_56_RIGHT Ebov-DRC_2 - +BTB20484 17381 17406 Ebov-DRC_57_LEFT Ebov-DRC_1 + +BTB20484 17748 17772 Ebov-DRC_57_RIGHT Ebov-DRC_1 - +BTB20484 17649 17675 Ebov-DRC_58_LEFT Ebov-DRC_2 + +BTB20484 18010 18032 Ebov-DRC_58_RIGHT Ebov-DRC_2 - +BTB20484 17940 17967 Ebov-DRC_59_LEFT Ebov-DRC_1 + +BTB20484 18324 18353 Ebov-DRC_59_RIGHT Ebov-DRC_1 - +BTB20484 18209 18237 Ebov-DRC_60_LEFT Ebov-DRC_2 + +BTB20484 18561 18589 Ebov-DRC_60_RIGHT Ebov-DRC_2 - +BTB20484 18539 18565 Ebov-DRC_61_LEFT Ebov-DRC_1 + +BTB20484 18898 18920 Ebov-DRC_61_RIGHT Ebov-DRC_1 - diff --git a/test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed b/test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed index cffbba29..4284729b 100644 --- a/test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed +++ b/test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed @@ -1,196 +1,196 @@ -MN908947.3 30 54 nCoV-2019_1_LEFT nCoV-2019_1 -MN908947.3 385 410 nCoV-2019_1_RIGHT nCoV-2019_1 -MN908947.3 320 
342 nCoV-2019_2_LEFT nCoV-2019_2 -MN908947.3 704 726 nCoV-2019_2_RIGHT nCoV-2019_2 -MN908947.3 642 664 nCoV-2019_3_LEFT nCoV-2019_1 -MN908947.3 1004 1028 nCoV-2019_3_RIGHT nCoV-2019_1 -MN908947.3 943 965 nCoV-2019_4_LEFT nCoV-2019_2 -MN908947.3 1312 1337 nCoV-2019_4_RIGHT nCoV-2019_2 -MN908947.3 1242 1264 nCoV-2019_5_LEFT nCoV-2019_1 -MN908947.3 1623 1651 nCoV-2019_5_RIGHT nCoV-2019_1 -MN908947.3 1573 1595 nCoV-2019_6_LEFT nCoV-2019_2 -MN908947.3 1942 1964 nCoV-2019_6_RIGHT nCoV-2019_2 -MN908947.3 1875 1897 nCoV-2019_7_LEFT nCoV-2019_1 -MN908947.3 2247 2269 nCoV-2019_7_RIGHT nCoV-2019_1 -MN908947.3 2181 2205 nCoV-2019_8_LEFT nCoV-2019_2 -MN908947.3 2568 2592 nCoV-2019_8_RIGHT nCoV-2019_2 -MN908947.3 2505 2529 nCoV-2019_9_LEFT nCoV-2019_1 -MN908947.3 2882 2904 nCoV-2019_9_RIGHT nCoV-2019_1 -MN908947.3 2826 2850 nCoV-2019_10_LEFT nCoV-2019_2 -MN908947.3 3183 3210 nCoV-2019_10_RIGHT nCoV-2019_2 -MN908947.3 3144 3166 nCoV-2019_11_LEFT nCoV-2019_1 -MN908947.3 3507 3531 nCoV-2019_11_RIGHT nCoV-2019_1 -MN908947.3 3460 3482 nCoV-2019_12_LEFT nCoV-2019_2 -MN908947.3 3826 3853 nCoV-2019_12_RIGHT nCoV-2019_2 -MN908947.3 3771 3795 nCoV-2019_13_LEFT nCoV-2019_1 -MN908947.3 4142 4164 nCoV-2019_13_RIGHT nCoV-2019_1 -MN908947.3 4054 4077 nCoV-2019_14_LEFT nCoV-2019_2 -MN908947.3 4428 4450 nCoV-2019_14_RIGHT nCoV-2019_2 -MN908947.3 4294 4321 nCoV-2019_15_LEFT nCoV-2019_1 -MN908947.3 4674 4696 nCoV-2019_15_RIGHT nCoV-2019_1 -MN908947.3 4636 4658 nCoV-2019_16_LEFT nCoV-2019_2 -MN908947.3 4995 5017 nCoV-2019_16_RIGHT nCoV-2019_2 -MN908947.3 4939 4966 nCoV-2019_17_LEFT nCoV-2019_1 -MN908947.3 5296 5321 nCoV-2019_17_RIGHT nCoV-2019_1 -MN908947.3 5230 5259 nCoV-2019_18_LEFT nCoV-2019_2 -MN908947.3 5620 5644 nCoV-2019_18_RIGHT nCoV-2019_2 -MN908947.3 5563 5586 nCoV-2019_19_LEFT nCoV-2019_1 -MN908947.3 5932 5957 nCoV-2019_19_RIGHT nCoV-2019_1 -MN908947.3 5867 5894 nCoV-2019_20_LEFT nCoV-2019_2 -MN908947.3 6247 6272 nCoV-2019_20_RIGHT nCoV-2019_2 -MN908947.3 6167 6196 nCoV-2019_21_LEFT nCoV-2019_1 -MN908947.3 6528 6550 nCoV-2019_21_RIGHT nCoV-2019_1 -MN908947.3 6466 6495 nCoV-2019_22_LEFT nCoV-2019_2 -MN908947.3 6846 6873 nCoV-2019_22_RIGHT nCoV-2019_2 -MN908947.3 6718 6745 nCoV-2019_23_LEFT nCoV-2019_1 -MN908947.3 7092 7117 nCoV-2019_23_RIGHT nCoV-2019_1 -MN908947.3 7035 7058 nCoV-2019_24_LEFT nCoV-2019_2 -MN908947.3 7389 7415 nCoV-2019_24_RIGHT nCoV-2019_2 -MN908947.3 7305 7332 nCoV-2019_25_LEFT nCoV-2019_1 -MN908947.3 7671 7694 nCoV-2019_25_RIGHT nCoV-2019_1 -MN908947.3 7626 7651 nCoV-2019_26_LEFT nCoV-2019_2 -MN908947.3 7997 8019 nCoV-2019_26_RIGHT nCoV-2019_2 -MN908947.3 7943 7968 nCoV-2019_27_LEFT nCoV-2019_1 -MN908947.3 8319 8341 nCoV-2019_27_RIGHT nCoV-2019_1 -MN908947.3 8249 8275 nCoV-2019_28_LEFT nCoV-2019_2 -MN908947.3 8635 8661 nCoV-2019_28_RIGHT nCoV-2019_2 -MN908947.3 8595 8619 nCoV-2019_29_LEFT nCoV-2019_1 -MN908947.3 8954 8983 nCoV-2019_29_RIGHT nCoV-2019_1 -MN908947.3 8888 8913 nCoV-2019_30_LEFT nCoV-2019_2 -MN908947.3 9245 9271 nCoV-2019_30_RIGHT nCoV-2019_2 -MN908947.3 9204 9226 nCoV-2019_31_LEFT nCoV-2019_1 -MN908947.3 9557 9585 nCoV-2019_31_RIGHT nCoV-2019_1 -MN908947.3 9477 9502 nCoV-2019_32_LEFT nCoV-2019_2 -MN908947.3 9834 9858 nCoV-2019_32_RIGHT nCoV-2019_2 -MN908947.3 9784 9806 nCoV-2019_33_LEFT nCoV-2019_1 -MN908947.3 10146 10171 nCoV-2019_33_RIGHT nCoV-2019_1 -MN908947.3 10076 10099 nCoV-2019_34_LEFT nCoV-2019_2 -MN908947.3 10437 10459 nCoV-2019_34_RIGHT nCoV-2019_2 -MN908947.3 10362 10384 nCoV-2019_35_LEFT nCoV-2019_1 -MN908947.3 10737 10763 nCoV-2019_35_RIGHT nCoV-2019_1 -MN908947.3 
10666 10688 nCoV-2019_36_LEFT nCoV-2019_2 -MN908947.3 11048 11074 nCoV-2019_36_RIGHT nCoV-2019_2 -MN908947.3 10999 11022 nCoV-2019_37_LEFT nCoV-2019_1 -MN908947.3 11372 11394 nCoV-2019_37_RIGHT nCoV-2019_1 -MN908947.3 11306 11331 nCoV-2019_38_LEFT nCoV-2019_2 -MN908947.3 11668 11693 nCoV-2019_38_RIGHT nCoV-2019_2 -MN908947.3 11555 11584 nCoV-2019_39_LEFT nCoV-2019_1 -MN908947.3 11927 11949 nCoV-2019_39_RIGHT nCoV-2019_1 -MN908947.3 11863 11889 nCoV-2019_40_LEFT nCoV-2019_2 -MN908947.3 12234 12256 nCoV-2019_40_RIGHT nCoV-2019_2 -MN908947.3 12110 12133 nCoV-2019_41_LEFT nCoV-2019_1 -MN908947.3 12465 12490 nCoV-2019_41_RIGHT nCoV-2019_1 -MN908947.3 12417 12439 nCoV-2019_42_LEFT nCoV-2019_2 -MN908947.3 12779 12802 nCoV-2019_42_RIGHT nCoV-2019_2 -MN908947.3 12710 12732 nCoV-2019_43_LEFT nCoV-2019_1 -MN908947.3 13074 13096 nCoV-2019_43_RIGHT nCoV-2019_1 -MN908947.3 13005 13027 nCoV-2019_44_LEFT nCoV-2019_2 -MN908947.3 13378 13400 nCoV-2019_44_RIGHT nCoV-2019_2 -MN908947.3 13319 13344 nCoV-2019_45_LEFT nCoV-2019_1 -MN908947.3 13669 13699 nCoV-2019_45_RIGHT nCoV-2019_1 -MN908947.3 13599 13621 nCoV-2019_46_LEFT nCoV-2019_2 -MN908947.3 13962 13984 nCoV-2019_46_RIGHT nCoV-2019_2 -MN908947.3 13918 13946 nCoV-2019_47_LEFT nCoV-2019_1 -MN908947.3 14271 14299 nCoV-2019_47_RIGHT nCoV-2019_1 -MN908947.3 14207 14232 nCoV-2019_48_LEFT nCoV-2019_2 -MN908947.3 14579 14601 nCoV-2019_48_RIGHT nCoV-2019_2 -MN908947.3 14545 14570 nCoV-2019_49_LEFT nCoV-2019_1 -MN908947.3 14898 14926 nCoV-2019_49_RIGHT nCoV-2019_1 -MN908947.3 14865 14895 nCoV-2019_50_LEFT nCoV-2019_2 -MN908947.3 15224 15246 nCoV-2019_50_RIGHT nCoV-2019_2 -MN908947.3 15171 15193 nCoV-2019_51_LEFT nCoV-2019_1 -MN908947.3 15538 15560 nCoV-2019_51_RIGHT nCoV-2019_1 -MN908947.3 15481 15503 nCoV-2019_52_LEFT nCoV-2019_2 -MN908947.3 15861 15886 nCoV-2019_52_RIGHT nCoV-2019_2 -MN908947.3 15827 15851 nCoV-2019_53_LEFT nCoV-2019_1 -MN908947.3 16186 16209 nCoV-2019_53_RIGHT nCoV-2019_1 -MN908947.3 16118 16144 nCoV-2019_54_LEFT nCoV-2019_2 -MN908947.3 16485 16510 nCoV-2019_54_RIGHT nCoV-2019_2 -MN908947.3 16416 16444 nCoV-2019_55_LEFT nCoV-2019_1 -MN908947.3 16804 16833 nCoV-2019_55_RIGHT nCoV-2019_1 -MN908947.3 16748 16770 nCoV-2019_56_LEFT nCoV-2019_2 -MN908947.3 17130 17152 nCoV-2019_56_RIGHT nCoV-2019_2 -MN908947.3 17065 17087 nCoV-2019_57_LEFT nCoV-2019_1 -MN908947.3 17430 17452 nCoV-2019_57_RIGHT nCoV-2019_1 -MN908947.3 17381 17406 nCoV-2019_58_LEFT nCoV-2019_2 -MN908947.3 17738 17761 nCoV-2019_58_RIGHT nCoV-2019_2 -MN908947.3 17674 17697 nCoV-2019_59_LEFT nCoV-2019_1 -MN908947.3 18036 18062 nCoV-2019_59_RIGHT nCoV-2019_1 -MN908947.3 17966 17993 nCoV-2019_60_LEFT nCoV-2019_2 -MN908947.3 18324 18348 nCoV-2019_60_RIGHT nCoV-2019_2 -MN908947.3 18253 18275 nCoV-2019_61_LEFT nCoV-2019_1 -MN908947.3 18650 18672 nCoV-2019_61_RIGHT nCoV-2019_1 -MN908947.3 18596 18618 nCoV-2019_62_LEFT nCoV-2019_2 -MN908947.3 18957 18979 nCoV-2019_62_RIGHT nCoV-2019_2 -MN908947.3 18896 18918 nCoV-2019_63_LEFT nCoV-2019_1 -MN908947.3 19275 19297 nCoV-2019_63_RIGHT nCoV-2019_1 -MN908947.3 19204 19232 nCoV-2019_64_LEFT nCoV-2019_2 -MN908947.3 19591 19616 nCoV-2019_64_RIGHT nCoV-2019_2 -MN908947.3 19548 19570 nCoV-2019_65_LEFT nCoV-2019_1 -MN908947.3 19911 19939 nCoV-2019_65_RIGHT nCoV-2019_1 -MN908947.3 19844 19866 nCoV-2019_66_LEFT nCoV-2019_2 -MN908947.3 20231 20255 nCoV-2019_66_RIGHT nCoV-2019_2 -MN908947.3 20172 20200 nCoV-2019_67_LEFT nCoV-2019_1 -MN908947.3 20542 20572 nCoV-2019_67_RIGHT nCoV-2019_1 -MN908947.3 20472 20496 nCoV-2019_68_LEFT nCoV-2019_2 -MN908947.3 20867 20890 
nCoV-2019_68_RIGHT nCoV-2019_2 -MN908947.3 20786 20813 nCoV-2019_69_LEFT nCoV-2019_1 -MN908947.3 21146 21169 nCoV-2019_69_RIGHT nCoV-2019_1 -MN908947.3 21075 21104 nCoV-2019_70_LEFT nCoV-2019_2 -MN908947.3 21427 21455 nCoV-2019_70_RIGHT nCoV-2019_2 -MN908947.3 21357 21386 nCoV-2019_71_LEFT nCoV-2019_1 -MN908947.3 21716 21743 nCoV-2019_71_RIGHT nCoV-2019_1 -MN908947.3 21658 21682 nCoV-2019_72_LEFT nCoV-2019_2 -MN908947.3 22013 22038 nCoV-2019_72_RIGHT nCoV-2019_2 -MN908947.3 21961 21990 nCoV-2019_73_LEFT nCoV-2019_1 -MN908947.3 22324 22346 nCoV-2019_73_RIGHT nCoV-2019_1 -MN908947.3 22262 22290 nCoV-2019_74_LEFT nCoV-2019_2 -MN908947.3 22626 22650 nCoV-2019_74_RIGHT nCoV-2019_2 -MN908947.3 22516 22542 nCoV-2019_75_LEFT nCoV-2019_1 -MN908947.3 22877 22903 nCoV-2019_75_RIGHT nCoV-2019_1 -MN908947.3 22797 22819 nCoV-2019_76_LEFT nCoV-2019_2 -MN908947.3 23192 23214 nCoV-2019_76_RIGHT nCoV-2019_2 -MN908947.3 23122 23144 nCoV-2019_77_LEFT nCoV-2019_1 -MN908947.3 23500 23522 nCoV-2019_77_RIGHT nCoV-2019_1 -MN908947.3 23443 23466 nCoV-2019_78_LEFT nCoV-2019_2 -MN908947.3 23822 23847 nCoV-2019_78_RIGHT nCoV-2019_2 -MN908947.3 23789 23812 nCoV-2019_79_LEFT nCoV-2019_1 -MN908947.3 24145 24169 nCoV-2019_79_RIGHT nCoV-2019_1 -MN908947.3 24078 24100 nCoV-2019_80_LEFT nCoV-2019_2 -MN908947.3 24443 24467 nCoV-2019_80_RIGHT nCoV-2019_2 -MN908947.3 24391 24416 nCoV-2019_81_LEFT nCoV-2019_1 -MN908947.3 24765 24789 nCoV-2019_81_RIGHT nCoV-2019_1 -MN908947.3 24696 24721 nCoV-2019_82_LEFT nCoV-2019_2 -MN908947.3 25052 25076 nCoV-2019_82_RIGHT nCoV-2019_2 -MN908947.3 24978 25003 nCoV-2019_83_LEFT nCoV-2019_1 -MN908947.3 25347 25369 nCoV-2019_83_RIGHT nCoV-2019_1 -MN908947.3 25279 25301 nCoV-2019_84_LEFT nCoV-2019_2 -MN908947.3 25646 25673 nCoV-2019_84_RIGHT nCoV-2019_2 -MN908947.3 25601 25623 nCoV-2019_85_LEFT nCoV-2019_1 -MN908947.3 25969 25994 nCoV-2019_85_RIGHT nCoV-2019_1 -MN908947.3 25902 25924 nCoV-2019_86_LEFT nCoV-2019_2 -MN908947.3 26290 26315 nCoV-2019_86_RIGHT nCoV-2019_2 -MN908947.3 26197 26219 nCoV-2019_87_LEFT nCoV-2019_1 -MN908947.3 26566 26590 nCoV-2019_87_RIGHT nCoV-2019_1 -MN908947.3 26520 26542 nCoV-2019_88_LEFT nCoV-2019_2 -MN908947.3 26890 26913 nCoV-2019_88_RIGHT nCoV-2019_2 -MN908947.3 26835 26857 nCoV-2019_89_LEFT nCoV-2019_1 -MN908947.3 27202 27227 nCoV-2019_89_RIGHT nCoV-2019_1 -MN908947.3 27141 27164 nCoV-2019_90_LEFT nCoV-2019_2 -MN908947.3 27511 27533 nCoV-2019_90_RIGHT nCoV-2019_2 -MN908947.3 27446 27471 nCoV-2019_91_LEFT nCoV-2019_1 -MN908947.3 27825 27854 nCoV-2019_91_RIGHT nCoV-2019_1 -MN908947.3 27784 27808 nCoV-2019_92_LEFT nCoV-2019_2 -MN908947.3 28145 28172 nCoV-2019_92_RIGHT nCoV-2019_2 -MN908947.3 28081 28104 nCoV-2019_93_LEFT nCoV-2019_1 -MN908947.3 28442 28464 nCoV-2019_93_RIGHT nCoV-2019_1 -MN908947.3 28394 28416 nCoV-2019_94_LEFT nCoV-2019_2 -MN908947.3 28756 28779 nCoV-2019_94_RIGHT nCoV-2019_2 -MN908947.3 28677 28699 nCoV-2019_95_LEFT nCoV-2019_1 -MN908947.3 29041 29063 nCoV-2019_95_RIGHT nCoV-2019_1 -MN908947.3 28985 29007 nCoV-2019_96_LEFT nCoV-2019_2 -MN908947.3 29356 29378 nCoV-2019_96_RIGHT nCoV-2019_2 -MN908947.3 29288 29316 nCoV-2019_97_LEFT nCoV-2019_1 -MN908947.3 29665 29693 nCoV-2019_97_RIGHT nCoV-2019_1 -MN908947.3 29486 29510 nCoV-2019_98_LEFT nCoV-2019_2 -MN908947.3 29836 29866 nCoV-2019_98_RIGHT nCoV-2019_2 +MN908947.3 30 54 nCoV-2019_1_LEFT_1 1 + ACCAACCAACTTTCGATCTCTTGT +MN908947.3 385 410 nCoV-2019_1_RIGHT_1 1 - CATCTTTAAGATGTTGACGTGCCTC +MN908947.3 320 342 nCoV-2019_2_LEFT_1 2 + CTGTTTTACAGGTTCGCGACGT +MN908947.3 704 726 nCoV-2019_2_RIGHT_1 2 - 
TAAGGATCAGTGCCAAGCTCGT +MN908947.3 642 664 nCoV-2019_3_LEFT_1 1 + CGGTAATAAAGGAGCTGGTGGC +MN908947.3 1004 1028 nCoV-2019_3_RIGHT_1 1 - AAGGTGTCTGCAATTCATAGCTCT +MN908947.3 943 965 nCoV-2019_4_LEFT_1 2 + GGTGTATACTGCTGCCGTGAAC +MN908947.3 1312 1337 nCoV-2019_4_RIGHT_1 2 - CACAAGTAGTGGCACCTTCTTTAGT +MN908947.3 1242 1264 nCoV-2019_5_LEFT_1 1 + TGGTGAAACTTCATGGCAGACG +MN908947.3 1623 1651 nCoV-2019_5_RIGHT_1 1 - ATTGATGTTGACTTTCTCTTTTTGGAGT +MN908947.3 1573 1595 nCoV-2019_6_LEFT_1 2 + GGTGTTGTTGGAGAAGGTTCCG +MN908947.3 1942 1964 nCoV-2019_6_RIGHT_1 2 - TAGCGGCCTTCTGTAAAACACG +MN908947.3 1875 1897 nCoV-2019_7_LEFT_1 1 + ATCAGAGGCTGCTCGTGTTGTA +MN908947.3 2247 2269 nCoV-2019_7_RIGHT_1 1 - TGCACAGGTGACAATTTGTCCA +MN908947.3 2181 2205 nCoV-2019_8_LEFT_1 2 + AGAGTTTCTTAGAGACGGTTGGGA +MN908947.3 2568 2592 nCoV-2019_8_RIGHT_1 2 - GCTTCAACAGCTTCACTAGTAGGT +MN908947.3 2505 2529 nCoV-2019_9_LEFT_1 1 + TCCCACAGAAGTGTTAACAGAGGA +MN908947.3 2882 2904 nCoV-2019_9_RIGHT_1 1 - ATGACAGCATCTGCCACAACAC +MN908947.3 2826 2850 nCoV-2019_10_LEFT_1 2 + TGAGAAGTGCTCTGCCTATACAGT +MN908947.3 3183 3210 nCoV-2019_10_RIGHT_1 2 - TCATCTAACCAATCTTCTTCTTGCTCT +MN908947.3 3144 3166 nCoV-2019_11_LEFT_1 1 + GGAATTTGGTGCCACTTCTGCT +MN908947.3 3507 3531 nCoV-2019_11_RIGHT_1 1 - TCATCAGATTCAACTTGCATGGCA +MN908947.3 3460 3482 nCoV-2019_12_LEFT_1 2 + AAACATGGAGGAGGTGTTGCAG +MN908947.3 3826 3853 nCoV-2019_12_RIGHT_1 2 - TTCACTCTTCATTTCCAAAAAGCTTGA +MN908947.3 3771 3795 nCoV-2019_13_LEFT_1 1 + TCGCACAAATGTCTACTTAGCTGT +MN908947.3 4142 4164 nCoV-2019_13_RIGHT_1 1 - ACCACAGCAGTTAAAACACCCT +MN908947.3 4054 4077 nCoV-2019_14_LEFT_1 2 + CATCCAGATTCTGCCACTCTTGT +MN908947.3 4428 4450 nCoV-2019_14_RIGHT_1 2 - AGTTTCCACACAGACAGGCATT +MN908947.3 4294 4321 nCoV-2019_15_LEFT_1 1 + ACAGTGCTTAAAAAGTGTAAAAGTGCC +MN908947.3 4674 4696 nCoV-2019_15_RIGHT_1 1 - AACAGAAACTGTAGCTGGCACT +MN908947.3 4636 4658 nCoV-2019_16_LEFT_1 2 + AATTTGGAAGAAGCTGCTCGGT +MN908947.3 4995 5017 nCoV-2019_16_RIGHT_1 2 - CACAACTTGCGTGTGGAGGTTA +MN908947.3 4939 4966 nCoV-2019_17_LEFT_1 1 + CTTCTTTCTTTGAGAGAAGTGAGGACT +MN908947.3 5296 5321 nCoV-2019_17_RIGHT_1 1 - TTTGTTGGAGTGTTAACAATGCAGT +MN908947.3 5230 5259 nCoV-2019_18_LEFT_1 2 + TGGAAATACCCACAAGTTAATGGTTTAAC +MN908947.3 5620 5644 nCoV-2019_18_RIGHT_1 2 - AGCTTGTTTACCACACGTACAAGG +MN908947.3 5563 5586 nCoV-2019_19_LEFT_1 1 + GCTGTTATGTACATGGGCACACT +MN908947.3 5932 5957 nCoV-2019_19_RIGHT_1 1 - TGTCCAACTTAGGGTCAATTTCTGT +MN908947.3 5867 5894 nCoV-2019_20_LEFT_1 2 + ACAAAGAAAACAGTTACACAACAACCA +MN908947.3 6247 6272 nCoV-2019_20_RIGHT_1 2 - ACGTGGCTTTATTAGTTGCATTGTT +MN908947.3 6167 6196 nCoV-2019_21_LEFT_1 1 + TGGCTATTGATTATAAACACTACACACCC +MN908947.3 6528 6550 nCoV-2019_21_RIGHT_1 1 - TAGATCTGTGTGGCCAACCTCT +MN908947.3 6466 6495 nCoV-2019_22_LEFT_1 2 + ACTACCGAAGTTGTAGGAGACATTATACT +MN908947.3 6846 6873 nCoV-2019_22_RIGHT_1 2 - ACAGTATTCTTTGCTATAGTAGTCGGC +MN908947.3 6718 6745 nCoV-2019_23_LEFT_1 1 + ACAACTACTAACATAGTTACACGGTGT +MN908947.3 7092 7117 nCoV-2019_23_RIGHT_1 1 - ACCAGTACAGTAGGTTGCAATAGTG +MN908947.3 7035 7058 nCoV-2019_24_LEFT_1 2 + AGGCATGCCTTCTTACTGTACTG +MN908947.3 7389 7415 nCoV-2019_24_RIGHT_1 2 - ACATTCTAACCATAGCTGAAATCGGG +MN908947.3 7305 7332 nCoV-2019_25_LEFT_1 1 + GCAATTGTTTTTCAGCTATTTTGCAGT +MN908947.3 7671 7694 nCoV-2019_25_RIGHT_1 1 - ACTGTAGTGACAAGTCTCTCGCA +MN908947.3 7626 7651 nCoV-2019_26_LEFT_1 2 + TTGTGATACATTCTGTGCTGGTAGT +MN908947.3 7997 8019 nCoV-2019_26_RIGHT_1 2 - TCCGCACTATCACCAACATCAG +MN908947.3 7943 7968 nCoV-2019_27_LEFT_1 1 + ACTACAGTCAGCTTATGTGTCAACC +MN908947.3 8319 8341 
nCoV-2019_27_RIGHT_1 1 - AATACAAGCACCAAGGTCACGG +MN908947.3 8249 8275 nCoV-2019_28_LEFT_1 2 + ACATAGAAGTTACTGGCGATAGTTGT +MN908947.3 8635 8661 nCoV-2019_28_RIGHT_1 2 - TGTTTAGACATGACATGAACAGGTGT +MN908947.3 8595 8619 nCoV-2019_29_LEFT_1 1 + ACTTGTGTTCCTTTTTGTTGCTGC +MN908947.3 8954 8983 nCoV-2019_29_RIGHT_1 1 - AGTGTACTCTATAAGTTTTGATGGTGTGT +MN908947.3 8888 8913 nCoV-2019_30_LEFT_1 2 + GCACAACTAATGGTGACTTTTTGCA +MN908947.3 9245 9271 nCoV-2019_30_RIGHT_1 2 - ACCACTAGTAGATACACAAACACCAG +MN908947.3 9204 9226 nCoV-2019_31_LEFT_1 1 + TTCTGAGTACTGTAGGCACGGC +MN908947.3 9557 9585 nCoV-2019_31_RIGHT_1 1 - ACAGAATAAACACCAGGTAAGAATGAGT +MN908947.3 9477 9502 nCoV-2019_32_LEFT_1 2 + TGGTGAATACAGTCATGTAGTTGCC +MN908947.3 9834 9858 nCoV-2019_32_RIGHT_1 2 - AGCACATCACTACGCAACTTTAGA +MN908947.3 9784 9806 nCoV-2019_33_LEFT_1 1 + ACTTTTGAAGAAGCTGCGCTGT +MN908947.3 10146 10171 nCoV-2019_33_RIGHT_1 1 - TGGACAGTAAACTACGTCATCAAGC +MN908947.3 10076 10099 nCoV-2019_34_LEFT_1 2 + TCCCATCTGGTAAAGTTGAGGGT +MN908947.3 10437 10459 nCoV-2019_34_RIGHT_1 2 - AGTGAAATTGGGCCTCATAGCA +MN908947.3 10362 10384 nCoV-2019_35_LEFT_1 1 + TGTTCGCATTCAACCAGGACAG +MN908947.3 10737 10763 nCoV-2019_35_RIGHT_1 1 - ACTTCATAGCCACAAGGTTAAAGTCA +MN908947.3 10666 10688 nCoV-2019_36_LEFT_1 2 + TTAGCTTGGTTGTACGCTGCTG +MN908947.3 11048 11074 nCoV-2019_36_RIGHT_1 2 - GAACAAAGACCATTGAGTACTCTGGA +MN908947.3 10999 11022 nCoV-2019_37_LEFT_1 1 + ACACACCACTGGTTGTTACTCAC +MN908947.3 11372 11394 nCoV-2019_37_RIGHT_1 1 - GTCCACACTCTCCTAGCACCAT +MN908947.3 11306 11331 nCoV-2019_38_LEFT_1 2 + ACTGTGTTATGTATGCATCAGCTGT +MN908947.3 11668 11693 nCoV-2019_38_RIGHT_1 2 - CACCAAGAGTCAGTCTAAAGTAGCG +MN908947.3 11555 11584 nCoV-2019_39_LEFT_1 1 + AGTATTGCCCTATTTTCTTCATAACTGGT +MN908947.3 11927 11949 nCoV-2019_39_RIGHT_1 1 - TGTAACTGGACACATTGAGCCC +MN908947.3 11863 11889 nCoV-2019_40_LEFT_1 2 + TGCACATCAGTAGTCTTACTCTCAGT +MN908947.3 12234 12256 nCoV-2019_40_RIGHT_1 2 - CATGGCTGCATCACGGTCAAAT +MN908947.3 12110 12133 nCoV-2019_41_LEFT_1 1 + GTTCCCTTCCATCATATGCAGCT +MN908947.3 12465 12490 nCoV-2019_41_RIGHT_1 1 - TGGTATGACAACCATTAGTTTGGCT +MN908947.3 12417 12439 nCoV-2019_42_LEFT_1 2 + TGCAAGAGATGGTTGTGTTCCC +MN908947.3 12779 12802 nCoV-2019_42_RIGHT_1 2 - CCTACCTCCCTTTGTTGTGTTGT +MN908947.3 12710 12732 nCoV-2019_43_LEFT_1 1 + TACGACAGATGTCTTGTGCTGC +MN908947.3 13074 13096 nCoV-2019_43_RIGHT_1 1 - AGCAGCATCTACAGCAAAAGCA +MN908947.3 13005 13027 nCoV-2019_44_LEFT_1 2 + TGCCACAGTACGTCTACAAGCT +MN908947.3 13378 13400 nCoV-2019_44_RIGHT_1 2 - AACCTTTCCACATACCGCAGAC +MN908947.3 13319 13344 nCoV-2019_45_LEFT_1 1 + TACCTACAACTTGTGCTAATGACCC +MN908947.3 13669 13699 nCoV-2019_45_RIGHT_1 1 - AAATTGTTTCTTCATGTTGGTAGTTAGAGA +MN908947.3 13599 13621 nCoV-2019_46_LEFT_1 2 + TGTCGCTTCCAAGAAAAGGACG +MN908947.3 13962 13984 nCoV-2019_46_RIGHT_1 2 - CACGTTCACCTAAGTTGGCGTA +MN908947.3 13918 13946 nCoV-2019_47_LEFT_1 1 + AGGACTGGTATGATTTTGTAGAAAACCC +MN908947.3 14271 14299 nCoV-2019_47_RIGHT_1 1 - AATAACGGTCAAAGAGTTTTAACCTCTC +MN908947.3 14207 14232 nCoV-2019_48_LEFT_1 2 + TGTTGACACTGACTTAACAAAGCCT +MN908947.3 14579 14601 nCoV-2019_48_RIGHT_1 2 - TAGATTACCAGAAGCAGCGTGC +MN908947.3 14545 14570 nCoV-2019_49_LEFT_1 1 + AGGAATTACTTGTGTATGCTGCTGA +MN908947.3 14898 14926 nCoV-2019_49_RIGHT_1 1 - TGACGATGACTTGGTTAGCATTAATACA +MN908947.3 14865 14895 nCoV-2019_50_LEFT_1 2 + GTTGATAAGTACTTTGATTGTTACGATGGT +MN908947.3 15224 15246 nCoV-2019_50_RIGHT_1 2 - TAACATGTTGTGCCAACCACCA +MN908947.3 15171 15193 nCoV-2019_51_LEFT_1 1 + TCAATAGCCGCCACTAGAGGAG +MN908947.3 15538 15560 nCoV-2019_51_RIGHT_1 1 - 
AGTGCATTAACATTGGCCGTGA +MN908947.3 15481 15503 nCoV-2019_52_LEFT_1 2 + CATCAGGAGATGCCACAACTGC +MN908947.3 15861 15886 nCoV-2019_52_RIGHT_1 2 - GTTGAGAGCAAAATTCATGAGGTCC +MN908947.3 15827 15851 nCoV-2019_53_LEFT_1 1 + AGCAAAATGTTGGACTGAGACTGA +MN908947.3 16186 16209 nCoV-2019_53_RIGHT_1 1 - AGCCTCATAAAACTCAGGTTCCC +MN908947.3 16118 16144 nCoV-2019_54_LEFT_1 2 + TGAGTTAACAGGACACATGTTAGACA +MN908947.3 16485 16510 nCoV-2019_54_RIGHT_1 2 - AACCAAAAACTTGTCCATTAGCACA +MN908947.3 16416 16444 nCoV-2019_55_LEFT_1 1 + ACTCAACTTTACTTAGGAGGTATGAGCT +MN908947.3 16804 16833 nCoV-2019_55_RIGHT_1 1 - GGTGTACTCTCCTATTTGTACTTTACTGT +MN908947.3 16748 16770 nCoV-2019_56_LEFT_1 2 + ACCTAGACCACCACTTAACCGA +MN908947.3 17130 17152 nCoV-2019_56_RIGHT_1 2 - ACACTATGCGAGCAGAAGGGTA +MN908947.3 17065 17087 nCoV-2019_57_LEFT_1 1 + ATTCTACACTCCAGGGACCACC +MN908947.3 17430 17452 nCoV-2019_57_RIGHT_1 1 - GTAATTGAGCAGGGTCGCCAAT +MN908947.3 17381 17406 nCoV-2019_58_LEFT_1 2 + TGATTTGAGTGTTGTCAATGCCAGA +MN908947.3 17738 17761 nCoV-2019_58_RIGHT_1 2 - CTTTTCTCCAAGCAGGGTTACGT +MN908947.3 17674 17697 nCoV-2019_59_LEFT_1 1 + TCACGCATGATGTTTCATCTGCA +MN908947.3 18036 18062 nCoV-2019_59_RIGHT_1 1 - AAGAGTCCTGTTACATTTTCAGCTTG +MN908947.3 17966 17993 nCoV-2019_60_LEFT_1 2 + TGATAGAGACCTTTATGACAAGTTGCA +MN908947.3 18324 18348 nCoV-2019_60_RIGHT_1 2 - GGTACCAACAGCTTCTCTAGTAGC +MN908947.3 18253 18275 nCoV-2019_61_LEFT_1 1 + TGTTTATCACCCGCGAAGAAGC +MN908947.3 18650 18672 nCoV-2019_61_RIGHT_1 1 - ATCACATAGACAACAGGTGCGC +MN908947.3 18596 18618 nCoV-2019_62_LEFT_1 2 + GGCACATGGCTTTGAGTTGACA +MN908947.3 18957 18979 nCoV-2019_62_RIGHT_1 2 - GTTGAACCTTTCTACAAGCCGC +MN908947.3 18896 18918 nCoV-2019_63_LEFT_1 1 + TGTTAAGCGTGTTGACTGGACT +MN908947.3 19275 19297 nCoV-2019_63_RIGHT_1 1 - ACAAACTGCCACCATCACAACC +MN908947.3 19204 19232 nCoV-2019_64_LEFT_1 2 + TCGATAGATATCCTGCTAATTCCATTGT +MN908947.3 19591 19616 nCoV-2019_64_RIGHT_1 2 - AGTCTTGTAAAAGTGTTCCAGAGGT +MN908947.3 19548 19570 nCoV-2019_65_LEFT_1 1 + GCTGGCTTTAGCTTGTGGGTTT +MN908947.3 19911 19939 nCoV-2019_65_RIGHT_1 1 - TGTCAGTCATAGAACAAACACCAATAGT +MN908947.3 19844 19866 nCoV-2019_66_LEFT_1 2 + GGGTGTGGACATTGCTGCTAAT +MN908947.3 20231 20255 nCoV-2019_66_RIGHT_1 2 - TCAATTTCCATTTGACTCCTGGGT +MN908947.3 20172 20200 nCoV-2019_67_LEFT_1 1 + GTTGTCCAACAATTACCTGAAACTTACT +MN908947.3 20542 20572 nCoV-2019_67_RIGHT_1 1 - CAACCTTAGAAACTACAGATAAATCTTGGG +MN908947.3 20472 20496 nCoV-2019_68_LEFT_1 2 + ACAGGTTCATCTAAGTGTGTGTGT +MN908947.3 20867 20890 nCoV-2019_68_RIGHT_1 2 - CTCCTTTATCAGAACCAGCACCA +MN908947.3 20786 20813 nCoV-2019_69_LEFT_1 1 + TGTCGCAAAATATACTCAACTGTGTCA +MN908947.3 21146 21169 nCoV-2019_69_RIGHT_1 1 - TCTTTATAGCCACGGAACCTCCA +MN908947.3 21075 21104 nCoV-2019_70_LEFT_1 2 + ACAAAAGAAAATGACTCTAAAGAGGGTTT +MN908947.3 21427 21455 nCoV-2019_70_RIGHT_1 2 - TGACCTTCTTTTAAAGACATAACAGCAG +MN908947.3 21357 21386 nCoV-2019_71_LEFT_1 1 + ACAAATCCAATTCAGTTGTCTTCCTATTC +MN908947.3 21716 21743 nCoV-2019_71_RIGHT_1 1 - TGGAAAAGAAAGGTAAGAACAAGTCCT +MN908947.3 21658 21682 nCoV-2019_72_LEFT_1 2 + ACACGTGGTGTTTATTACCCTGAC +MN908947.3 22013 22038 nCoV-2019_72_RIGHT_1 2 - ACTCTGAACTCACTTTCCATCCAAC +MN908947.3 21961 21990 nCoV-2019_73_LEFT_1 1 + CAATTTTGTAATGATCCATTTTTGGGTGT +MN908947.3 22324 22346 nCoV-2019_73_RIGHT_1 1 - CACCAGCTGTCCAACCTGAAGA +MN908947.3 22262 22290 nCoV-2019_74_LEFT_1 2 + ACATCACTAGGTTTCAAACTTTACTTGC +MN908947.3 22626 22650 nCoV-2019_74_RIGHT_1 2 - GCAACACAGTTGCTGATTCTCTTC +MN908947.3 22516 22542 nCoV-2019_75_LEFT_1 1 + AGAGTCCAACCAACAGAATCTATTGT +MN908947.3 22877 22903 
nCoV-2019_75_RIGHT_1 1 - ACCACCAACCTTAGAATCAAGATTGT +MN908947.3 22797 22819 nCoV-2019_76_LEFT_1 2 + AGGGCAAACTGGAAAGATTGCT +MN908947.3 23192 23214 nCoV-2019_76_RIGHT_1 2 - ACACCTGTGCCTGTTAAACCAT +MN908947.3 23122 23144 nCoV-2019_77_LEFT_1 1 + CCAGCAACTGTTTGTGGACCTA +MN908947.3 23500 23522 nCoV-2019_77_RIGHT_1 1 - CAGCCCCTATTAAACAGCCTGC +MN908947.3 23443 23466 nCoV-2019_78_LEFT_1 2 + CAACTTACTCCTACTTGGCGTGT +MN908947.3 23822 23847 nCoV-2019_78_RIGHT_1 2 - TGTGTACAAAAACTGCCATATTGCA +MN908947.3 23789 23812 nCoV-2019_79_LEFT_1 1 + GTGGTGATTCAACTGAATGCAGC +MN908947.3 24145 24169 nCoV-2019_79_RIGHT_1 1 - CATTTCATCTGTGAGCAAAGGTGG +MN908947.3 24078 24100 nCoV-2019_80_LEFT_1 2 + TTGCCTTGGTGATATTGCTGCT +MN908947.3 24443 24467 nCoV-2019_80_RIGHT_1 2 - TGGAGCTAAGTTGTTTAACAAGCG +MN908947.3 24391 24416 nCoV-2019_81_LEFT_1 1 + GCACTTGGAAAACTTCAAGATGTGG +MN908947.3 24765 24789 nCoV-2019_81_RIGHT_1 1 - GTGAAGTTCTTTTCTTGTGCAGGG +MN908947.3 24696 24721 nCoV-2019_82_LEFT_1 2 + GGGCTATCATCTTATGTCCTTCCCT +MN908947.3 25052 25076 nCoV-2019_82_RIGHT_1 2 - TGCCAGAGATGTCACCTAAATCAA +MN908947.3 24978 25003 nCoV-2019_83_LEFT_1 1 + TCCTTTGCAACCTGAATTAGACTCA +MN908947.3 25347 25369 nCoV-2019_83_RIGHT_1 1 - TTTGACTCCTTTGAGCACTGGC +MN908947.3 25279 25301 nCoV-2019_84_LEFT_1 2 + TGCTGTAGTTGTCTCAAGGGCT +MN908947.3 25646 25673 nCoV-2019_84_RIGHT_1 2 - AGGTGTGAGTAAACTGTTACAAACAAC +MN908947.3 25601 25623 nCoV-2019_85_LEFT_1 1 + ACTAGCACTCTCCAAGGGTGTT +MN908947.3 25969 25994 nCoV-2019_85_RIGHT_1 1 - ACACAGTCTTTTACTCCAGATTCCC +MN908947.3 25902 25924 nCoV-2019_86_LEFT_1 2 + TCAGGTGATGGCACAACAAGTC +MN908947.3 26290 26315 nCoV-2019_86_RIGHT_1 2 - ACGAAAGCAAGAAAAAGAAGTACGC +MN908947.3 26197 26219 nCoV-2019_87_LEFT_1 1 + CGACTACTAGCGTGCCTTTGTA +MN908947.3 26566 26590 nCoV-2019_87_RIGHT_1 1 - ACTAGGTTCCATTGTTCAAGGAGC +MN908947.3 26520 26542 nCoV-2019_88_LEFT_1 2 + CCATGGCAGATTCCAACGGTAC +MN908947.3 26890 26913 nCoV-2019_88_RIGHT_1 2 - TGGTCAGAATAGTGCCATGGAGT +MN908947.3 26835 26857 nCoV-2019_89_LEFT_1 1 + GTACGCGTTCCATGTGGTCATT +MN908947.3 27202 27227 nCoV-2019_89_RIGHT_1 1 - ACCTGAAAGTCAACGAGATGAAACA +MN908947.3 27141 27164 nCoV-2019_90_LEFT_1 2 + ACACAGACCATTCCAGTAGCAGT +MN908947.3 27511 27533 nCoV-2019_90_RIGHT_1 2 - TGAAATGGTGAATTGCCCTCGT +MN908947.3 27446 27471 nCoV-2019_91_LEFT_1 1 + TCACTACCAAGAGTGTGTTAGAGGT +MN908947.3 27825 27854 nCoV-2019_91_RIGHT_1 1 - TTCAAGTGAGAACCAAAAGATAATAAGCA +MN908947.3 27784 27808 nCoV-2019_92_LEFT_1 2 + TTTGTGCTTTTTAGCCTTTCTGCT +MN908947.3 28145 28172 nCoV-2019_92_RIGHT_1 2 - AGGTTCCTGGCAATTAATTGTAAAAGG +MN908947.3 28081 28104 nCoV-2019_93_LEFT_1 1 + TGAGGCTGGTTCTAAATCACCCA +MN908947.3 28442 28464 nCoV-2019_93_RIGHT_1 1 - AGGTCTTCCTTGCCATGTTGAG +MN908947.3 28394 28416 nCoV-2019_94_LEFT_1 2 + GGCCCCAAGGTTTACCCAATAA +MN908947.3 28756 28779 nCoV-2019_94_RIGHT_1 2 - TTTGGCAATGTTGTTCCTTGAGG +MN908947.3 28677 28699 nCoV-2019_95_LEFT_1 1 + TGAGGGAGCCTTGAATACACCA +MN908947.3 29041 29063 nCoV-2019_95_RIGHT_1 1 - CAGTACGTTTTTGCCGAGGCTT +MN908947.3 28985 29007 nCoV-2019_96_LEFT_1 2 + GCCAACAACAACAAGGCCAAAC +MN908947.3 29356 29378 nCoV-2019_96_RIGHT_1 2 - TAGGCTCTGTTGGTGGGAATGT +MN908947.3 29288 29316 nCoV-2019_97_LEFT_1 1 + TGGATGACAAAGATCCAAATTTCAAAGA +MN908947.3 29665 29693 nCoV-2019_97_RIGHT_1 1 - ACACACTGATTAAAGATTGCTATGTGAG +MN908947.3 29486 29510 nCoV-2019_98_LEFT_1 2 + AACAATTGCAACAATCCATGAGCA +MN908947.3 29836 29866 nCoV-2019_98_RIGHT_1 2 - TTCTCCTAAGAAGCTATTAAAATCACATGG \ No newline at end of file diff --git a/test-runner.sh b/test-runner.sh index 802c9ba0..044083b7 100755 --- a/test-runner.sh +++ 
b/test-runner.sh @@ -7,7 +7,7 @@ set -e # full data available: http://artic.s3.climb.ac.uk/run-folders/EBOV_Amplicons_flongle.tar.gz # # usage: -# ./test-runner.sh [medaka|nanopolish] +# ./test-runner.sh [medaka|clair3] # # specify either medaka or nanopolish to run the respective workflow of the pipeline # @@ -15,65 +15,44 @@ set -e # Setup the data, commands and the testing function. # data -inputData="./20190830_1509_MN22126_AAQ411_9efc5448" +inputData="./20190830_1509_MN22126_AAQ411_9efc5448_barcoded" primerSchemes="../test-data/primer-schemes" primerScheme="IturiEBOV/V1" prefix="ebov-mayinga" +bed="../test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.scheme.bed" +ref="../test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.reference.fasta" barcode="03" threads=2 -downloadCmd="wget http://artic.s3.climb.ac.uk/run-folders/EBOV_Amplicons_flongle.tar.gz" -extractCmd="tar -vxzf EBOV_Amplicons_flongle.tar.gz" +downloadCmd="wget https://loman-labz-public-datasets.s3.climb.ac.uk/EBOV_Amplicons_flongle_barcoded.tar.gz" +extractCmd="tar -vxzf EBOV_Amplicons_flongle_barcoded.tar.gz" -# pipeline commands -## nanopolish workflow specific -gatherCmd_n="artic gather \ - --min-length 400 \ - --max-length 800 \ - --prefix ${prefix} \ - --directory ${inputData} \ - --fast5-directory ${inputData}/fast5_pass" - -demuxCmd_n="artic demultiplex \ - --threads ${threads} \ - ${prefix}_fastq_pass.fastq" - -minionCmd_n="artic minion \ - --normalise 200 \ - --threads ${threads} \ - --scheme-directory ${primerSchemes} \ - --read-file ${prefix}_fastq_pass-NB${barcode}.fastq \ - --fast5-directory ${inputData}/fast5_pass \ - --sequencing-summary ${inputData}/lab-on-an-ssd_20190830_160932_AAQ411_minion_sequencing_run_EBOV_Amplicons_flongle_sequencing_summary.txt \ - ${primerScheme} \ - ${prefix}" - -## medaka workflow specific -gatherCmd_m="artic gather \ - --min-length 400 \ - --max-length 800 \ - --prefix ${prefix} \ - --directory ${inputData} \ - --no-fast5s" -demuxCmd_m="artic demultiplex \ - --threads ${threads} \ - ${prefix}_fastq_pass.fastq" - -guppyplexCmd_m="artic guppyplex \ +guppyplexCmd="artic guppyplex \ --min-length 400 \ --max-length 800 \ --prefix ${prefix} \ - --directory ./ \ + --directory ./${inputData}/pass/barcode${barcode} \ --output ${prefix}_guppyplex_fastq_pass-NB${barcode}.fastq" +## medaka workflow specific minionCmd_m="artic minion \ --normalise 200 \ --threads ${threads} \ - --scheme-directory ${primerSchemes} \ --read-file ${prefix}_guppyplex_fastq_pass-NB${barcode}.fastq \ - --medaka \ - --medaka-model r941_min_high_g351 \ - ${primerScheme} \ + --model r941_e81_hac_g514 \ + --bed ${bed} \ + --ref ${ref} \ + ${prefix}" + +# clair3 workflow specific +minionCmd_c="artic minion \ + --normalise 200 \ + --threads ${threads} \ + --read-file ${prefix}_guppyplex_fastq_pass-NB${barcode}.fastq \ + --clair3 \ + --model r941_prom_hac_g360+g422 \ + --bed ${bed} \ + --ref ${ref} \ ${prefix}" # colours @@ -104,19 +83,20 @@ function cmdTester { ########################################################################################### # Run the tests. 
-# check that nanopolish or medaka is specified -if [ "$1" == "nanopolish" ] || [ "$1" == "medaka" ]; then +# check that clair3 or medaka is specified +if [ "$1" == "clair3" ] || [ "$1" == "medaka" ]; then echo -e "${BLUE}Starting tests...${NC}" echo -e "${BLUE} - using the $1 workflow${NC}" echo else - echo "please specify medaka or nanopolish" - echo "./test-runner.sh [medaka|nanopolish]" + echo "please specify medaka or clair3" + echo "./test-runner.sh [medaka|clair3]" exit 1 fi # setup a tmp directory to work in -mkdir tmp && cd tmp || exit +mkdir tmp || true +cd tmp || exit # download the data echo "downloading the test data..." @@ -125,30 +105,19 @@ cmdTester $extractCmd # run the correct workflow echo "running the pipeline..." -if [ "$1" == "nanopolish" ] +if [ "$1" == "medaka" ] then - - # collect the reads - cmdTester $gatherCmd_n - - # demultiplex - cmdTester $demuxCmd_n - - # run the core pipeline with nanopolish - cmdTester $minionCmd_n -else - # collect the reads - cmdTester $gatherCmd_m - - # demultiplex - cmdTester $demuxCmd_m - - # guppyplex - cmdTester $guppyplexCmd_m + cmdTester $guppyplexCmd # run the core pipeline with medaka cmdTester $minionCmd_m +else + # guppyplex the reads + cmdTester $guppyplexCmd + + # run the core pipeline with clair3 + cmdTester $minionCmd_c fi ########################################################################################### diff --git a/artic/align_trim_unit_test.py b/tests/align_trim_unit_test.py similarity index 60% rename from artic/align_trim_unit_test.py rename to tests/align_trim_unit_test.py index a22c0460..ad298ead 100644 --- a/artic/align_trim_unit_test.py +++ b/tests/align_trim_unit_test.py @@ -3,45 +3,26 @@ import pysam import pytest -from . import align_trim -from . import vcftagprimersites +from artic import align_trim +from artic import utils # help pytest resolve where test data is kept TEST_DIR = os.path.dirname(os.path.abspath(__file__)) # dummy primers (using min required fields) -p1 = { - "start": 0, - "end": 10, - "direction": "+", - "primerID": "primer1_LEFT" -} -p2 = { - "start": 30, - "end": 40, - "direction": "-", - "primerID": "primer1_RIGHT" -} -p3 = { - "start": 10, - "end": 20, - "direction": "+", - "primerID": "primer2_LEFT" -} -p4 = { - "start": 40, - "end": 50, - "direction": "-", - "primerID": "primer2_RIGHT" -} +p1 = {"start": 0, "end": 10, "direction": "+", "primerID": "primer1_LEFT"} +p2 = {"start": 30, "end": 40, "direction": "-", "primerID": "primer1_RIGHT"} +p3 = {"start": 10, "end": 20, "direction": "+", "primerID": "primer2_LEFT"} +p4 = {"start": 40, "end": 50, "direction": "-", "primerID": "primer2_RIGHT"} # primer scheme to hold dummy primers dummyPrimerScheme = [p1, p2, p3, p4] # actual the primer scheme for nCov -primerScheme = vcftagprimersites.read_bed_file( - TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed") +primerScheme = utils.read_bed_file( + TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed" +) # nCov alignment segment (derived from a real nCov read) seg1 = pysam.AlignedSegment() @@ -61,7 +42,9 @@ seg2.reference_id = 0 seg2.reference_start = 4294 seg2.mapping_quality = 60 -seg2.cigarstring = "41S9M1D17M1D69M5D1M1D40M2I12M1D41M1D117M1I3M1D4M2D11M2I2M1D18M1D18M1I25M56S" +seg2.cigarstring = ( + "41S9M1D17M1D69M5D1M1D40M2I12M1D41M1D117M1I3M1D4M2D11M2I2M1D18M1D18M1I25M56S" +) seg2.query_sequence = 
"CCAGGTTAACACAAAGACACCGACAACTTTCTTCAGCACCTACAGTGCTTAAAAGTGTAAAAGTGCCTTTACATTCTACCATCTATTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCAGCTTGCACATGCAGAAGAAACACGCAAATTAATGCCTGTCTGTGTGTGGAAACTAAGCCATAGTTTCAACTATACAGCGTAAATATAAGGGTATTAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAACAACTGTAGCGTCACTTATCAACACGCTTAACGATCTAAATGAAACTCTTGTTACAATGCACACTGGCTGTAACACATGAAACTAAATTTGGAAGAAGCTGTCGGTATATGAGATCTCTCCAAAGTGCCAGCTACGGTTTCTGTTAGGTGCTGAAAAGAAAGTTGTCGGTGTCTTTGTGTGAACCTTAGCAATACGTAACC" seg2.query_qualities = [30] * 490 @@ -71,45 +54,59 @@ def test_find_primer(): - """test for the find primer function - """ + """test for the find primer function""" + print(dummyPrimerScheme) # test the primer finder on the primers themselves for primer in dummyPrimerScheme: if primer["direction"] == "+": result = align_trim.find_primer( - dummyPrimerScheme, primer["start"], primer["direction"]) + dummyPrimerScheme, primer["start"], primer["direction"] + ) else: result = align_trim.find_primer( - dummyPrimerScheme, primer["end"], primer["direction"]) - assert result[2]["primerID"] == primer["primerID"], "find_primer did not produce the query primer, which should be nearest" + dummyPrimerScheme, primer["end"], primer["direction"] + ) + + assert result + assert ( + result[2]["primerID"] == primer["primerID"] + ), "find_primer did not produce the query primer, which should be nearest" # test against other ref positions - result = align_trim.find_primer( - dummyPrimerScheme, 8, "+") - assert result[2]["primerID"] == "primer2_LEFT", "find_primer returned incorrect primer" - result = align_trim.find_primer( - dummyPrimerScheme, 25, "-") - assert result[2]["primerID"] == "primer1_RIGHT", "find_primer returned incorrect primer" + result = align_trim.find_primer(dummyPrimerScheme, 8, "+") + assert result + assert ( + result[2]["primerID"] == "primer2_LEFT" + ), "find_primer returned incorrect primer" + + result = align_trim.find_primer(dummyPrimerScheme, 25, "-") + assert result + assert ( + result[2]["primerID"] == "primer1_RIGHT" + ), "find_primer returned incorrect primer" def test_trim(): - """test for the trim function - """ + """test for the trim function""" def testRunner(seg, expectedCIGAR): # get the nearest primers to the alignment segment - p1 = align_trim.find_primer(primerScheme, seg.reference_start, '+') - p2 = align_trim.find_primer(primerScheme, seg.reference_end, '-') + p1 = align_trim.find_primer(primerScheme, seg.reference_start, "+") + p2 = align_trim.find_primer(primerScheme, seg.reference_end, "-") # get the primer positions - p1_position = p1[2]['end'] - p2_position = p2[2]['start'] + p1_position = p1[2]["end"] + p2_position = p2[2]["start"] # this segment should need forward and reverse softmasking - assert seg.reference_start < p1_position, "missed a forward soft masking opportunity (read: %s)" % seg.query_name - assert seg.reference_end > p2_position, "missed a reverse soft masking opportunity (read: %s)" % seg.query_name + assert seg.reference_start < p1_position, ( + "missed a forward soft masking opportunity (read: %s)" % seg.query_name + ) + assert seg.reference_end > p2_position, ( + "missed a reverse soft masking opportunity (read: %s)" % seg.query_name + ) # before masking, get the query_alignment_length and the CIGAR to use for testing later originalCigar = seg.cigarstring @@ -120,29 +117,49 @@ def testRunner(seg, expectedCIGAR): align_trim.trim(seg, p1_position, False, False) except Exception as e: raise Exception( - "problem soft masking left primer in {} (error: {})" 
.format(seg.query_name, e)) + "problem soft masking left primer in {} (error: {})".format( + seg.query_name, e + ) + ) # check the CIGAR and query alignment length is updated - assert seg.cigarstring != originalCigar, "cigar was not updated with a softmask (read: %s)" % seg.query_name - assert seg.query_alignment_length != originalQueryAlnLength, "query alignment was not updated after softmask (read: %s)" % seg.query_name + assert seg.cigarstring != originalCigar, ( + "cigar was not updated with a softmask (read: %s)" % seg.query_name + ) + assert seg.query_alignment_length != originalQueryAlnLength, ( + "query alignment was not updated after softmask (read: %s)" % seg.query_name + ) # trim the reverse primer try: align_trim.trim(seg, p2_position, True, False) except Exception as e: - raise Exception("problem soft masking right primer in {} (error: {})" .format( - seg.query_name, e)) + raise Exception( + "problem soft masking right primer in {} (error: {})".format( + seg.query_name, e + ) + ) # check the CIGAR and query alignment length is updated - assert seg.cigarstring != originalCigar, "cigar was not updated with a softmask (read: %s)" % seg.query_name - assert seg.query_alignment_length != originalQueryAlnLength, "query alignment was not updated after softmask (read: %s)" % seg.query_name + assert seg.cigarstring != originalCigar, ( + "cigar was not updated with a softmask (read: %s)" % seg.query_name + ) + assert seg.query_alignment_length != originalQueryAlnLength, ( + "query alignment was not updated after softmask (read: %s)" % seg.query_name + ) # check we have the right CIGAR - assert seg.cigarstring == expectedCIGAR, "cigar does not match expected cigar string (read: %s)" % seg.query_name + assert seg.cigarstring == expectedCIGAR, ( + "cigar does not match expected cigar string (read: %s)" % seg.query_name + ) # check the query alignment now matches the expected primer product - assert seg.reference_start >= p1_position, "left primer not masked corrrectly (read: %s)" % seg.query_name - assert seg.reference_end <= p2_position, "right primer not masked correctly (read: %s)" % seg.query_name + assert seg.reference_start >= p1_position, ( + "left primer not masked corrrectly (read: %s)" % seg.query_name + ) + assert seg.reference_end <= p2_position, ( + "right primer not masked correctly (read: %s)" % seg.query_name + ) # run the test with the two alignment segments testRunner(seg1, seg1expectedCIGAR) diff --git a/tests/get_scheme_unit_test.py b/tests/get_scheme_unit_test.py new file mode 100644 index 00000000..a100b97d --- /dev/null +++ b/tests/get_scheme_unit_test.py @@ -0,0 +1,34 @@ +import unittest +import os + +from artic.utils import get_scheme + + +class get_scheme_unit_test(unittest.TestCase): + def test_get_scheme_no_length(self): + + bed, ref, scheme_version = get_scheme( + scheme_name="nCoV-2019", + scheme_version="v1.0.0", + scheme_directory=f"{os.getcwd()}/primer-schemes", + ) + + self.assertEqual( + bed, f"{os.getcwd()}/primer-schemes/artic-sars-cov-2/400/v1.0.0/primer.bed" + ) + + self.assertEqual( + ref, + f"{os.getcwd()}/primer-schemes/artic-sars-cov-2/400/v1.0.0/reference.fasta", + ) + + def test_get_scheme_unclear_length(self): + + with self.assertRaises(SystemExit) as cm: + bed, ref, scheme_version = get_scheme( + scheme_name="hbv", + scheme_version="v1.0.0", + scheme_directory=f"{os.getcwd()}/primer-schemes", + ) + + self.assertEqual(cm.exception.code, 1) diff --git a/tests/minion_validator.py b/tests/minion_validator.py new file mode 100644 index 
00000000..705f3023 --- /dev/null +++ b/tests/minion_validator.py @@ -0,0 +1,379 @@ +"""minion_validator.py runs the functional tests of the minion pipeline, for both clair3 and medaka workflows. + +For each test dataset, it will make the following checks for each workflow: + + * check for existence of the final consensus sequence + * check the final consensus matches the expected consensus + * check for existence of the final VCF file (generated by artic_vcf_filter) + * check all expected variants are reported + * check no unexpected variants are reported + * if there is an expected deletion, the consensus sequence will be checked for its absence + +NOTE: + + * only a basic grep is used for checking the deletion in the consensus - the test will fail if the deletion sequence is present multiple times (I should improve this part of the test...) + +""" + +from Bio import SeqIO +from tqdm import tqdm +import glob +import os +import pathlib +import unittest +import requests +import sys +import tarfile +from cyvcf2 import VCF + + +from artic import pipeline + +# help pytest resolve where test data is kept +dataDir = str(pathlib.Path(__file__).parent.parent) + "/test-data/" + +# testData is a lookup of sampleIDs to download urls +testData = { + "MT007544": "https://raw.githubusercontent.com/artic-network/fieldbioinformatics/master/test-data/MT007544/MT007544.fastq", + "CVR1": "https://artic.s3.climb.ac.uk/validation-sets/CVR1.tgz", + "NRW01": "http://artic.s3.climb.ac.uk/validation-teams/NRW01.tgz", + "SP1": "http://artic.s3.climb.ac.uk/validation-teams/SP1.tgz", +} + +# refConsensuses is a dict of sample IDs and their expected consensus sequences from the nanopolish workflow +refConsensuses = { + "CVR1": dataDir + "consensus-sequences/CVR1.consensus.fasta", + "NRW01": dataDir + "consensus-sequences/NRW01.consensus.fasta", + "SP1": dataDir + "consensus-sequences/SP1.consensus.fasta", +} + +# refMedakaConsensuses is a dict of sample IDs and their expected consensus sequences from the medaka workflow +refMedakaConsensuses = { + "MT007544": dataDir + "consensus-sequences/MT007544.consensus.medaka.fasta", + "CVR1": dataDir + "consensus-sequences/CVR1.consensus.medaka.fasta", + "NRW01": dataDir + "consensus-sequences/NRW01.consensus.medaka.fasta", + "SP1": dataDir + "consensus-sequences/SP1.consensus.medaka.fasta", +} + +# clair3TestVariants is a nested dict of sample IDs and their expected variants when using the clair3 workflow +clair3TestVariants = { + "CVR1": { + # pos: (ref, alt, type, count) + 241: ["C", "T", "snp", 1], + 3037: ["C", "T", "snp", 1], + 12733: [ + "C", + "T", + "snp", + 1, + ], + 14408: ["C", "T", "snp", 1], + 23403: ["A", "G", "snp", 1], + 27752: ["C", "T", "snp", 1], + 28881: ["G", "A", "snp", 1], + 28882: ["G", "A", "snp", 1], + 28883: ["G", "C", "snp", 1], + }, + "NRW01": { + 1440: ["G", "A", "snp", 1], + 2891: ["G", "A", "snp", 1], + 4655: ["C", "T", "snp", 1], + 8422: ["G", "A", "snp", 1], + 22323: ["C", "T", "snp", 1], + 29546: ["C", "A", "snp", 2], + }, + "SP1": { + 241: ["C", "T", "snp", 1], + 3037: ["C", "T", "snp", 1], + 14408: ["C", "T", "snp", 1], + 23403: ["A", "G", "snp", 1], + }, +} + +# medakaTestVariants is a nested dict of sample IDs and their expected variants when using the medaka workflow +medakaTestVariants = { + "MT007544": { + # pos: (ref, alt, type, count) + 29749: ["ACGATCGAGTG", "A", "del", 1], + }, + "CVR1": { + 241: ["C", "T", "snp", 1], + 3037: ["C", "T", "snp", 1], + 12733: ["C", "T", "snp", 1], + 14408: ["C", "T", "snp", 1], +
23403: ["A", "G", "snp", 1], + 27752: ["C", "T", "snp", 1], + 28881: ["GGG", "AAC", "indel", 1], + }, + "NRW01": { + 1440: ["G", "A", "snp", 1], + 2891: ["G", "A", "snp", 1], + 4655: ["C", "T", "snp", 1], + 8422: ["G", "A", "snp", 1], + 22323: ["C", "T", "snp", 2], + 29546: ["C", "A", "snp", 2], + }, + "SP1": { + 241: ["C", "T", "snp", 1], + 3037: ["C", "T", "snp", 1], + 14408: ["C", "T", "snp", 1], + 23403: ["A", "G", "snp", 1], + }, +} + +# extraFlags is a way to add extra sample-specific commands to the validation test cmd +extraFlags = { + "medaka": { + "SP1": ["--no-frameshifts"], + }, + "clair3": { + "SP1": ["--no-frameshifts"], + "CVR1": ["--no-frameshifts"], + }, +} + + +# dataChecker will run before the tests and download all the test datasets if not present +def dataChecker(): + print("checking for validation datasets...") + for sampleID, url in testData.items(): + targetPath = dataDir + sampleID + if not os.path.exists(targetPath): + print("\tno data for {}".format(sampleID)) + print("\tmaking dir at {}".format(targetPath)) + os.mkdir(targetPath) + print("\tdownloading from {}".format(url)) + try: + download(url, dataDir, sampleID) + except Exception as e: + print("download failed: ", e) + sys.exit(1) + else: + print("\tfound data dir for {}".format(sampleID)) + print("validation datasets ready\n") + + +# download will download and untar a test dataset +def download(url, dataDir, sampleID): + filename = url.rsplit("/", 1)[1] + with open(f"{dataDir}/{filename}", "wb+") as f: + response = requests.get(url, stream=True) + total = int(response.headers.get("content-length")) + if total is None: + f.write(response.content) + else: + with tqdm(total=total, unit="B", unit_scale=True, desc=filename) as pbar: + for data in tqdm(response.iter_content(chunk_size=1024)): + f.write(data) + pbar.update(1024) + + tar = tarfile.open(dataDir + "/" + filename, "r:gz") + tar.extractall(dataDir) + tar.close() + os.remove(dataDir + "/" + filename) + + +# genCommand will create the minion command for the requested workflow (medaka or clair3) +def genCommand(sampleID, workflow): + cmd = [ + "minion", + "--threads", + "2", + "--read-file", + dataDir + sampleID + "/" + sampleID + ".fastq", + "--scheme-directory", + dataDir + "primer-schemes", + ] + if workflow == "medaka": + cmd.append("--model") + cmd.append("r941_min_high_g351") + + if workflow == "clair3": + cmd.append("--clair3") + cmd.append("--model") + cmd.append("r941_prom_hac_g360+g422") + + if sampleID in extraFlags[workflow]: + for flag in extraFlags[workflow][sampleID]: + cmd.append(flag) + + cmd.append("--scheme-name") + cmd.append("SARS-CoV-2") + cmd.append("--scheme-version") + cmd.append("v1.0.0") + cmd.append(sampleID) + return cmd + + +# cleanUp will remove all files generated by the pipeline for a given sampleID
+def cleanUp(sampleID): + fileList = glob.glob("{}*".format(sampleID)) + for filePath in fileList: + try: + os.remove(filePath) + except Exception: + sys.stderr.write(f"Error while deleting file : {filePath}") + + +# checkConsensus will return 1 if a subsequence is present in a consensus file, or 0 if absent + def checkConsensus(consensusFile, subSeq): + for record in SeqIO.parse(open(consensusFile, "r"), "fasta"): + if subSeq in record.seq: + return 1 + return 0 + + +# runner is the test runner +def runner(workflow, sampleID): + + if workflow == "clair3": + data = clair3TestVariants + elif workflow == "medaka": + data = medakaTestVariants + else: + sys.stderr.write("invalid workflow specified") + assert False + + if 
sampleID not in data: + sys.stderr.write("no test data for {}".format(sampleID)) + assert False + + expVariants = data[sampleID] + + # check the number of validation datasets requested + # counter = numValidations + # if (numValidations < 0) or (numValidations > len(testData)): + # counter = len(testData) + + # generate the command + cmd = genCommand(sampleID, workflow) + + # setup and run the minion parser + parser = pipeline.init_pipeline_parser() + + # parse the arguments + try: + args = parser.parse_args(cmd) + except SystemExit: + sys.stderr.write("failed to parse valid command for `artic minion`") + + # run the minion pipeline + try: + args.func(parser, args) + except SystemExit: + sys.stderr.write("artic minion exited early with an error") + assert False + + # check the ARTIC consensus was created + consensusFile = "%s.consensus.fasta" % sampleID + assert os.path.exists(consensusFile), "no consensus produced for {}".format( + sampleID + ) + testSeqs = SeqIO.parse(open(consensusFile, "r"), "fasta") + testConsensus = next(testSeqs) + + # check the ARTIC consensus sequence matches the one on record + # if workflow == "medaka": + # refSeqs = SeqIO.parse(open(refMedakaConsensuses[sampleID], "r"), "fasta") + # else: + # refSeqs = SeqIO.parse(open(refConsensuses[sampleID], "r"), "fasta") + # refConsensus = next(refSeqs) + # assert ( + # testConsensus.seq == refConsensus.seq + # ), "produced ARTIC consensus does not match expected consensus for {}".format( + # sampleID + # ) + + # check the ARTIC VCF was created + vcfFile = "%s.normalised.vcf.gz" % sampleID + assert os.path.exists(vcfFile), "no VCF produced for {}".format(sampleID) + + obs_variants = {} + # open the VCF and check the reported variants match the expected + for record in VCF(vcfFile): + obs_variants[record.POS] = [record.REF, str(record.ALT[0]), record.var_type] + if record.POS in expVariants: + assert ( + record.REF == expVariants[record.POS][0] + ), "incorrect REF reported in VCF for {} at position {}".format( + sampleID, record.POS + ) + assert ( + str(record.ALT[0]) == expVariants[record.POS][1] + ), "incorrect ALT reported in VCF for {} at position {}".format( + sampleID, record.POS + ) + + # if this is an expected deletion, check the consensus sequence for it's absence + if expVariants[record.POS][2] == "del": + assert ( + checkConsensus(consensusFile, record.REF) == 0 + ), "expected deletion for {} was reported but was left in consensus".format( + sampleID + ) + + # also check that the VCF record is correctly labelled as DEL + assert ( + record.is_deletion + ), "deletion for {} not formatted correctly in VCF".format(sampleID) + + # if this is an expected indel, check that the VCF record is correctly labelled as INDEL + if expVariants[record.POS][2] == "indel": + assert ( + record.is_indel + ), "indel for {} not formatted correctly in VCF".format(sampleID) + + # else, check that the VCF record is correctly labelled as SNP + if expVariants[record.POS][2] == "snp": + assert ( + record.is_snp + ), "snp for {} not formatted correctly in VCF".format(sampleID) + + # decrement/remove the variant from the expected list, so we can keep track of checked variants + # expVariants[record.POS][3] -= 1 + # if expVariants[record.POS][3] == 0: + # del expVariants[record.POS] + + else: + sys.stderr.write( + f"unexpected variant found for {sampleID}: {str(record.ALT[0])} at {record.POS}" + ) + assert False + + # check we've confirmed all the expected variants + for key, val in expVariants.items(): + assert obs_variants.get( + key + ), 
f"expected variant not found for {sampleID}: {val[0]} -> {val[1]} at {key}" + assert ( + val[0:2] == obs_variants[key][0:2] + ), f"expected variant not found for {sampleID}: {val[0]} -> {val[1]} at {key}" + + # clean up pipeline files + cleanUp(sampleID) + + +class TestMinion(unittest.TestCase): + def setUp(self): + dataChecker() + + def test_Clair3_CVR1(self): + runner("clair3", "CVR1") + + def test_Clair3_NRW01(self): + runner("clair3", "NRW01") + + def test_Clair3_SP1(self): + runner("clair3", "SP1") + + def test_Medaka_MT007544(self): + runner("medaka", "MT007544") + + def test_Medaka_CVR1(self): + runner("medaka", "CVR1") + + def test_Medaka_NRW01(self): + runner("medaka", "NRW01") + + def test_Medaka_SP1(self): + runner("medaka", "SP1") diff --git a/artic/pipeline_unit_test.py b/tests/pipeline_unit_test.py similarity index 78% rename from artic/pipeline_unit_test.py rename to tests/pipeline_unit_test.py index c8247c07..9a21f14f 100644 --- a/artic/pipeline_unit_test.py +++ b/tests/pipeline_unit_test.py @@ -2,21 +2,22 @@ import argparse import pytest -from . import pipeline +from artic import pipeline def test_pipeline_parser(): - """basic test for the pipeline parser - """ + """basic test for the pipeline parser""" # setup a parser parser = pipeline.init_pipeline_parser() # set up a valid command dummyCLI = [ "minion", - "--medaka", - "some-scheme.bed", - "some-prefix" + "--model", + "some_nonsense_model", + "--read-file", + "some_reads.fastq", + "some-prefix", ] # try with required arguments missing @@ -29,6 +30,7 @@ def test_pipeline_parser(): except SystemExit: print("failed to parse valid command") assert False + assert args.command == dummyCLI[0], "incorrect subcommand registered" # for arg, val in vars(args).items(): diff --git a/artic/vcftagprimersites_unit_test.py b/tests/utils_unit_test.py similarity index 52% rename from artic/vcftagprimersites_unit_test.py rename to tests/utils_unit_test.py index 49ca7908..944bf424 100644 --- a/artic/vcftagprimersites_unit_test.py +++ b/tests/utils_unit_test.py @@ -2,7 +2,7 @@ import os import pytest -from . 
import vcftagprimersites +from artic import utils # help pytest resolve where test data is kept @@ -12,30 +12,40 @@ def test_read_bed_file(): # process the nCoV-2019 V3 primer scheme - primerScheme = vcftagprimersites.read_bed_file( - TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V3/nCoV-2019.scheme.bed") + primerScheme = utils.read_bed_file( + TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V3/nCoV-2019.bed" + ) # check the the alts have been collapsed into a canonical primer site assert len(primerScheme) == 196, "alts were not collapsed" # check that alts are merging by union (not intersection) for entry in primerScheme: - if entry['Primer_ID'] == 'nCoV-2019_45_RIGHT': - assert entry['start'] == 13660, "failed to merge nCov-2019_45_RIGHT alt, bad start" - assert entry['end'] == 13699, "failed to merge nCov-2019_45_RIGHT alt, bad end" - if entry['Primer_ID'] == 'nCoV-2019_89_LEFT': - assert entry['start'] == 26835, "failed to merge nCoV-2019_89_LEFT alt, bad start" - assert entry['end'] == 26860, "failed to merge nCoV-2019_89_LEFT alt, bad end" - - # check access to primer scheme fields + if entry["Primer_ID"] == "nCoV-2019_45_RIGHT": + assert ( + entry["start"] == 13660 + ), "failed to merge nCov-2019_45_RIGHT alt, bad start" + assert ( + entry["end"] == 13699 + ), "failed to merge nCov-2019_45_RIGHT alt, bad end" + if entry["Primer_ID"] == "nCoV-2019_89_LEFT": + assert ( + entry["start"] == 26835 + ), "failed to merge nCoV-2019_89_LEFT alt, bad start" + assert ( + entry["end"] == 26860 + ), "failed to merge nCoV-2019_89_LEFT alt, bad end" + + # check access to primer scheme fields for row in primerScheme: - assert 'Primer_ID' in row, "failed to parse primer scheme for Primer_ID" - assert 'PoolName' in row, "failed to parse primer scheme for PoolName" + assert "Primer_ID" in row, "failed to parse primer scheme for Primer_ID" + assert "PoolName" in row, "failed to parse primer scheme for PoolName" # process the nCoV-2019 V2 primer scheme # this scheme has a single alt that has replaced the original primer so no alts should be collapsed - primerScheme2 = vcftagprimersites.read_bed_file( - TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V2/nCoV-2019.bed") + primerScheme2 = utils.read_bed_file( + TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V2/nCoV-2019.bed" + ) assert len(primerScheme2) == 196, "no alts should have been collapsed" # TODO: this is a starting point for the unit tests - more to come... 
From ed69b8290fd7656bfe48e4e00e9d3c1eb094d8f7 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Thu, 22 Aug 2024 10:59:22 +0100 Subject: [PATCH 02/11] 1.4.0 dev m1 (#136) * mods to Dockerfile * wip * update --- Dockerfile | 5 ++++- environment.yml | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 272a109a..4bbe0ef7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,8 +21,11 @@ USER root RUN python3 -m pip install ./fieldbioinformatics +RUN pip uninstall -y tensorflow keras pyabpoa \ + && micromamba install -y -c conda-forge -c bioconda tensorflow=2.11 keras=2.11 + USER $MAMBA_USER ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] -CMD ["/bin/bash"] \ No newline at end of file +CMD ["/bin/bash"] diff --git a/environment.yml b/environment.yml index 3f0c52d1..6f90fecc 100644 --- a/environment.yml +++ b/environment.yml @@ -4,14 +4,15 @@ channels: - bioconda - defaults dependencies: + - python==3.9 - longshot - bcftools - biopython - bwa + - clair3 - clint - htslib - minimap2 - - clair3 - multiqc - muscle<5.1 - pandas @@ -25,4 +26,4 @@ dependencies: - samtools - tqdm - pip: - - medaka \ No newline at end of file + - medaka==1.12.0 From 19d1e4e8129a15c3e381aa760ec4325c7c095417 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 22 Aug 2024 11:08:10 +0100 Subject: [PATCH 03/11] 1.4.1 version bump --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 9afe6d2e..a885ca4d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = artic -version = 1.4.0 +version = 1.4.1 author = Nick Loman author_email = n.j.loman@bham.ac.uk maintainer = Sam Wilkinson From 0a7fc7ec7079bad7b29a42c95a7d1c4b926f8a63 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Tue, 3 Sep 2024 09:14:19 +0100 Subject: [PATCH 04/11] Properly name fasta header --- artic/minion.py | 3 ++- environment.yml | 1 - setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/artic/minion.py b/artic/minion.py index 60e065e1..3478e881 100755 --- a/artic/minion.py +++ b/artic/minion.py @@ -242,7 +242,8 @@ def run(parser, args): ) # 11) apply the header to the consensus sequence and run alignment against the reference sequence - fasta_header = "%s/ARTIC/medaka" % (args.sample) + caller = "medaka" if not args.clair3 else "clair3" + fasta_header = f"{args.sample}/ARTIC/{caller}" cmds.append( 'artic_fasta_header %s.consensus.fasta "%s"' % (args.sample, fasta_header) ) diff --git a/environment.yml b/environment.yml index 6f90fecc..5fb54dbf 100644 --- a/environment.yml +++ b/environment.yml @@ -19,7 +19,6 @@ dependencies: - pip - pysam - pytest - - python - cyvcf2 - pyfaidx - requests diff --git a/setup.cfg b/setup.cfg index a885ca4d..93e57943 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = artic -version = 1.4.1 +version = 1.4.2 author = Nick Loman author_email = n.j.loman@bham.ac.uk maintainer = Sam Wilkinson From 6fa6aa81fbf134841a17665081feee2b54481d42 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 19 Sep 2024 15:46:24 +0100 Subject: [PATCH 05/11] Add multi-ref scheme support --- artic/minion.py | 22 +++---- artic/utils.py | 157 +++++++++++++++++++++++++++++++++++++++++++++++- environment.yml | 3 +- 3 files changed, 166 insertions(+), 16 deletions(-) diff --git a/artic/minion.py b/artic/minion.py index 3478e881..45e42b56 100755 --- a/artic/minion.py +++ b/artic/minion.py @@ -51,6 +51,13 @@ def run(parser, args): ) raise SystemExit(1) + if not os.path.exists(args.read_file): + print( + 
colored.red("failed to find read-file: {}".format(args.read_file)), + file=sys.stderr, + ) + raise SystemExit(1) + # check the parameters and set up the filenames ## find the primer scheme, reference sequence and confirm scheme version @@ -63,6 +70,7 @@ def run(parser, args): scheme_version=args.scheme_version, scheme_directory=args.scheme_directory, scheme_length=args.scheme_length, + read_file=args.read_file, ) if not os.path.exists(bed) or not os.path.exists(ref): @@ -76,18 +84,6 @@ def run(parser, args): ) raise SystemExit(1) - ## set up the read file - if args.read_file: - read_file = args.read_file - else: - read_file = "%s.fasta" % (args.sample) - if not os.path.exists(read_file): - print( - colored.red("failed to find read-file: {}".format(read_file)), - file=sys.stderr, - ) - raise SystemExit(1) - ## collect the primer pools pools = set([row["PoolName"] for row in read_bed_file(bed)]) @@ -100,7 +96,7 @@ def run(parser, args): # 3) index the ref & align with minimap cmds.append( - f"minimap2 -a -x map-ont -t {args.threads} {ref} {read_file} | samtools view -bS -F 4 - | samtools sort -o {args.sample}.sorted.bam -" + f"minimap2 -a -x map-ont -t {args.threads} {ref} {args.read_file} | samtools view -bS -F 4 - | samtools sort -o {args.sample}.sorted.bam -" ) cmds.append(f"samtools index {args.sample}.sorted.bam") diff --git a/artic/utils.py b/artic/utils.py index 86dc5ffe..a5c3fa54 100644 --- a/artic/utils.py +++ b/artic/utils.py @@ -6,6 +6,9 @@ import hashlib import re import pandas as pd +from Bio import SeqIO +import subprocess +import csv from clint.textui import colored @@ -315,6 +318,7 @@ def get_scheme( scheme_version: str, scheme_directory: str, scheme_length: int = False, + read_file: str = False ): """Get the primer scheme and reference fasta file from the manifest @@ -382,7 +386,7 @@ def get_scheme( if not manifest["primerschemes"].get(scheme_name): print( colored.red( - f"Scheme name alias does not exist in manifest, this should never happen, please create an issue on https://github.com/quick-lab/primerschemes/issues or https://github.com/artic-network/fieldbioinformatics/issues if you see this message" + "Scheme name alias does not exist in manifest, this should never happen, please create an issue on https://github.com/quick-lab/primerschemes/issues or https://github.com/artic-network/fieldbioinformatics/issues if you see this message" ) ) raise SystemExit(1) @@ -416,7 +420,7 @@ def get_scheme( if not version_pattern.match(scheme_version): print( colored.red( - f"Invalid scheme version format, please provide a version in the format 'vX.X.X', e.g. v1.0.0" + "Invalid scheme version format, please provide a version in the format 'vX.X.X', e.g. v1.0.0" ) ) raise SystemExit(1) @@ -431,6 +435,27 @@ def get_scheme( scheme = scheme[scheme_length][scheme_version] + if scheme.get("ref_selection"): + if not read_file: + print( + colored.red( + "Reference selection is available for this scheme but reads were not provided to 'get_scheme'. This should never happen, please submit an issue to github.com/artic-network/fieldbioinformatics/issues" + ) + ) + raise SystemExit(1) + + print( + colored.yellow( + f"Reference selection is available for scheme {scheme_name}, deciding which reference to use based on your reads. 
If you would prefer to specify the reference to use, provide the same scheme name with the appropriate suffix, choices are: {', '.join(str(x) for x in scheme[scheme_length].keys() if "-" in x)}" + ) + ) + + suffix = pick_best_ref(multi_ref_url=scheme["ref_selection"], multi_ref_md5=scheme["ref_selection_md5"], read_file=read_file, n_reads=10000, scheme_path=scheme_directory, mm2_threads=4) + + scheme_version = f"{scheme_version}-{suffix}" + + print(colored.yellow(f"Selected reference suffix: {suffix}")) + scheme_path = os.path.join( scheme_directory, scheme_name, scheme_length, scheme_version ) @@ -623,3 +648,131 @@ def get_scheme_legacy(scheme_name, scheme_directory, scheme_version="1"): file=sys.stderr, ) raise SystemExit(1) + + +def pick_best_ref( + multi_ref_url: str, + multi_ref_md5: str, + read_file: str, + n_reads: int, + scheme_path: str, + mm2_threads: int, +): + """Pick the best reference from a multi-reference alignment + + Args: + multi_ref_url (str): URL to the multi-reference alignment from quick-lab/primerschemes + multi_ref_md5 (str): MD5 hash of the multi-reference alignment + read_file (str): Path to the read file to test against the references + n_reads (int): How many reads to sample from the read file for testing (default: 10000) + scheme_path (str): Path to the scheme directory + mm2_threads (int): Number of threads to use when aligning using minimap2 + + Raises: + SystemExit: If the ref selection fasta file cannot be fetched + SystemExit: If the ref selection fasta file get returns a status code other than 200 + + Returns: + str: Primer scheme suffix of the most appropriate reference for the reads + """ + + ref_selection_path = os.path.join(scheme_path, "ref_select.fasta") + + if not os.path.exists(scheme_path): + try: + response = requests.get(multi_ref_url) + except requests.exceptions.RequestException as error: + print(colored.red(f"Failed to ref selection fasta with Exception: {error}")) + raise SystemExit(1) + + if response.status_code != 200: + print( + colored.red( + f"Failed to fetch ref selection fasta with status code: {response.status_code}" + ) + ) + raise SystemExit(1) + + with open(ref_selection_path, "w") as bed_file: + bed_file.write(response.text) + + check_hash(ref_selection_path, multi_ref_md5) + + flat_ref_path = os.path.join(scheme_path, "flattened_references.fasta") + + possible_references = {} + + with open(ref_selection_path, "r") as ref_selection_fh: + + for reference in SeqIO.parse(ref_selection_fh, "fasta"): + suffix = reference.Description.split()[1] + possible_references[reference.id] = suffix + + # Flatten out the alignment into flat fasta reference + flattened = str(reference.seq).replace("-", "") + + with open(flat_ref_path, "a") as flat_ref_fh: + flat_ref_fh.write(f">{reference.Description}\n{flattened}\n") + + # fq_it = mappy.fastx_read(fastq_path) + seqtk = subprocess.run( + ["seqtk", "sample", str(read_file), str(n_reads)], stdout=subprocess.PIPE + ) + + result = subprocess.run( + [ + "minimap2", + "-x", + "map-ont", + "-t", + str(mm2_threads), + str(flat_ref_path), + "-", + ], + input=seqtk.stdout, + stdout=subprocess.PIPE, + ) + + reader = csv.DictReader( + result.stdout.decode("utf-8").split("\n"), + delimiter="\t", + fieldnames=[ + "query_name", + "query_len", + "query_start", + "query_end", + "strand", + "ctg_name", + "ctg_len", + "ref_start", + "ref_end", + "n_matches", + "alignment_len", + "mapq", + ], + ) + + read_results = {} + + for alignment in reader: + + if not alignment: + continue + + identity = 
int(alignment["n_matches"]) / int(alignment["alignment_len"]) + + read_results.setdefault(alignment["query_name"], {}) + read_results[alignment["query_name"]].setdefault(alignment["ctg_name"], 0) + + if identity > read_results[alignment["query_name"]][alignment["ctg_name"]]: + read_results[alignment["query_name"]][alignment["ctg_name"]] = identity + + ref_results = {ref: 0 for ref in possible_references.keys()} + + for read, details in read_results.items(): + best_ctg = max(details, key=details.get) + ref_results[best_ctg] += 1 + + best_ref = max(ref_results, key=ref_results.get) + + return possible_references[best_ref] diff --git a/environment.yml b/environment.yml index 5fb54dbf..ebe70482 100644 --- a/environment.yml +++ b/environment.yml @@ -24,5 +24,6 @@ dependencies: - requests - samtools - tqdm + - seqtk - pip: - - medaka==1.12.0 + - medaka-cpu From bc196f5182b0570b242e8ab09ae350a96bceb971 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 19 Sep 2024 15:54:49 +0100 Subject: [PATCH 06/11] Fix message log generator --- artic/utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/artic/utils.py b/artic/utils.py index a5c3fa54..a2fd8607 100644 --- a/artic/utils.py +++ b/artic/utils.py @@ -318,7 +318,7 @@ def get_scheme( scheme_version: str, scheme_directory: str, scheme_length: int = False, - read_file: str = False + read_file: str = False, ): """Get the primer scheme and reference fasta file from the manifest @@ -446,11 +446,18 @@ def get_scheme( print( colored.yellow( - f"Reference selection is available for scheme {scheme_name}, deciding which reference to use based on your reads. If you would prefer to specify the reference to use, provide the same scheme name with the appropriate suffix, choices are: {', '.join(str(x) for x in scheme[scheme_length].keys() if "-" in x)}" + f"Reference selection is available for scheme {scheme_name}, deciding which reference to use based on your reads. 
If you would prefer to specify the reference to use, provide the same scheme name with the appropriate suffix, choices are: {', '.join(str(x) for x in scheme[scheme_length].keys() if '-' in x)}" ) ) - suffix = pick_best_ref(multi_ref_url=scheme["ref_selection"], multi_ref_md5=scheme["ref_selection_md5"], read_file=read_file, n_reads=10000, scheme_path=scheme_directory, mm2_threads=4) + suffix = pick_best_ref( + multi_ref_url=scheme["ref_selection"], + multi_ref_md5=scheme["ref_selection_md5"], + read_file=read_file, + n_reads=10000, + scheme_path=scheme_directory, + mm2_threads=4, + ) scheme_version = f"{scheme_version}-{suffix}" From a08bd087a4f5f3efacb837516ef8a859e20bc994 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 19 Sep 2024 15:56:50 +0100 Subject: [PATCH 07/11] Don't use medaka cpu --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index ebe70482..2d27c46a 100644 --- a/environment.yml +++ b/environment.yml @@ -26,4 +26,4 @@ dependencies: - tqdm - seqtk - pip: - - medaka-cpu + - medaka==1.12.0 From 8f9de84b62d603601b67b0f964fdb3bba547a39a Mon Sep 17 00:00:00 2001 From: Biowilko Date: Fri, 20 Sep 2024 10:48:14 +0100 Subject: [PATCH 08/11] Version bump --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 93e57943..d565b181 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = artic -version = 1.4.2 +version = 1.4.3 author = Nick Loman author_email = n.j.loman@bham.ac.uk maintainer = Sam Wilkinson From 1d0ac33e2f9e5b5ee235ca97b7ee709d9e363b0d Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 21 Oct 2024 13:47:21 +0100 Subject: [PATCH 09/11] Fix fetch scheme, add fetch_scheme entrypoint --- artic/fetch_scheme.py | 51 +++++++++++++++++++++++++++++++++++++++++ artic/utils.py | 53 +++++++++++++++++++++++++++++++------------ setup.cfg | 1 + 3 files changed, 91 insertions(+), 14 deletions(-) create mode 100644 artic/fetch_scheme.py diff --git a/artic/fetch_scheme.py b/artic/fetch_scheme.py new file mode 100644 index 00000000..5e15128d --- /dev/null +++ b/artic/fetch_scheme.py @@ -0,0 +1,51 @@ +from artic.utils import get_scheme +from pathlib import Path + + +def main(): + import argparse + import os + + parser = argparse.ArgumentParser() + parser.add_argument( + "--scheme-name", + type=str, + required=True, + help="Name of the scheme you wish to fetch", + ) + parser.add_argument( + "--scheme-version", + type=str, + required=True, + help="Version of the scheme you wish to fetch", + ) + parser.add_argument( + "--scheme-directory", + type=Path, + required=True, + help="Directory where schemes are stored", + default=f"{os.getcwd()}/primer-schemes", + ) + parser.add_argument( + "--scheme-length", + type=str, + help="Length of the scheme to fetch, only required if the scheme has multiple possible lengths.", + ) + parser.add_argument( + "--read-file", + type=Path, + help="FASTQ file containing reads sequenced with the scheme in question, this is only required when fetching a scheme with reference selection functionality.", + ) + args = parser.parse_args() + + get_scheme( + scheme_name=args.scheme_name, + scheme_version=args.scheme_version, + scheme_directory=args.scheme_directory, + scheme_length=args.scheme_length, + read_file=args.read_file, + ) + + +if __name__ == "__main__": + main() diff --git a/artic/utils.py b/artic/utils.py index a2fd8607..89d08d68 100644 --- a/artic/utils.py +++ b/artic/utils.py @@ -317,7 +317,7 @@ def get_scheme( scheme_name: 
str, scheme_version: str, scheme_directory: str, - scheme_length: int = False, + scheme_length: str = False, read_file: str = False, ): """Get the primer scheme and reference fasta file from the manifest @@ -326,7 +326,7 @@ def get_scheme( scheme_name (str): Name of the scheme scheme_version (str): Version of the scheme scheme_directory (str): Directory where schemes are stored - scheme_length (int, optional): Length of the scheme. Defaults to False. + scheme_length (str, optional): Length of the scheme. Defaults to False. Returns: str: Path to the primer bed file @@ -335,8 +335,11 @@ def get_scheme( """ try: + # response = requests.get( + # "https://raw.githubusercontent.com/quick-lab/primerschemes/main/index.json" + # ) response = requests.get( - "https://raw.githubusercontent.com/quick-lab/primerschemes/main/index.json" + "https://raw.githubusercontent.com/ChrisgKent/primerschemes/main/index.json" ) except requests.exceptions.RequestException as error: print(colored.red(f"Failed to fetch manifest with Exception: {error}")) @@ -353,8 +356,11 @@ def get_scheme( manifest = response.json() try: + # response = requests.get( + # "https://raw.githubusercontent.com/quick-lab/primerschemes/main/aliases.json" + # ) response = requests.get( - "https://raw.githubusercontent.com/quick-lab/primerschemes/main/aliases.json" + "https://raw.githubusercontent.com/ChrisgKent/primerschemes/main/aliases.json" ) except requests.exceptions.RequestException as error: print(colored.red(f"Failed to fetch scheme aliases with Exception: {error}")) @@ -433,13 +439,11 @@ def get_scheme( ) raise SystemExit(1) - scheme = scheme[scheme_length][scheme_version] - - if scheme.get("ref_selection"): + if scheme[scheme_length][scheme_version].get("refselect"): if not read_file: print( colored.red( - "Reference selection is available for this scheme but reads were not provided to 'get_scheme'. This should never happen, please submit an issue to github.com/artic-network/fieldbioinformatics/issues" + f"Reference selection is available for this scheme but reads were not provided. 
Either provide a read file using --read-file or specify the reference to use by providing the same scheme name with the appropriate suffix, choices are: {', '.join(str(x) for x in scheme[scheme_length].keys() if '-' in x)}" ) ) raise SystemExit(1) @@ -450,12 +454,28 @@ def get_scheme( ) ) + if len(scheme[scheme_length][scheme_version]["refselect"].keys()) > 1: + print( + colored.red( + f"Multiple reference selection options found for scheme {scheme_name}, fieldbioinformatics does not currently support multi pathogen schemes" + ) + ) + raise SystemExit(1) + + msa_data = next( + iter(scheme[scheme_length][scheme_version]["refselect"].values()) + ) + + scheme_select_dir = os.path.join( + scheme_directory, scheme_name, scheme_length, scheme_version + ) + suffix = pick_best_ref( - multi_ref_url=scheme["ref_selection"], - multi_ref_md5=scheme["ref_selection_md5"], + multi_ref_url=msa_data["url"], + multi_ref_md5=msa_data["md5"], read_file=read_file, n_reads=10000, - scheme_path=scheme_directory, + scheme_path=scheme_select_dir, mm2_threads=4, ) @@ -467,6 +487,8 @@ def get_scheme( scheme_directory, scheme_name, scheme_length, scheme_version ) + scheme = scheme[scheme_length][scheme_version] + if not os.path.exists(scheme_path): os.makedirs(scheme_path, exist_ok=True) @@ -683,9 +705,12 @@ def pick_best_ref( str: Primer scheme suffix of the most appropriate reference for the reads """ - ref_selection_path = os.path.join(scheme_path, "ref_select.fasta") + ref_selection_path = os.path.join(scheme_path, "refselect.fasta") if not os.path.exists(scheme_path): + os.makedirs(scheme_path, exist_ok=True) + + if not os.path.exists(ref_selection_path): try: response = requests.get(multi_ref_url) except requests.exceptions.RequestException as error: @@ -712,14 +737,14 @@ def pick_best_ref( with open(ref_selection_path, "r") as ref_selection_fh: for reference in SeqIO.parse(ref_selection_fh, "fasta"): - suffix = reference.Description.split()[1] + suffix = reference.description.split()[1] possible_references[reference.id] = suffix # Flatten out the alignment into flat fasta reference flattened = str(reference.seq).replace("-", "") with open(flat_ref_path, "a") as flat_ref_fh: - flat_ref_fh.write(f">{reference.Description}\n{flattened}\n") + flat_ref_fh.write(f">{reference.description}\n{flattened}\n") # fq_it = mappy.fastx_read(fastq_path) seqtk = subprocess.run( diff --git a/setup.cfg b/setup.cfg index d565b181..893bfd05 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,4 +36,5 @@ console_scripts = artic_fasta_header=artic.fasta_header:main artic_mask=artic.mask:main artic_get_stats=artic.artic_mqc:main + fetch_scheme=artic.fetch_scheme:main From 021d8492a3ed0fcbca0827e7c65ee10ca37e04da Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 21 Oct 2024 13:58:11 +0100 Subject: [PATCH 10/11] Improve arguments --- artic/fetch_scheme.py | 1 - 1 file changed, 1 deletion(-) diff --git a/artic/fetch_scheme.py b/artic/fetch_scheme.py index 5e15128d..28ec5a56 100644 --- a/artic/fetch_scheme.py +++ b/artic/fetch_scheme.py @@ -22,7 +22,6 @@ def main(): parser.add_argument( "--scheme-directory", type=Path, - required=True, help="Directory where schemes are stored", default=f"{os.getcwd()}/primer-schemes", ) From 42b58d91aec19cc1774bce165a036e822a800a15 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 21 Oct 2024 14:01:22 +0100 Subject: [PATCH 11/11] Revert to production primerschemes repo in anticipation of multiref update --- artic/pipeline.py | 1 - artic/utils.py | 14 ++++---------- artic/version.py | 2 +- 3 files changed, 5 
insertions(+), 12 deletions(-) diff --git a/artic/pipeline.py b/artic/pipeline.py index 76f56656..efd4b0a1 100755 --- a/artic/pipeline.py +++ b/artic/pipeline.py @@ -4,7 +4,6 @@ # Thanks to Aaron Quinlan for the argparse implementation from poretools. import argparse -import sys import os from . import version diff --git a/artic/utils.py b/artic/utils.py index 89d08d68..3a167d88 100644 --- a/artic/utils.py +++ b/artic/utils.py @@ -124,7 +124,7 @@ def identify_bed_file(bed_file): if version != 3: print( colored.red( - f"Scheme BED does not appear to be a consistent scheme version, for primer scheme formats please see https://github.com/ChrisgKent/primal-page" + "Scheme BED does not appear to be a consistent scheme version, for primer scheme formats please see https://github.com/ChrisgKent/primal-page" ) ) raise SystemExit(1) @@ -147,7 +147,7 @@ def identify_bed_file(bed_file): if version == 3: print( colored.red( - f"Scheme BED mixed primer ID formats, please ensure your BED file is consistent" + "Scheme BED mixed primer ID formats, please ensure your BED file is consistent" ) ) raise SystemExit(1) @@ -335,11 +335,8 @@ def get_scheme( """ try: - # response = requests.get( - # "https://raw.githubusercontent.com/quick-lab/primerschemes/main/index.json" - # ) response = requests.get( - "https://raw.githubusercontent.com/ChrisgKent/primerschemes/main/index.json" + "https://raw.githubusercontent.com/quick-lab/primerschemes/main/index.json" ) except requests.exceptions.RequestException as error: print(colored.red(f"Failed to fetch manifest with Exception: {error}")) @@ -356,11 +353,8 @@ def get_scheme( manifest = response.json() try: - # response = requests.get( - # "https://raw.githubusercontent.com/quick-lab/primerschemes/main/aliases.json" - # ) response = requests.get( - "https://raw.githubusercontent.com/ChrisgKent/primerschemes/main/aliases.json" + "https://raw.githubusercontent.com/quick-lab/primerschemes/main/aliases.json" ) except requests.exceptions.RequestException as error: print(colored.red(f"Failed to fetch scheme aliases with Exception: {error}")) diff --git a/artic/version.py b/artic/version.py index 3e8d9f94..aa56ed40 100644 --- a/artic/version.py +++ b/artic/version.py @@ -1 +1 @@ -__version__ = "1.4.0" +__version__ = "1.4.3"