From e6a0a166c057dbca2a3e9f07cdaa4adfdf730dd8 Mon Sep 17 00:00:00 2001 From: Sam W Date: Mon, 19 Aug 2024 12:40:29 +0100 Subject: [PATCH 01/11] 1.4.0 pre-release version (#135) * Deprecate old tools and black format * Support fragmented data * use new trim primer arg in minion * reverse the (revolting) primer position logic * Fix find primer logic * Improve the cursed report writer logic * explicitly named function args, what a novel idea * add temp debugging prints to find_primer * Add logic to handle segments outside of primer scheme * fix typo * General bugfixes, test fix for overlapping variant issue * add min mapq argument * properly provide min_mapq to handle_segment * Fix unmatched read group issue * Fix 8 bit int error * First attempt to integrate clair3 as optional variant caller * Slightly change env yaml format * Specify env name in mamba gh action * Checkout repo before installing env * python 3.6.15 * Run on ubuntu 20.04 * unpin python * Unpin python entirely * Separate out tests * correct unit test path * remove retvar hints for compatibility * Fix silly relative imports * Switch to cyvcf and fix unit tests * Fix pipeline parse unit test * Fix fstring * Large changes to make clair3 work, various unit test updates * Fix fuzzy primer site match logic * Fix minion_validator path * Fix vcf merge bug with empty VCF files * Increase fuzzy match default threshold to 20 * Fix CHROM bug in vcf merge properly * Report failed workflow when variants are missed * Fix vcf merge again * Separate out tests * No counter anymore * Add C241T to SP1 variants * Add limit on ref pileup * Single copy of CVR1 12733 var * re-enable other consensus tests * First attempt to make minion validator less fragile * Unify testvariants, improve test logic * medaka has to be different smh * Index ref fasta when using the clair3 workflow * Add mean amplicon depth TSV output * Update CLI and scheme fetcher * Update align trim unit test * Use proper version of V3 bedfile * Fix pipeline unit test * Fix align trim amp depth bug * Call variants on non primertrimmed bam * Don't test MT vs clair * properly disable test * Add docker build push action on release * Grab wget in dockerfile * Update docs and readme * docker push on release only * Remove debugging print statement * Make fuzzy primer match logic clearer * Install procps for nxf compatibility --- .github/workflows/docker-build-push.yml | 39 ++ .github/workflows/unittests.yml | 45 ++ .gitignore | 14 +- .travis.yml | 38 -- Dockerfile | 28 + README.md | 18 +- artic/.gitignore | 3 + artic/align_trim.py | 519 +++++++++++---- artic/artic_mqc.py | 109 ++- artic/{ => deprecated}/align_trim_n.py | 0 artic/{ => deprecated}/basecaller.py | 0 artic/{ => deprecated}/conftest.py | 0 artic/{ => deprecated}/demultiplex.py | 0 artic/{ => deprecated}/extract.py | 0 artic/{ => deprecated}/gather.py | 0 artic/{ => deprecated}/margin_cons_medaka.py | 97 +-- artic/deprecated/setup.py | 48 ++ artic/export.py | 86 ++- artic/fasta_header.py | 14 +- artic/filter_reads.py | 34 +- artic/guppyplex.py | 53 +- artic/make_depth_mask.py | 74 ++- artic/margin_cons.py | 91 ++- artic/mask.py | 48 +- artic/minion.py | 431 ++++++------ artic/minion_validator.py | 331 ---------- artic/pipeline.py | 349 ++++++---- artic/rampart.py | 124 ++-- artic/runs.py | 20 +- artic/utils.py | 625 ++++++++++++++++++ artic/vcf_filter.py | 77 +-- artic/vcf_merge.py | 103 ++- artic/vcfextract.py | 52 +- artic/vcftagprimersites.py | 157 ----- artic/version.py | 2 +- docs/commands.md | 70 +- docs/faq.md | 4 +- 
docs/installation.md | 16 +- environment.yml | 42 +- requirements.txt | 8 - setup.cfg | 39 ++ setup.py | 52 +- .../IturiEBOV/V1/IturiEBOV.scheme.bed | 244 +++---- .../nCoV-2019/V1/nCoV-2019.scheme.bed | 392 +++++------ test-runner.sh | 105 ++- {artic => tests}/align_trim_unit_test.py | 133 ++-- tests/get_scheme_unit_test.py | 34 + tests/minion_validator.py | 379 +++++++++++ {artic => tests}/pipeline_unit_test.py | 14 +- .../utils_unit_test.py | 40 +- 50 files changed, 3166 insertions(+), 2035 deletions(-) create mode 100644 .github/workflows/docker-build-push.yml create mode 100644 .github/workflows/unittests.yml delete mode 100644 .travis.yml create mode 100644 Dockerfile create mode 100644 artic/.gitignore rename artic/{ => deprecated}/align_trim_n.py (100%) rename artic/{ => deprecated}/basecaller.py (100%) rename artic/{ => deprecated}/conftest.py (100%) rename artic/{ => deprecated}/demultiplex.py (100%) rename artic/{ => deprecated}/extract.py (100%) rename artic/{ => deprecated}/gather.py (100%) rename artic/{ => deprecated}/margin_cons_medaka.py (58%) create mode 100644 artic/deprecated/setup.py delete mode 100644 artic/minion_validator.py create mode 100644 artic/utils.py delete mode 100755 artic/vcftagprimersites.py delete mode 100644 requirements.txt create mode 100644 setup.cfg rename {artic => tests}/align_trim_unit_test.py (60%) create mode 100644 tests/get_scheme_unit_test.py create mode 100644 tests/minion_validator.py rename {artic => tests}/pipeline_unit_test.py (78%) rename artic/vcftagprimersites_unit_test.py => tests/utils_unit_test.py (52%) diff --git a/.github/workflows/docker-build-push.yml b/.github/workflows/docker-build-push.yml new file mode 100644 index 00000000..6c9f8827 --- /dev/null +++ b/.github/workflows/docker-build-push.yml @@ -0,0 +1,39 @@ +name: quay_build_push + +on: + release: + types: [published] + +jobs: + docker: + runs-on: ubuntu-latest + steps: + - name: Docker meta + id: meta + uses: docker/metadata-action@v5 + with: + images: quay.io/artic-network/fieldbioinformatics + flavor: | + latest=true + tags: | + type=semver,pattern={{version}} + - name: Checkout + uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Login to Docker Hub + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{ secrets.QUAY_USERNAME }} + password: ${{ secrets.QUAY_PASSWORD }} + - name: Build and push + uses: docker/build-push-action@v5 + with: + context: . + platforms: linux/amd64 + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} \ No newline at end of file diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml new file mode 100644 index 00000000..8a9c8abe --- /dev/null +++ b/.github/workflows/unittests.yml @@ -0,0 +1,45 @@ +name: fieldbioinformatics_unit_tests + +on: [push, pull_request] + +jobs: + build: + runs-on: ubuntu-latest + strategy: + fail-fast: false + steps: + - uses: actions/checkout@v3 + - uses: mamba-org/setup-micromamba@v1 + name: Setup dependencies + with: + environment-file: environment.yml + init-shell: >- + bash + cache-environment: true + post-cleanup: "all" + + - name: Pip install artic + run: | + python3 -m pip install -e . 
+ shell: micromamba-shell {0} + + - name: Run unit tests + run: | + pytest -s tests/*_unit_test.py + shell: micromamba-shell {0} + + - name: Run medaka test + run: | + ./test-runner.sh medaka + shell: micromamba-shell {0} + + - name: Run clair3 test + run: | + ./test-runner.sh clair3 + shell: micromamba-shell {0} + + - name: Run minion_validator tests + run: | + pytest -s tests/minion_validator.py + shell: micromamba-shell {0} + diff --git a/.gitignore b/.gitignore index f3259711..c61e3270 100644 --- a/.gitignore +++ b/.gitignore @@ -26,6 +26,14 @@ dist/ # test data *.index* -CVR1/ -NRW01/ -SP1/ \ No newline at end of file +CVR1.* +NRW01.* +SP1.* +MT007544* +tmp/ +test-data/CVR1/ +test-data/NRW01/ +test-data/SP1/ +test-data/MT007544/ +rg_* +primer-schemes/ \ No newline at end of file diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 8289448f..00000000 --- a/.travis.yml +++ /dev/null @@ -1,38 +0,0 @@ -dist: xenial -language: python # this works for Linux but is an error on macOS or Windows -jobs: - include: - - name: "Python 3.6 on Xenial Linux" - python: 3.6 # this works for Linux but is ignored on macOS or Windows - - name: "Python 3.6 on macOS" - os: osx - language: shell # 'language: python' is an error on Travis CI macOS - python: 3.6 - -install: - - - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then MINICONDA_OS=Linux; else MINICONDA_OS=MacOSX; fi - - wget https://repo.anaconda.com/miniconda/Miniconda3-latest-$MINICONDA_OS-x86_64.sh -O miniconda.sh - - bash miniconda.sh -b -p "$HOME"/miniconda - - source "$HOME/miniconda/etc/profile.d/conda.sh" - - hash -r - - conda config --set always_yes yes --set changeps1 no - - conda update -q conda - - conda info -a - - conda env create -f environment.yml - - conda activate artic - - python setup.py install - -script: - # run the unit tests - - pytest -s artic/*_unit_test.py - - # run the pipeline test for the medaka pipeline variant - - ./test-runner.sh medaka - - # run the pipeline test for the nanopolish pipeline variant - - ./test-runner.sh nanopolish - - # run some of the variant set validation functional tests (will download test data) - - pytest -s artic/minion_validator.py --workFlow nanopolish --numValidations 1 - - pytest -s artic/minion_validator.py --workFlow medaka --numValidations 2 diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..272a109a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +FROM mambaorg/micromamba:1.5.8 + +COPY . ./fieldbioinformatics/ + +USER root + +RUN apt-get update && apt-get install -y --no-install-recommends build-essential wget procps + +USER $MAMBA_USER + +COPY --chown=$MAMBA_USER:$MAMBA_USER environment.yml /tmp/env.yml + +RUN sed -i 's/name: artic/name: base/' /tmp/env.yml + +RUN micromamba install --yes --file /tmp/env.yml && \ + micromamba clean --all --yes + +ARG MAMBA_DOCKERFILE_ACTIVATE=1 + +USER root + +RUN python3 -m pip install ./fieldbioinformatics + +USER $MAMBA_USER + +ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] + +CMD ["/bin/bash"] \ No newline at end of file diff --git a/README.md b/README.md index 48223c83..a2d5cf9a 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Features include: There are **2 workflows** baked into this pipeline, one which uses signal data (via [nanopolish](https://github.com/jts/nanopolish)) and one that does not (via [medaka](https://github.com/nanoporetech/medaka)). -## Installation + ### Via source #### 1. 
downloading the source: -Download a [release](https://github.com/artic-network/fieldbioinformatics/releases) or use the latest master (which tracks the current release): + + +Clone the repository, then check out the 1.4.0-dev branch to test the 1.4.0 pre-release. ```sh git clone https://github.com/artic-network/fieldbioinformatics cd fieldbioinformatics +git checkout 1.4.0-dev ``` #### 2. installing dependencies: -The `artic pipeline` has several [software dependencies](https://github.com/artic-network/fieldbioinformatics/blob/master/environment.yml). You can solve these dependencies using the minimal conda environment we have provided: +The `artic pipeline` has several [software dependencies](https://github.com/artic-network/fieldbioinformatics/blob/master/environment.yml). You can solve these dependencies using the minimal conda environment we have provided. Please make sure you are using either mamba or conda version >= 23.10.0, where the libmamba solver was made the default conda solver: ```sh conda env create -f environment.yml @@ -64,7 +64,7 @@ conda activate artic #### 3. installing the pipeline: ```sh -python setup.py install +pip install . ``` #### 4. testing the pipeline: diff --git a/artic/.gitignore b/artic/.gitignore new file mode 100644 index 00000000..096b5eb5 --- /dev/null +++ b/artic/.gitignore @@ -0,0 +1,3 @@ +# pixi environments +.pixi +*.egg-info diff --git a/artic/align_trim.py b/artic/align_trim.py index 85074db2..2a1b3f69 100755 --- a/artic/align_trim.py +++ b/artic/align_trim.py @@ -1,12 +1,13 @@ #!/usr/bin/env python -# Written by Nick Loman - from copy import copy -from collections import defaultdict +import csv import pysam import sys -from .vcftagprimersites import read_bed_file +import numpy as np +import random +import argparse +from artic.utils import read_bed_file # consumesReference lookup for if a CIGAR operation consumes the reference sequence consumesReference = [True, False, True, True, False, False, False, True] @@ -15,7 +16,7 @@ consumesQuery = [True, True, False, False, True, False, False, True] -def find_primer(bed, pos, direction): +def find_primer(bed, pos, direction, threshold=20): """Given a reference position and a direction of travel, walk out and find the nearest primer site.
Parameters @@ -29,17 +30,33 @@ def find_primer(bed, pos, direction): Returns ------- - tuple - The offset, distance and bed entry for the closest primer to the query position + tuple[int, int, dict] | bool + A tuple containing the distance to the primer, the relative position of the primer, and the primer site, or False if no primer found """ from operator import itemgetter - if direction == '+': - closest = min([(abs(p['start'] - pos), p['start'] - pos, p) - for p in bed if p['direction'] == direction], key=itemgetter(0)) + if direction == "+": + primer_distances = [ + (abs(p["start"] - pos), p["start"] - pos, p) + for p in bed + if (p["direction"] == direction) and (pos >= (p["start"] - threshold)) + ] + else: - closest = min([(abs(p['end'] - pos), p['end'] - pos, p) - for p in bed if p['direction'] == direction], key=itemgetter(0)) + primer_distances = [ + (abs(p["end"] - pos), p["end"] - pos, p) + for p in bed + if (p["direction"] == direction) and (pos <= (p["end"] + threshold)) + ] + + if not primer_distances: + return False + + closest = min( + primer_distances, + key=itemgetter(0), + ) + return closest @@ -80,18 +97,20 @@ def trim(segment, primer_pos, end, debug): print("Chomped a %s, %s" % (flag, length), file=sys.stderr) except IndexError: print( - "Ran out of cigar during soft masking - completely masked read will be ignored", file=sys.stderr) + "Ran out of cigar during soft masking - completely masked read will be ignored", + file=sys.stderr, + ) break # if the CIGAR operation consumes the reference sequence, increment/decrement the position by the CIGAR operation length - if (consumesReference[flag]): + if consumesReference[flag]: if not end: pos += length else: pos -= length # if the CIGAR operation consumes the query sequence, increment the number of CIGAR operations eaten by the CIGAR operation length - if (consumesQuery[flag]): + if consumesQuery[flag]: eaten += length # stop processing the CIGAR if we've gone far enough to mask the primer @@ -125,7 +144,9 @@ def trim(segment, primer_pos, end, debug): if cigar[0][0] == 2: if debug: print( - "softmask created a leading deletion in the CIGAR, shuffling the alignment", file=sys.stderr) + "softmask created a leading deletion in the CIGAR, shuffling the alignment", + file=sys.stderr, + ) while 1: if cigar[0][0] != 2: break @@ -146,6 +167,271 @@ def trim(segment, primer_pos, end, debug): return +def handle_segment( + segment: pysam.AlignedSegment, + bed: dict, + args: argparse.Namespace, + min_mapq: int, + report_writer: csv.DictWriter = False, +): + """Handle the alignment segment including + + Args: + segment (pysam.AlignedSegment): The alignment segment to process + bed (dict): The primer scheme + reportfh (typing.IO): The report file handle + args (argparse.Namespace): The command line arguments + + Returns: + tuple [int, pysam.AlignedSegment] | bool: A tuple containing the amplicon number and the alignment segment, or False if the segment is to be skipped + """ + + # filter out unmapped and supplementary alignment segments + if segment.is_unmapped: + print("%s skipped as unmapped" % (segment.query_name), file=sys.stderr) + return False + if segment.is_supplementary: + print("%s skipped as supplementary" % (segment.query_name), file=sys.stderr) + return False + if segment.mapping_quality < min_mapq: + print( + "%s skipped as mapping quality below threshold" % (segment.query_name), + file=sys.stderr, + ) + return False + + # locate the nearest primers to this alignment segment + p1 = find_primer(bed, segment.reference_start, 
"+", args.primer_match_threshold) + p2 = find_primer(bed, segment.reference_end, "-", args.primer_match_threshold) + + if not p1 or not p2: + print( + "%s skipped as no primer found for segment" % (segment.query_name), + file=sys.stderr, + ) + return False + + # check if primers are correctly paired and then assign read group + # NOTE: removed this as a function as only called once + # TODO: will try improving this / moving it to the primer scheme processing code + correctly_paired = p1[2]["Primer_ID"].replace("_LEFT", "") == p2[2][ + "Primer_ID" + ].replace("_RIGHT", "") + + if not args.no_read_groups: + if correctly_paired: + segment.set_tag("RG", p1[2]["PoolName"]) + else: + segment.set_tag("RG", "unmatched") + + # get the amplicon number + amplicon = p1[2]["Primer_ID"].split("_")[1] + + if args.report: + # update the report with this alignment segment + primer details + report = { + "QueryName": segment.query_name, + "ReferenceStart": segment.reference_start, + "ReferenceEnd": segment.reference_end, + "PrimerPair": f"{p1[2]['Primer_ID']}_{p2[2]['Primer_ID']}", + "Primer1": p1[2]["Primer_ID"], + "Primer1Start": abs(p1[1]), + "Primer2": p2[2]["Primer_ID"], + "Primer2Start": abs(p2[1]), + "IsSecondary": segment.is_secondary, + "IsSupplementary": segment.is_supplementary, + "Start": p1[2]["start"], + "End": p2[2]["end"], + "CorrectlyPaired": correctly_paired, + } + + report_writer.writerow(report) + + if args.remove_incorrect_pairs and not correctly_paired: + print( + "%s skipped as not correctly paired" % (segment.query_name), + file=sys.stderr, + ) + return False + + if args.verbose: + # Dont screw with the order of the dict + report_str = "\t".join(str(x) for x in report.values()) + print(report_str, file=sys.stderr) + + # get the primer positions + if args.trim_primers: + p1_position = p1[2]["end"] + p2_position = p2[2]["start"] + else: + p1_position = p1[2]["start"] + p2_position = p2[2]["end"] + + # softmask the alignment if left primer start/end inside alignment + if segment.reference_start < p1_position: + try: + trim(segment, p1_position, False, args.verbose) + if args.verbose: + print( + "ref start %s >= primer_position %s" + % (segment.reference_start, p1_position), + file=sys.stderr, + ) + except Exception as e: + print( + "problem soft masking left primer in {} (error: {}), skipping".format( + segment.query_name, e + ), + file=sys.stderr, + ) + return False + + # softmask the alignment if right primer start/end inside alignment + if segment.reference_end > p2_position: + try: + trim(segment, p2_position, True, args.verbose) + if args.verbose: + print( + "ref start %s >= primer_position %s" + % (segment.reference_start, p2_position), + file=sys.stderr, + ) + except Exception as e: + print( + "problem soft masking right primer in {} (error: {}), skipping".format( + segment.query_name, e + ), + file=sys.stderr, + ) + return False + + # check the the alignment still contains bases matching the reference + if "M" not in segment.cigarstring: + print( + "%s dropped as does not match reference post masking" + % (segment.query_name), + file=sys.stderr, + ) + return False + + return (amplicon, segment) + + +def generate_amplicons(bed: list): + """Generate a dictionary of amplicons from a primer scheme list (generated by vcftagprimersites/read_bed_file) + + Args: + bed (list): A list of dictionaries, where each dictionary contains a row of bedfile data (generated by vcftagprimersites/read_bed_file), assumes that all redundant primers have been expanded + + Raises: + ValueError: Primer 
direction not recognised + + Returns: + dict: A dictionary of amplicons, where each key is the amplicon number and the value is a dictionary containing the primer start, primer end, insert start, insert end, length and circularity + """ + + amplicons = {} + for primer in bed: + + amplicon = primer["Primer_ID"].split("_")[1] + + amplicons.setdefault(amplicon, {}) + + if primer["direction"] == "+": + amplicons[amplicon]["p_start"] = primer["start"] + amplicons[amplicon]["start"] = primer["end"] + 1 + + elif primer["direction"] == "-": + amplicons[amplicon]["p_end"] = primer["end"] + amplicons[amplicon]["end"] = primer["start"] - 1 + + else: + raise ValueError("Primer direction not recognised") + + for amplicon in amplicons: + if not all([x in amplicons[amplicon] for x in ["p_start", "p_end"]]): + raise ValueError(f"Primer scheme for amplicon {amplicon} is incomplete") + + # Check if primer runs accross reference start / end -> circular virus + amplicons[amplicon]["circular"] = ( + amplicons[amplicon]["p_start"] > amplicons[amplicon]["p_end"] + ) + + # Calculate amplicon length considering that the "length" may be negative if the genome is circular + amplicons[amplicon]["length"] = abs( + amplicons[amplicon]["p_end"] - amplicons[amplicon]["p_start"] + ) + + return amplicons + + +def normalise(trimmed_segments: dict, normalise: int, bed: list): + """Normalise the depth of the trimmed segments to a given value. Perform per-amplicon normalisation using numpy vector maths to determine whether the segment in question would take the depth closer to the desired depth accross the amplicon. + + Args: + trimmed_segments (dict): Dict containing amplicon number as key and list of pysam.AlignedSegment as value + normalise (int): Desired normalised depth + bed (list): Primer scheme list (generated by vcftagprimersites/read_bed_file) + trim_primers (bool): Whether to trim primers from the reads + + Raises: + ValueError: Amplicon assigned to segment not found in primer scheme file + + Returns: + list : List of pysam.AlignedSegment to output + """ + + amplicons = generate_amplicons(bed) + + output_segments = [] + + mean_depths = {x: 0 for x in amplicons} + + for amplicon, segments in trimmed_segments.items(): + if amplicon not in amplicons: + raise ValueError(f"Segment {amplicon} not found in primer scheme file") + + desired_depth = np.full_like( + (amplicons[amplicon]["length"],), normalise, dtype=int + ) + + amplicon_depth = np.zeros((amplicons[amplicon]["length"],), dtype=int) + + if not segments: + print( + f"No segments assigned to amplicon {amplicon}, skipping", + file=sys.stderr, + ) + continue + + random.shuffle(segments) + + distance = np.mean(np.abs(amplicon_depth - desired_depth)) + + for segment in segments: + test_depths = np.copy(amplicon_depth) + + relative_start = segment.reference_start - amplicons[amplicon]["p_start"] + + if relative_start < 0: + relative_start = 0 + + relative_end = segment.reference_end - amplicons[amplicon]["p_start"] + + test_depths[relative_start:relative_end] += 1 + + test_distance = np.mean(np.abs(test_depths - desired_depth)) + + if test_distance < distance: + amplicon_depth = test_depths + distance = test_distance + output_segments.append(segment) + + mean_depths[amplicon] = np.mean(amplicon_depth) + + return output_segments, mean_depths + + def go(args): """Filter and soft mask an alignment file so that the alignment boundaries match the primer start and end sites. 
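The greedy per-amplicon normalisation introduced in the new `normalise()` function above can be hard to follow inside the diff, so here is a minimal standalone sketch of the same selection idea under simplified assumptions: reads are plain `(start, end)` intervals in amplicon coordinates rather than `pysam.AlignedSegment` objects, and the helper name `normalise_amplicon` plus the toy numbers are illustrative only, not part of the patch.

```python
# Minimal sketch of the greedy normalisation idea, assuming toy (start, end)
# read intervals in amplicon coordinates (illustrative only).
import random
import numpy as np


def normalise_amplicon(reads, amplicon_length, target_depth):
    """Keep a read only if adding it moves the depth profile closer to target_depth."""
    depth = np.zeros(amplicon_length, dtype=int)
    desired = np.full(amplicon_length, target_depth, dtype=int)
    kept = []
    random.shuffle(reads)
    # mean absolute deviation between current and desired depth
    distance = np.mean(np.abs(depth - desired))
    for start, end in reads:
        trial = depth.copy()
        trial[max(start, 0):end] += 1
        trial_distance = np.mean(np.abs(trial - desired))
        if trial_distance < distance:
            # accepting this read brings coverage closer to the target, so keep it
            depth, distance = trial, trial_distance
            kept.append((start, end))
    return kept, depth.mean()


# Toy example: 30 identical full-length reads over a 400 bp amplicon, target depth 5.
kept, mean_depth = normalise_amplicon([(0, 400)] * 30, 400, 5)
print(len(kept), mean_depth)  # expect 5 reads kept, mean depth ~5
```

Because a read is only accepted when it reduces the mean absolute deviation from the target depth, coverage is capped near the `--normalise` value on each amplicon without any re-sorting of the shuffled reads.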
@@ -154,119 +440,90 @@ def go(args): # prepare the report outfile if args.report: reportfh = open(args.report, "w") - print("QueryName\tReferenceStart\tReferenceEnd\tPrimerPair\tPrimer1\tPrimer1Start\tPrimer2\tPrimer2Start\tIsSecondary\tIsSupplementary\tStart\tEnd\tCorrectlyPaired", file=reportfh) - - # set up a counter to track amplicon abundance - counter = defaultdict(int) + report_headers = [ + "QueryName", + "ReferenceStart", + "ReferenceEnd", + "PrimerPair", + "Primer1", + "Primer1Start", + "Primer2", + "Primer2Start", + "IsSecondary", + "IsSupplementary", + "Start", + "End", + "CorrectlyPaired", + ] + report_writer = csv.DictWriter(reportfh, fieldnames=report_headers) + report_writer.writeheader() # open the primer scheme and get the pools bed = read_bed_file(args.bedfile) - pools = set([row['PoolName'] for row in bed]) - pools.add('unmatched') + pools = set([row["PoolName"] for row in bed]) + pools.add("unmatched") # open the input SAM file and process read groups infile = pysam.AlignmentFile("-", "rb") bam_header = infile.header.copy().to_dict() if not args.no_read_groups: - bam_header['RG'] = [] + bam_header["RG"] = [] for pool in pools: read_group = {} - read_group['ID'] = pool - bam_header['RG'].append(read_group) + read_group["ID"] = pool + bam_header["RG"].append(read_group) # prepare the alignment outfile outfile = pysam.AlignmentFile("-", "wh", header=bam_header) + trimmed_segments = {} + # iterate over the alignment segments in the input SAM file for segment in infile: - - # filter out unmapped and supplementary alignment segments - if segment.is_unmapped: - print("%s skipped as unmapped" % - (segment.query_name), file=sys.stderr) - continue - if segment.is_supplementary: - print("%s skipped as supplementary" % - (segment.query_name), file=sys.stderr) - continue - - # locate the nearest primers to this alignment segment - p1 = find_primer(bed, segment.reference_start, '+') - p2 = find_primer(bed, segment.reference_end, '-') - - # check if primers are correctly paired and then assign read group - # NOTE: removed this as a function as only called once - # TODO: will try improving this / moving it to the primer scheme processing code - correctly_paired = p1[2]['Primer_ID'].replace( - '_LEFT', '') == p2[2]['Primer_ID'].replace('_RIGHT', '') - if not args.no_read_groups: - if correctly_paired: - segment.set_tag('RG', p1[2]['PoolName']) - else: - segment.set_tag('RG', 'unmatched') - if args.remove_incorrect_pairs and not correctly_paired: - print("%s skipped as not correctly paired" % - (segment.query_name), file=sys.stderr) - continue - - # update the report with this alignment segment + primer details - report = "%s\t%s\t%s\t%s_%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%d" % (segment.query_name, segment.reference_start, segment.reference_end, p1[2]['Primer_ID'], p2[2]['Primer_ID'], p1[2]['Primer_ID'], abs( - p1[1]), p2[2]['Primer_ID'], abs(p2[1]), segment.is_secondary, segment.is_supplementary, p1[2]['start'], p2[2]['end'], correctly_paired) if args.report: - print(report, file=reportfh) - if args.verbose: - print(report, file=sys.stderr) - - # get the primer positions - if args.start: - p1_position = p1[2]['start'] - p2_position = p2[2]['end'] + trimming_tuple = handle_segment( + segment=segment, + bed=bed, + args=args, + report_writer=report_writer, + min_mapq=args.min_mapq, + ) else: - p1_position = p1[2]['end'] - p2_position = p2[2]['start'] - - # softmask the alignment if left primer start/end inside alignment - if segment.reference_start < p1_position: - try: - trim(segment, 
p1_position, False, args.verbose) - if args.verbose: - print("ref start %s >= primer_position %s" % - (segment.reference_start, p1_position), file=sys.stderr) - except Exception as e: - print("problem soft masking left primer in {} (error: {}), skipping" .format( - segment.query_name, e), file=sys.stderr) - continue - - # softmask the alignment if right primer start/end inside alignment - if segment.reference_end > p2_position: - try: - trim(segment, p2_position, True, args.verbose) - if args.verbose: - print("ref start %s >= primer_position %s" % - (segment.reference_start, p2_position), file=sys.stderr) - except Exception as e: - print("problem soft masking right primer in {} (error: {}), skipping" .format( - segment.query_name, e), file=sys.stderr) - continue - - # normalise if requested - if args.normalise: - pair = "%s-%s-%d" % (p1[2]['Primer_ID'], - p2[2]['Primer_ID'], segment.is_reverse) - counter[pair] += 1 - if counter[pair] > args.normalise: - print("%s dropped as abundance theshold reached" % - (segment.query_name), file=sys.stderr) - continue - - # check the the alignment still contains bases matching the reference - if 'M' not in segment.cigarstring: - print("%s dropped as does not match reference post masking" % - (segment.query_name), file=sys.stderr) + trimming_tuple = handle_segment( + segment=segment, + bed=bed, + args=args, + min_mapq=args.min_mapq, + ) + if not trimming_tuple: continue - # current alignment segment has passed filters, send it to the outfile - outfile.write(segment) + # unpack the trimming tuple since segment passed trimming + amplicon, trimmed_segment = trimming_tuple + trimmed_segments.setdefault(amplicon, []) + + if trimmed_segment: + trimmed_segments[amplicon].append(trimmed_segment) + + # normalise if requested + if args.normalise: + output_segments, mean_amp_depths = normalise( + trimmed_segments, args.normalise, bed + ) + + # write mean amplicon depths to file + if args.amp_depth_report: + with open(args.amp_depth_report, "w") as amp_depth_report_fh: + amp_depth_report_fh.write("amplicon\tmean_depth\n") + for amplicon, depth in mean_amp_depths.items(): + amp_depth_report_fh.write(f"{amplicon}\t{depth}\n") + + for output_segment in output_segments: + outfile.write(output_segment) + else: + for amplicon, segments in trimmed_segments.items(): + for segment in segments: + outfile.write(segment) # close up the file handles infile.close() @@ -276,21 +533,39 @@ def go(args): def main(): - import argparse - parser = argparse.ArgumentParser( - description='Trim alignments from an amplicon scheme.') + description="Trim alignments from an amplicon scheme." 
+ ) + parser.add_argument("bedfile", help="BED file containing the amplicon scheme") + parser.add_argument( + "--normalise", type=int, help="Subsample to n coverage per strand" + ) + parser.add_argument( + "--min-mapq", type=int, default=20, help="Minimum mapping quality to keep" + ) + parser.add_argument( + "--primer-match-threshold", + type=int, + default=5, + help="Fuzzy match primer positions within this threshold", + ) + parser.add_argument("--report", type=str, help="Output report to file") + parser.add_argument( + "--amp-depth-report", type=str, help="Output amplicon depth tsv to path" + ) + parser.add_argument( + "--trim-primers", + action="store_true", + help="Trims primers from reads", + ) parser.add_argument( - 'bedfile', help='BED file containing the amplicon scheme') - parser.add_argument('--normalise', type=int, - help='Subsample to n coverage per strand') - parser.add_argument('--report', type=str, help='Output report to file') - parser.add_argument('--start', action='store_true', - help='Trim to start of primers instead of ends') - parser.add_argument('--no-read-groups', dest='no_read_groups', - action='store_true', help='Do not divide reads into groups in SAM output') - parser.add_argument('--verbose', action='store_true', help='Debug mode') - parser.add_argument('--remove-incorrect-pairs', action='store_true') + "--no-read-groups", + dest="no_read_groups", + action="store_true", + help="Do not divide reads into groups in SAM output", + ) + parser.add_argument("--verbose", action="store_true", help="Debug mode") + parser.add_argument("--remove-incorrect-pairs", action="store_true") args = parser.parse_args() go(args) diff --git a/artic/artic_mqc.py b/artic/artic_mqc.py index 763316fc..08ef102a 100644 --- a/artic/artic_mqc.py +++ b/artic/artic_mqc.py @@ -2,9 +2,8 @@ import json import re import sys -from collections import OrderedDict -from .vcftagprimersites import read_bed_file +from artic.utils import read_bed_file # Alignment_Length_Threshold drops binned reads that are len(REF): - print("Skipping insertion at position: %s" % (record.POS), file=sys.stderr) + print( + "Skipping insertion at position: %s" % (record.POS), file=sys.stderr + ) continue - if depths[record.CHROM][record.POS] >= args.depth and record.QUAL >= args.quality: + if ( + depths[record.CHROM][record.POS] >= args.depth + and record.QUAL >= args.quality + ): if len(REF) > len(ALT): - print("N-masking confident deletion at %s" % (record.POS), file=sys.stderr) + print( + "N-masking confident deletion at %s" % (record.POS), + file=sys.stderr, + ) for n in range(len(REF)): - cons[record.CHROM][record.POS-1+n] = 'N' + cons[record.CHROM][record.POS - 1 + n] = "N" continue reporter.report(record, "variant", ALT) @@ -117,7 +130,7 @@ def go(args): if len(ALT) > len(REF): print("insertion", file=sys.stderr) continue - cons[record.CHROM][record.POS-1] = str(ALT) + cons[record.CHROM][record.POS - 1] = str(ALT) elif len(REF) > len(ALT): continue else: @@ -125,25 +138,33 @@ def go(args): reporter.report(record, "low_depth_variant", "n") else: reporter.report(record, "low_qual_variant", "n") - #cons[record.CHROM][record.POS-1] = 'N' - continue + # cons[record.CHROM][record.POS-1] = 'N' + continue - #print >>sys.stderr, str(sett) + # print >>sys.stderr, str(sett) for k in seqs.keys(): - print(">%s-%s" % (args.bamfile, k)) - print("".join(cons[k])) + print(">%s-%s" % (args.bamfile, k)) + print("".join(cons[k])) + def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--depth', type=int, default=5, 
help='minimum depth to call a variant') - parser.add_argument('--quality', type=int, default=0, help='minimum quality to call a variant') - parser.add_argument('--masked', help='Regions to mask (contig:start-end,contig:start-end)') - parser.add_argument('reference') - parser.add_argument('vcffile') - parser.add_argument('bamfile') - args = parser.parse_args() - go(args) + parser = argparse.ArgumentParser() + parser.add_argument( + "--depth", type=int, default=5, help="minimum depth to call a variant" + ) + parser.add_argument( + "--quality", type=int, default=0, help="minimum quality to call a variant" + ) + parser.add_argument( + "--masked", help="Regions to mask (contig:start-end,contig:start-end)" + ) + parser.add_argument("reference") + parser.add_argument("vcffile") + parser.add_argument("bamfile") + args = parser.parse_args() + go(args) + if __name__ == "__main__": main() diff --git a/artic/deprecated/setup.py b/artic/deprecated/setup.py new file mode 100644 index 00000000..84e1575a --- /dev/null +++ b/artic/deprecated/setup.py @@ -0,0 +1,48 @@ +import os +from setuptools import setup + +version_py = os.path.join(os.path.dirname(__file__), "artic", "version.py") +version = open(version_py).read().strip().split("=")[-1].replace('"', "").strip() +long_description = """ +``artic`` is a pipeline for working with virus sequencing data sequenced with nanopore +""" + +HERE = os.path.dirname(__file__) + +setup( + name="artic", + version=version, + requires=["python (>=3.5)"], + packages=["artic"], + author="Nick Loman", + description="A toolset for working with nanopore sequencing data", + long_description=long_description, + url="https://artic.network/ncov-2019/ncov2019-bioinformatics-sop.html", + package_dir={"artic": "artic"}, + package_data={"artic": []}, + zip_safe=False, + include_package_data=True, + entry_points={ + "console_scripts": [ + "artic=artic.pipeline:main", + "align_trim=artic.align_trim:main", + "align_trim_n=artic.align_trim_n:main", + "margin_cons=artic.margin_cons:main", + "margin_cons_medaka=artic.margin_cons_medaka:main", + "vcfextract=artic.vcfextract:main", + "artic_vcf_merge=artic.vcf_merge:main", + "artic_vcf_filter=artic.vcf_filter:main", + "artic_make_depth_mask=artic.make_depth_mask:main", + "artic_fasta_header=artic.fasta_header:main", + "artic_mask=artic.mask:main", + "artic_get_stats=artic.artic_mqc:main", + ], + }, + author_email="n.j.loman@bham.ac.uk", + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT", + "Topic :: Scientific/Engineering :: Bio-Informatics", + ], +) diff --git a/artic/export.py b/artic/export.py index 77d6c01b..0d32484a 100755 --- a/artic/export.py +++ b/artic/export.py @@ -1,38 +1,60 @@ -#Written by Nick Loman (@pathogenomenick) +# Written by Nick Loman (@pathogenomenick) import os -import sys import pandas as pd from Bio import SeqIO -def run(parser, args): - if not os.path.exists(args.output_directory): - os.mkdir(args.output_directory) - - query_set = set() - - fqfn = "%s/%s.fastq" % (args.output_directory, args.prefix,) - cmd = "samtools view -h -F 4 %s | samtools fastq - -0 %s" % (args.bamfile, fqfn) - print (cmd) - os.system(cmd) - - fn = "%s/%s.reads.list" % (args.output_directory, args.prefix,) - fh = open(fn, "w") - for rec in SeqIO.parse(open(fqfn), "fastq"): - fh.write("%s\n" % (rec.id)) - fh.close() - - cmd = "fast5_subset -r -i \"%s\" -l \"%s/%s.reads.list\" -f \"%s_\" -n 4000 -s \"%s/fast5\"" % ( - args.fast5_directory, args.output_directory, 
args.prefix, args.prefix, args.output_directory) - os.system(cmd) - mappingfn = "%s/fast5/filename_mapping.txt" % (args.output_directory) - mapping_file = pd.read_csv(mappingfn, names=('read_id', 'filename'), sep='\t', index_col='read_id', header=None) - - summary_file = pd.read_csv(args.sequencing_summary, sep='\t', low_memory=False) - filtered_summary_file = summary_file[summary_file.read_id.isin(query_set)].copy() - - filtered_summary_file['filename'] = filtered_summary_file.apply(lambda row: mapping_file.loc[row['read_id']]['filename'] , axis=1) - - summaryfn = "%s/%s_sequencing_summary.txt" % (args.output_directory, args.prefix) - filtered_summary_file.to_csv(summaryfn, sep="\t", index=False) +def run(parser, args): + if not os.path.exists(args.output_directory): + os.mkdir(args.output_directory) + + query_set = set() + + fqfn = "%s/%s.fastq" % ( + args.output_directory, + args.prefix, + ) + cmd = "samtools view -h -F 4 %s | samtools fastq - -0 %s" % (args.bamfile, fqfn) + print(cmd) + os.system(cmd) + + fn = "%s/%s.reads.list" % ( + args.output_directory, + args.prefix, + ) + fh = open(fn, "w") + for rec in SeqIO.parse(open(fqfn), "fastq"): + fh.write("%s\n" % (rec.id)) + fh.close() + + cmd = ( + 'fast5_subset -r -i "%s" -l "%s/%s.reads.list" -f "%s_" -n 4000 -s "%s/fast5"' + % ( + args.fast5_directory, + args.output_directory, + args.prefix, + args.prefix, + args.output_directory, + ) + ) + os.system(cmd) + + mappingfn = "%s/fast5/filename_mapping.txt" % (args.output_directory) + mapping_file = pd.read_csv( + mappingfn, + names=("read_id", "filename"), + sep="\t", + index_col="read_id", + header=None, + ) + + summary_file = pd.read_csv(args.sequencing_summary, sep="\t", low_memory=False) + filtered_summary_file = summary_file[summary_file.read_id.isin(query_set)].copy() + + filtered_summary_file["filename"] = filtered_summary_file.apply( + lambda row: mapping_file.loc[row["read_id"]]["filename"], axis=1 + ) + + summaryfn = "%s/%s_sequencing_summary.txt" % (args.output_directory, args.prefix) + filtered_summary_file.to_csv(summaryfn, sep="\t", index=False) diff --git a/artic/fasta_header.py b/artic/fasta_header.py index 16c89432..b470ae22 100644 --- a/artic/fasta_header.py +++ b/artic/fasta_header.py @@ -1,11 +1,11 @@ from Bio import SeqIO -import sys + def fasta_header(fn, header): fh = open(fn) rec = list(SeqIO.parse(fh, "fasta")) if len(rec) > 1: - print ("Sorry, this script doesn't support multi-FASTA files!") + print("Sorry, this script doesn't support multi-FASTA files!") raise SystemExit fh.close() @@ -14,15 +14,19 @@ def fasta_header(fn, header): SeqIO.write([rec[0]], fh, "fasta") fh.close() + def main(): import argparse - parser = argparse.ArgumentParser(description='Trim alignments from an amplicon scheme.') - parser.add_argument('filename') - parser.add_argument('header') + parser = argparse.ArgumentParser( + description="Trim alignments from an amplicon scheme." 
+ ) + parser.add_argument("filename") + parser.add_argument("header") args = parser.parse_args() fasta_header(args.filename, args.header) + if __name__ == "__main__": main() diff --git a/artic/filter_reads.py b/artic/filter_reads.py index 571d990b..fa39baef 100644 --- a/artic/filter_reads.py +++ b/artic/filter_reads.py @@ -1,31 +1,27 @@ import sys from Bio import SeqIO -import tempfile -import os -import glob -import shutil # extract with constraints: # -- only one group ever # -- only one flowcell ID ever # -- always unique read ID -def run(parser, args): - dups = set() - uniq = 0 - total = 0 - for rec in SeqIO.parse(open(args.filename), "fastq"): - if args.max_length and len(rec) > args.max_length: - continue - if args.min_length and len(rec) < args.min_length: - continue - total += 1 - if rec.id not in dups: - SeqIO.write([rec], sys.stdout, "fastq") +def run(parser, args): + dups = set() + uniq = 0 + total = 0 + for rec in SeqIO.parse(open(args.filename), "fastq"): + if args.max_length and len(rec) > args.max_length: + continue + if args.min_length and len(rec) < args.min_length: + continue - dups.add(rec.id) - uniq += 1 + total += 1 + if rec.id not in dups: + SeqIO.write([rec], sys.stdout, "fastq") - print("%s\t%s\t%s" % (total, uniq), file=sys.stderr) + dups.add(rec.id) + uniq += 1 + print("%s\t%s\t%s" % (total, uniq), file=sys.stderr) diff --git a/artic/guppyplex.py b/artic/guppyplex.py index 7889d52e..d6dbf6f0 100644 --- a/artic/guppyplex.py +++ b/artic/guppyplex.py @@ -1,13 +1,9 @@ import sys from Bio import SeqIO -import tempfile import os -import glob import gzip import fnmatch -import shutil import pandas as pd -from collections import defaultdict from mimetypes import guess_type from functools import partial from math import log10 @@ -24,21 +20,33 @@ def get_read_mean_quality(record): - return -10 * log10((10 ** (pd.Series(record.letter_annotations["phred_quality"]) / -10)).mean()) + return -10 * log10( + (10 ** (pd.Series(record.letter_annotations["phred_quality"]) / -10)).mean() + ) def run(parser, args): files = os.listdir(args.directory) - fastq_files = [os.path.join(args.directory, f) for f in files if fnmatch.fnmatch(f, '*.fastq*') and not f.endswith('.temp')] + fastq_files = [ + os.path.join(args.directory, f) + for f in files + if fnmatch.fnmatch(f, "*.fastq*") and not f.endswith(".temp") + ] if fastq_files: if not args.output: - fastq_outfn = "%s_%s.fastq" % (args.prefix, os.path.basename(args.directory)) + fastq_outfn = "%s_%s.fastq" % ( + args.prefix, + os.path.basename(args.directory), + ) else: fastq_outfn = args.output outfh = open(fastq_outfn, "w") - print("Processing %s files in %s" % (len(fastq_files), args.directory), file=sys.stderr) + print( + "Processing %s files in %s" % (len(fastq_files), args.directory), + file=sys.stderr, + ) dups = set() @@ -51,20 +59,23 @@ def run(parser, args): with _open(fn) as f: try: for rec in SeqIO.parse(f, "fastq"): - if args.max_length and len(rec) > args.max_length: - continue - if args.min_length and len(rec) < args.min_length: - continue - if not args.skip_quality_check and get_read_mean_quality(rec) < args.quality: - continue - if args.sample < 1: - r = random() - if r >= args.sample: - continue + if args.max_length and len(rec) > args.max_length: + continue + if args.min_length and len(rec) < args.min_length: + continue + if ( + not args.skip_quality_check + and get_read_mean_quality(rec) < args.quality + ): + continue + if args.sample < 1: + r = random() + if r >= args.sample: + continue - if rec.id not in dups: - 
SeqIO.write([rec], outfh, "fastq") - dups.add(rec.id) + if rec.id not in dups: + SeqIO.write([rec], outfh, "fastq") + dups.add(rec.id) except ValueError: pass diff --git a/artic/make_depth_mask.py b/artic/make_depth_mask.py index 14bfd1a4..579eca18 100755 --- a/artic/make_depth_mask.py +++ b/artic/make_depth_mask.py @@ -38,13 +38,12 @@ def collect_depths(bamfile, refName, minDepth, ignoreDeletions, warnRGcov): raise Exception("bamfile doesn't exist (%s)" % bamfile) # open the BAM file - bamFile = pysam.AlignmentFile(bamfile, 'rb') + bamFile = pysam.AlignmentFile(bamfile, "rb") # get the TID for the reference tid = bamFile.get_tid(refName) if tid == -1: - raise Exception( - "bamfile does not contain specified reference (%s)" % refName) + raise Exception("bamfile does not contain specified reference (%s)" % refName) # create a depth vector to hold the depths at each reference position depths = [0] * bamFile.get_reference_length(refName) @@ -53,10 +52,10 @@ def collect_depths(bamfile, refName, minDepth, ignoreDeletions, warnRGcov): rgDepths = {} # get the read groups and init the depth vectors - for rg in bamFile.header['RG']: - if rg['ID'] == 'unmatched': + for rg in bamFile.header["RG"]: + if rg["ID"] == "unmatched": continue - rgDepths[rg['ID']] = [0] * bamFile.get_reference_length(refName) + rgDepths[rg["ID"]] = [0] * bamFile.get_reference_length(refName) # flag to state if BAM file has low readgroup coverage lowRGcov = False @@ -65,13 +64,15 @@ def collect_depths(bamfile, refName, minDepth, ignoreDeletions, warnRGcov): lowRGvec = [] # generate the pileup - for pileupcolumn in bamFile.pileup(refName, max_depth=10000, truncate=False, min_base_quality=0): + for pileupcolumn in bamFile.pileup( + refName, start=0, stop=bamFile.get_reference_length(refName), max_depth=10000, truncate=False, min_base_quality=0 + ): # process the pileup column for pileupread in pileupcolumn.pileups: # get the read group for this pileup read and check it's in the BAM header - rg = pileupread.alignment.get_tag('RG') + rg = pileupread.alignment.get_tag("RG") assert rg in rgDepths, "alignment readgroup not in BAM header: %s" % rg # process the pileup read @@ -112,9 +113,12 @@ def collect_depths(bamfile, refName, minDepth, ignoreDeletions, warnRGcov): # get the regions and print warning regions = list(intervals_extract(lowRGvec)) sys.stderr.write( - "alignment has unmasked regions where individual readgroup depth < {}: {}\n" .format(minDepth, bamfile)) + "alignment has unmasked regions where individual readgroup depth < {}: {}\n".format( + minDepth, bamfile + ) + ) for region in regions: - sys.stderr.write("region: %s\n" % str(region).strip('[]')) + sys.stderr.write("region: %s\n" % str(region).strip("[]")) # close file and return depth vector bamFile.close() @@ -138,8 +142,13 @@ def go(args): # collect the depths from the pileup, replacing any depth len(REF): - print("Skipping insertion at position: %s" % (record.POS), file=sys.stderr) + print( + "Skipping insertion at position: %s" % (record.POS), file=sys.stderr + ) continue if qual >= 200 and total_reads >= 20: if len(REF) > len(ALT): - print("N-masking confident deletion at %s" % (record.POS), file=sys.stderr) + print( + "N-masking confident deletion at %s" % (record.POS), + file=sys.stderr, + ) for n in range(len(REF)): - cons[record.POS-1+n] = 'N' + cons[record.POS - 1 + n] = "N" continue reporter.report(record, "variant", ALT) @@ -107,26 +123,29 @@ def go(args): print("insertion", file=sys.stderr) continue - cons[record.POS-1] = str(ALT) + cons[record.POS - 
1] = str(ALT) elif len(REF) > len(ALT): continue else: reporter.report(record, "low_qual_variant", "n") - cons[record.POS-1] = 'N' - continue + cons[record.POS - 1] = "N" + continue print(">%s" % (sys.argv[3])) print("".join(cons)) def main(): - parser = argparse.ArgumentParser() - parser.add_argument('--depth', type=int, default=20, help='minimum depth to call a variant') - parser.add_argument('reference') - parser.add_argument('vcffile') - parser.add_argument('bamfile') - args = parser.parse_args() - go(args) + parser = argparse.ArgumentParser() + parser.add_argument( + "--depth", type=int, default=20, help="minimum depth to call a variant" + ) + parser.add_argument("reference") + parser.add_argument("vcffile") + parser.add_argument("bamfile") + args = parser.parse_args() + go(args) + if __name__ == "__main__": main() diff --git a/artic/mask.py b/artic/mask.py index e00c0f6e..97134715 100755 --- a/artic/mask.py +++ b/artic/mask.py @@ -1,25 +1,25 @@ #!/usr/bin/env python from Bio import SeqIO -import sys -import vcf -import subprocess -from collections import defaultdict -import os.path -import operator -from .vcftagprimersites import read_bed_file +from cyvcf2 import VCF import argparse import pandas as pd + def read_3col_bed(fn): # read the primer scheme into a pandas dataframe and run type, length and null checks - bedfile = pd.read_csv(fn, sep='\t', header=None, - names=['chrom', 'start', 'end'], - dtype={'chrom': str, 'start': int, 'end': int}, - usecols=(0, 1, 2), - skiprows=0) + bedfile = pd.read_csv( + fn, + sep="\t", + header=None, + names=["chrom", "start", "end"], + dtype={"chrom": str, "start": int, "end": int}, + usecols=(0, 1, 2), + skiprows=0, + ) return bedfile + def go(args): seqs = dict([(rec.id, rec) for rec in SeqIO.parse(open(args.reference), "fasta")]) cons = {} @@ -28,30 +28,30 @@ def go(args): bedfile = read_3col_bed(args.maskfile) for _, region in bedfile.iterrows(): - for n in range(region['start'], region['end']): - cons[region['chrom']][n] = 'N' + for n in range(region["start"], region["end"]): + cons[region["chrom"]][n] = "N" - sett = set() - vcf_reader = vcf.Reader(open(args.maskvcf, 'r')) + vcf_reader = VCF(args.maskvcf) for record in vcf_reader: for n in range(0, len(record.REF)): - cons[record.CHROM][record.POS-1+n] = 'N' + cons[record.CHROM][record.POS - 1 + n] = "N" - fh = open(args.output, 'w') + fh = open(args.output, "w") for k in seqs.keys(): - fh.write(">%s\n" % (k)) - fh.write(("".join(cons[k]))+'\n') + fh.write(">%s\n" % (k)) + fh.write(("".join(cons[k])) + "\n") fh.close() def main(): parser = argparse.ArgumentParser() - parser.add_argument('reference') - parser.add_argument('maskfile') - parser.add_argument('maskvcf') - parser.add_argument('output') + parser.add_argument("reference") + parser.add_argument("maskfile") + parser.add_argument("maskvcf") + parser.add_argument("output") args = parser.parse_args() go(args) + if __name__ == "__main__": main() diff --git a/artic/minion.py b/artic/minion.py index 017549f7..60e065e1 100755 --- a/artic/minion.py +++ b/artic/minion.py @@ -1,161 +1,80 @@ -#Written by Nick Loman (@pathogenomenick) +# Written by Nick Loman (@pathogenomenick) -from clint.textui import colored, puts, indent -from Bio import SeqIO +from clint.textui import colored import os import sys import time -import requests -import hashlib -from .vcftagprimersites import read_bed_file - -def get_nanopolish_header(ref): - """Checks the reference sequence for a single fasta entry. 
- - Parameters - ---------- - ref : str - The fasta file containing the reference sequence - - Returns - ------- - str - A formatted header string for nanopolish - """ - recs = list(SeqIO.parse(open(ref), "fasta")) - if len (recs) != 1: - print("FASTA has more than one sequence", file=sys.stderr) - raise SystemExit(1) - return "%s:%d-%d" % (recs[0].id, 1, len(recs[0])+1) +from artic.utils import read_bed_file, get_scheme + + +def run(parser, args): -def check_scheme_hashes(filepath, manifest_hash): - with open(filepath, "rb") as fh: - data = fh.read() - hash_sha256 = hashlib.sha256(data).hexdigest() - if hash_sha256 != manifest_hash: + if any([args.bed, args.ref]) and any( + [ + args.scheme_name, + args.scheme_version, + args.scheme_length, + ] + ): print( - colored.yellow( - f"sha256 hash for {str(filepath)} does not match manifest" + colored.red( + "You cannot mix 'Remote Scheme Options' with 'Local Scheme Options', for more information run 'artic minion --help'" ), file=sys.stderr, ) raise SystemExit(1) -def get_scheme(scheme_name, scheme_directory, scheme_version="1"): - """Get and check the ARTIC primer scheme. - When determining a version, the original behaviour (parsing the scheme_name and - separating on /V ) is used over a specified scheme_version. If neither are - provided, the version defaults to 1. - If 0 is provided as version, the latest scheme will be downloaded. - - Parameters - ---------- - scheme_name : str - The primer scheme name - scheme_directory : str - The directory containing the primer scheme and reference sequence - scheme_version : str - The primer scheme version (optional) - Returns - ------- - str - The location of the checked primer scheme - str - The location of the checked reference sequence - str - The version being used - """ - # try getting the version from the scheme name (old behaviour) - if scheme_name.find('/V') != -1: - scheme_name, scheme_version = scheme_name.split('/V') - - # create the filenames and check they exist - bed = "%s/%s/V%s/%s.scheme.bed" % (scheme_directory, scheme_name, scheme_version, scheme_name) - ref = "%s/%s/V%s/%s.reference.fasta" % (scheme_directory, scheme_name, scheme_version, scheme_name) - if os.path.exists(bed) and os.path.exists(ref): - return bed, ref, scheme_version - - # if they don't exist, try downloading them to the current directory - print( - colored.yellow( - "could not find primer scheme and reference sequence, downloading" - ), - file=sys.stderr, - ) - - try: - manifest = requests.get("https://raw.githubusercontent.com/artic-network/primer-schemes/master/schemes_manifest.json").json() - except requests.exceptions.RequestException as error: - print("Manifest Exception:", error) - raise SystemExit(2) - - for scheme, scheme_contents in dict(manifest["schemes"]).items(): - if scheme == scheme_name.lower() or scheme_name.lower() in scheme_contents["aliases"]: - print( - colored.yellow( - f"\tfound requested scheme:\t{scheme} (using alias {scheme_name})" - ), - file=sys.stderr, - ) - if scheme_version == 0: - print( - colored.yellow( - f"Latest version for scheme {scheme} is -> {scheme_contents['latest_version']}" - ), - file=sys.stderr, - ) - scheme_version = scheme_contents["latest_version"] - elif scheme_version not in dict(scheme_contents["primer_urls"]).keys(): - print( - colored.yellow( - f"Requested scheme version {scheme_version} not found; using latest version ({scheme_contents['latest_version']}) instead" - ), - file=sys.stderr, - ) - scheme_version = scheme_contents["latest_version"] - bed = 
"%s/%s/V%s/%s.scheme.bed" % (scheme_directory, scheme_name, scheme_version, scheme_name) - ref = "%s/%s/V%s/%s.reference.fasta" % (scheme_directory, scheme_name, scheme_version, scheme_name) - - - os.makedirs(os.path.dirname(bed), exist_ok=True) - with requests.get(scheme_contents["primer_urls"][scheme_version]) as fh: - open(bed, 'wt').write(fh.text) - - os.makedirs(os.path.dirname(ref), exist_ok=True) - with requests.get(scheme_contents["reference_urls"][scheme_version]) as fh: - open(ref, 'wt').write(fh.text) - - check_scheme_hashes(bed, scheme_contents["primer_sha256_checksums"][scheme_version]) - check_scheme_hashes(ref, scheme_contents["reference_sha256_checksums"][scheme_version]) - - return bed, ref, scheme_version - - print( - colored.yellow( - f"\tRequested scheme:\t{scheme_name} could not be found, exiting" - ), - file=sys.stderr, - ) - raise SystemExit(1) + if any([args.bed, args.ref]) and not all([args.bed, args.ref]): + print( + colored.red( + "You must provide both a BED file and a reference sequence for local scheme options" + ), + file=sys.stderr, + ) + raise SystemExit(1) -def run(parser, args): + if any([args.scheme_name, args.scheme_version, args.scheme_length]) and not all( + [args.scheme_name, args.scheme_version] + ): + print( + colored.red( + "You must provide a scheme name and a scheme version at minimum for remote scheme options" + ), + file=sys.stderr, + ) + raise SystemExit(1) - # check for medaka-model - if args.medaka and (args.medaka_model is None): - print(colored.red('Must specify --medaka-model if using the --medaka workflow.')) + # check for model + if not args.model: + print( + colored.red("Must specify --model for clair3 or medaka variant calling"), + ) raise SystemExit(1) - # 1) check the parameters and set up the filenames + # check the parameters and set up the filenames ## find the primer scheme, reference sequence and confirm scheme version - bed, ref, _ = get_scheme(args.scheme, args.scheme_directory, args.scheme_version) - ## if in strict mode, validate the primer scheme - if args.strict: - checkScheme = "artic-tools validate_scheme %s" % (bed) - print(colored.green("Running: "), checkScheme, file=sys.stderr) - if (os.system(checkScheme) != 0): - print(colored.red("primer scheme failed strict checking"), file=sys.stderr) - raise SystemExit(1) + if args.bed and args.ref: + bed = args.bed + ref = args.ref + else: + bed, ref, _ = get_scheme( + scheme_name=args.scheme_name, + scheme_version=args.scheme_version, + scheme_directory=args.scheme_directory, + scheme_length=args.scheme_length, + ) + + if not os.path.exists(bed) or not os.path.exists(ref): + print( + colored.red( + "Failed to find primer scheme or reference sequence: {} {}".format( + bed, ref + ) + ), + file=sys.stderr, + ) + raise SystemExit(1) ## set up the read file if args.read_file: @@ -163,135 +82,197 @@ def run(parser, args): else: read_file = "%s.fasta" % (args.sample) if not os.path.exists(read_file): - print(colored.red("failed to find read-file: {}" .format(read_file)), file=sys.stderr) + print( + colored.red("failed to find read-file: {}".format(read_file)), + file=sys.stderr, + ) raise SystemExit(1) ## collect the primer pools - pools = set([row['PoolName'] for row in read_bed_file(bed)]) + pools = set([row["PoolName"] for row in read_bed_file(bed)]) ## create a holder to keep the pipeline commands in cmds = [] - # 2) if using nanopolish, set up the reference header and run the nanopolish indexing - nanopolish_header = get_nanopolish_header(ref) - if not args.medaka and not 
args.skip_nanopolish: - if not args.fast5_directory or not args.sequencing_summary: - print(colored.red('Must specify FAST5 directory and sequencing summary for nanopolish mode.')) - raise SystemExit(1) - cmds.append("nanopolish index -s %s -d %s %s" % (args.sequencing_summary, args.fast5_directory, args.read_file,)) - - # 3) index the ref & align with minimap or bwa - if not args.bwa: - cmds.append("minimap2 -a -x map-ont -t %s %s %s | samtools view -bS -F 4 - | samtools sort -o %s.sorted.bam -" % (args.threads, ref, read_file, args.sample)) - else: - cmds.append("bwa index %s" % (ref,)) - cmds.append("bwa mem -t %s -x ont2d %s %s | samtools view -bS -F 4 - | samtools sort -o %s.sorted.bam -" % (args.threads, ref, read_file, args.sample)) - cmds.append("samtools index %s.sorted.bam" % (args.sample,)) + # 2) check the reference fasta has an index and create one if not + if not os.path.exists("%s.fai" % (ref)) and args.clair3: + cmds.append("samtools faidx %s" % (ref)) + + # 3) index the ref & align with minimap + cmds.append( + f"minimap2 -a -x map-ont -t {args.threads} {ref} {read_file} | samtools view -bS -F 4 - | samtools sort -o {args.sample}.sorted.bam -" + ) + + cmds.append(f"samtools index {args.sample}.sorted.bam") # 4) trim the alignments to the primer start sites and normalise the coverage to save time if args.normalise: - normalise_string = '--normalise %d' % (args.normalise) + normalise_string = f"--normalise {args.normalise}" else: - normalise_string = '' - cmds.append("align_trim %s %s --start --remove-incorrect-pairs --report %s.alignreport.txt < %s.sorted.bam 2> %s.alignreport.er | samtools sort -T %s - -o %s.trimmed.rg.sorted.bam" % (normalise_string, bed, args.sample, args.sample, args.sample, args.sample, args.sample)) - cmds.append("align_trim %s %s --remove-incorrect-pairs --report %s.alignreport.txt < %s.sorted.bam 2> %s.alignreport.er | samtools sort -T %s - -o %s.primertrimmed.rg.sorted.bam" % (normalise_string, bed, args.sample, args.sample, args.sample, args.sample, args.sample)) - cmds.append("samtools index %s.trimmed.rg.sorted.bam" % (args.sample)) - cmds.append("samtools index %s.primertrimmed.rg.sorted.bam" % (args.sample)) - - # 6) do variant calling on each read group, either using the medaka or nanopolish workflow - if args.medaka: - for p in pools: - if os.path.exists("%s.%s.hdf" % (args.sample, p)): - os.remove("%s.%s.hdf" % (args.sample, p)) - cmds.append("medaka consensus --model %s --threads %s --chunk_len 800 --chunk_ovlp 400 --RG %s %s.trimmed.rg.sorted.bam %s.%s.hdf" % (args.medaka_model, args.threads, p, args.sample, args.sample, p)) - if args.no_indels: - cmds.append("medaka snp %s %s.%s.hdf %s.%s.vcf" % (ref, args.sample, p, args.sample, p)) + normalise_string = "" + + cmds.append( + f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --remove-incorrect-pairs --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam 2> {args.sample}.alignreport.er | samtools sort -T {args.sample} - -o {args.sample}.trimmed.rg.sorted.bam" + ) + + cmds.append( + f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} --remove-incorrect-pairs --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam 2> {args.sample}.alignreport.er | samtools sort -T {args.sample} - -o {args.sample}.primertrimmed.rg.sorted.bam" + ) + + cmds.append(f"samtools index 
{args.sample}.trimmed.rg.sorted.bam") + cmds.append(f"samtools index {args.sample}.primertrimmed.rg.sorted.bam") + + # 6) do variant calling on each read group + for p in pools: + if os.path.exists("%s.%s.hdf" % (args.sample, p)): + os.remove("%s.%s.hdf" % (args.sample, p)) + + if args.clair3: + # Use a specific model path if provided else use the default conda path + if args.model_path: + model_path = f"'{args.model_path}/{args.model}'" + else: - cmds.append("medaka variant %s %s.%s.hdf %s.%s.vcf" % (ref, args.sample, p, args.sample, p)) - - ## if not using longshot, annotate VCF with read depth info etc. so we can filter it - if args.no_longshot: - cmds.append("medaka tools annotate --pad 25 --RG %s %s.%s.vcf %s %s.trimmed.rg.sorted.bam tmp.medaka-annotate.vcf" % (p, args.sample, p, ref, args.sample)) - cmds.append("mv tmp.medaka-annotate.vcf %s.%s.vcf" % (args.sample, p)) + model_path = f"'{os.getenv('CONDA_PREFIX')}/bin/models/{args.model}'" - else: - if not args.skip_nanopolish: - indexed_nanopolish_file = read_file + # Split the BAM by read group + for p in pools: + cmds.append( + f"samtools view -b -r {p} {args.sample}.trimmed.rg.sorted.bam -o {args.sample}.{p}.trimmed.rg.sorted.bam" + ) + + cmds.append(f"samtools index {args.sample}.{p}.trimmed.rg.sorted.bam") + + cmds.append( + f"run_clair3.sh --chunk_size=10000 --no_phasing_for_fa --bam_fn='{args.sample}.{p}.trimmed.rg.sorted.bam' --ref_fn='{ref}' --output='{args.sample}_rg_{p}' --threads='{args.threads}' --platform='ont' --model_path={model_path} --include_all_ctgs" + ) + + cmds.append( + f"bgzip -dc {args.sample}_rg_{p}/merge_output.vcf.gz > {args.sample}.{p}.vcf" + ) + + else: + cmds.append( + f"medaka consensus --model {args.model} --threads {args.threads} --chunk_len 800 --chunk_ovlp 400 --RG {p} {args.sample}.trimmed.rg.sorted.bam {args.sample}.{p}.hdf" + ) if args.no_indels: - nanopolish_extra_args = " --snps" + cmds.append( + "medaka snp %s %s.%s.hdf %s.%s.vcf" + % (ref, args.sample, p, args.sample, p) + ) else: - nanopolish_extra_args = "" - for p in pools: - cmds.append("nanopolish variants --min-flanking-sequence 10 -x %s --progress -t %s --reads %s -o %s.%s.vcf -b %s.trimmed.rg.sorted.bam -g %s -w \"%s\" --ploidy 1 -m 0.15 --read-group %s %s" % (args.max_haplotypes, args.threads, indexed_nanopolish_file, args.sample, p, args.sample, ref, nanopolish_header, p, nanopolish_extra_args)) + cmds.append( + "medaka variant %s %s.%s.hdf %s.%s.vcf" + % (ref, args.sample, p, args.sample, p) + ) + + if args.no_longshot: + cmds.append( + f"medaka tools annotate --pad 25 --RG {p} {args.sample}.{p}.vcf {ref} {args.sample}.primertrimmed.rg.sorted.bam tmp.medaka-annotate.vcf" + ) + cmds.append(f"mv tmp.medaka-annotate.vcf {args.sample}.{p}.vcf") # 7) merge the called variants for each read group - merge_vcf_cmd = "artic_vcf_merge %s %s 2> %s.primersitereport.txt" % (args.sample, bed, args.sample) + merge_vcf_cmd = "artic_vcf_merge %s %s 2> %s.primersitereport.txt" % ( + args.sample, + bed, + args.sample, + ) for p in pools: merge_vcf_cmd += " %s:%s.%s.vcf" % (p, args.sample, p) + cmds.append(merge_vcf_cmd) # 8) check and filter the VCFs ## if using strict, run the vcf checker to remove vars present only once in overlap regions (this replaces the original merged vcf from the previous step) - if args.strict: - cmds.append("bgzip -f %s.merged.vcf" % (args.sample)) - cmds.append("tabix -p vcf %s.merged.vcf.gz" % (args.sample)) - cmds.append("artic-tools check_vcf --dropPrimerVars --dropOverlapFails --vcfOut %s.merged.filtered.vcf 
%s.merged.vcf.gz %s 2> %s.vcfreport.txt" % (args.sample, args.sample, bed, args.sample)) - cmds.append("mv %s.merged.filtered.vcf %s.merged.vcf" % (args.sample, args.sample)) - - ## if doing the medaka workflow and longshot required, do it on the merged VCF - if args.medaka and not args.no_longshot: - cmds.append("bgzip -f %s.merged.vcf" % (args.sample)) - cmds.append("tabix -f -p vcf %s.merged.vcf.gz" % (args.sample)) - cmds.append("longshot -P 0 -F -A --no_haps --bam %s.primertrimmed.rg.sorted.bam --ref %s --out %s.merged.vcf --potential_variants %s.merged.vcf.gz" % (args.sample, ref, args.sample, args.sample)) - - ## set up some name holder vars for ease - if args.medaka: - method = 'medaka' - else: - method = 'nanopolish' - vcf_file = "%s.pass.vcf" % (args.sample) + # if args.strict: + # cmds.append("bgzip -f %s.merged.vcf" % (args.sample)) + # cmds.append("tabix -p vcf %s.merged.vcf.gz" % (args.sample)) + # cmds.append( + # "artic-tools check_vcf --dropPrimerVars --dropOverlapFails --vcfOut %s.merged.filtered.vcf %s.merged.vcf.gz %s 2> %s.vcfreport.txt" + # % (args.sample, args.sample, bed, args.sample) + # ) + # cmds.append( + # "mv %s.merged.filtered.vcf %s.merged.vcf" % (args.sample, args.sample) + # ) + pre_filter_vcf = f"{args.sample}.merged.vcf" + cmds.append(f"bgzip -kf {pre_filter_vcf}") + cmds.append(f"tabix -f -p vcf {pre_filter_vcf}.gz") + + # if doing the medaka workflow and longshot required, do it on the merged VCF + if not args.no_longshot: + pre_filter_vcf = f"{args.sample}.longshot.vcf" + cmds.append( + f"longshot --min_mapq {args.min_mapq} -P 0 -F --max_cov 200000 --no_haps --bam {args.sample}.primertrimmed.rg.sorted.bam --ref {ref} --out {pre_filter_vcf} --potential_variants {args.sample}.merged.vcf.gz" + ) + cmds.append(f"bgzip -kf {pre_filter_vcf}") + cmds.append(f"tabix -f -p vcf {pre_filter_vcf}.gz") ## filter the variants to produce PASS and FAIL lists, then index them - if args.no_frameshifts and not args.no_indels: - cmds.append("artic_vcf_filter --%s --no-frameshifts %s.merged.vcf %s.pass.vcf %s.fail.vcf" % (method, args.sample, args.sample, args.sample)) - else: - cmds.append("artic_vcf_filter --%s %s.merged.vcf %s.pass.vcf %s.fail.vcf" % (method, args.sample, args.sample, args.sample)) - cmds.append("bgzip -f %s" % (vcf_file)) - cmds.append("tabix -p vcf %s.gz" % (vcf_file)) + fs_str = "--no-frameshifts" if args.no_frameshifts else "" + indel_str = "--no-indels" if args.no_indels else "" + cmds.append( + f"artic_vcf_filter {fs_str} {indel_str} {pre_filter_vcf}.gz {args.sample}.pass.vcf {args.sample}.fail.vcf" + ) # 9) get the depth of coverage for each readgroup, create a coverage mask and plots, and add failed variants to the coverage mask (artic_mask must be run before bcftools consensus) - cmds.append("artic_make_depth_mask --store-rg-depths %s %s.primertrimmed.rg.sorted.bam %s.coverage_mask.txt" % (ref, args.sample, args.sample)) - cmds.append("artic_mask %s %s.coverage_mask.txt %s.fail.vcf %s.preconsensus.fasta" % (ref, args.sample, args.sample, args.sample)) + cmds.append( + f"artic_make_depth_mask --store-rg-depths {ref} {args.sample}.primertrimmed.rg.sorted.bam {args.sample}.coverage_mask.txt" + ) + + cmds.append( + f"artic_mask {ref} {args.sample}.coverage_mask.txt {args.sample}.fail.vcf {args.sample}.preconsensus.fasta" + ) + + post_filter_vcf_file = f"{args.sample}.pass.vcf" # 10) generate the consensus sequence - cmds.append("bcftools consensus -f %s.preconsensus.fasta %s.gz -m %s.coverage_mask.txt -o %s.consensus.fasta" % (args.sample, 
vcf_file, args.sample, args.sample)) + cmds.append(f"bgzip -kf {post_filter_vcf_file}") + cmds.append(f"tabix -f -p vcf {post_filter_vcf_file}.gz") + + # Normalise variants in the pass/fail VCF files + post_normalisation_vcf_file = f"{args.sample}.normalised.vcf" + cmds.append( + f"bcftools norm --check-ref x --fasta-ref {args.sample}.preconsensus.fasta -O z -o {post_normalisation_vcf_file} {post_filter_vcf_file}.gz" + ) + cmds.append(f"bgzip -kf {post_normalisation_vcf_file}") + cmds.append(f"tabix -f -p vcf {post_normalisation_vcf_file}.gz") + cmds.append( + f"bcftools consensus -f {args.sample}.preconsensus.fasta {post_normalisation_vcf_file}.gz -m {args.sample}.coverage_mask.txt -o {args.sample}.consensus.fasta" + ) # 11) apply the header to the consensus sequence and run alignment against the reference sequence - fasta_header = "%s/ARTIC/%s" % (args.sample, method) - cmds.append("artic_fasta_header %s.consensus.fasta \"%s\"" % (args.sample, fasta_header)) - cmds.append("cat %s.consensus.fasta %s > %s.muscle.in.fasta" % (args.sample, ref, args.sample)) - cmds.append("muscle -in %s.muscle.in.fasta -out %s.muscle.out.fasta" % (args.sample, args.sample)) + fasta_header = "%s/ARTIC/medaka" % (args.sample) + cmds.append( + 'artic_fasta_header %s.consensus.fasta "%s"' % (args.sample, fasta_header) + ) - # 12) get some QC stats - if args.strict: - cmds.append("artic_get_stats --scheme {} --align-report {}.alignreport.txt --vcf-report {}.vcfreport.txt {}" .format(bed, args.sample, args.sample, args.sample)) + if args.use_muscle: + cmds.append( + "cat %s.consensus.fasta %s > %s.muscle.in.fasta" + % (args.sample, ref, args.sample) + ) + cmds.append( + "muscle -in %s.muscle.in.fasta -out %s.muscle.out.fasta" + % (args.sample, args.sample) + ) # 13) setup the log file and run the pipeline commands log = "%s.minion.log.txt" % (args.sample) - logfh = open(log, 'w') + logfh = open(log, "w") for cmd in cmds: print(colored.green("Running: ") + cmd, file=sys.stderr) if not args.dry_run: timerStart = time.perf_counter() retval = os.system(cmd) if retval != 0: - print(colored.red('Command failed:' ) + cmd, file=sys.stderr) + print(colored.red("Command failed:") + cmd, file=sys.stderr) raise SystemExit(20) timerStop = time.perf_counter() ## print the executed command and the runtime to the log file - print("{}\t{}" .format(cmd, timerStop-timerStart), file=logfh) - + print("{}\t{}".format(cmd, timerStop - timerStart), file=logfh) + ## if it's a dry run, print just the command else: print(cmd, file=logfh) diff --git a/artic/minion_validator.py b/artic/minion_validator.py deleted file mode 100644 index f43cd949..00000000 --- a/artic/minion_validator.py +++ /dev/null @@ -1,331 +0,0 @@ -"""minion_validator.py runs the functional tests of the minion pipeline, for both nanopolish and medaka workflows. - -For each test dataset, it will make the following checks for each workflow: - - * check for existence of the final consensus sequence - * check the final consensus matches the expected consensus - * check for existence of the final VCF file (generated by artic_vcf_filter) - * check all expected variants are reported - * check no unexpected variants are reported - * if there is an expected deletion, the consensus sequence will be checked for it's absence - -NOTE: - - * only a basic grep is used for checking the deletion in the consensus - test will fail if deletion sequence is present multiple times (I should improve this part of the test...) 
- -""" -from Bio import SeqIO -from tqdm import tqdm -import argparse -import errno -import glob -import os -import pathlib -import pytest -import requests -import sys -import tarfile -import vcf - - -from . import pipeline - -# help pytest resolve where test data is kept -dataDir = str(pathlib.Path(__file__).parent.parent) + "/test-data/" - -# testData is a lookup of sampleIDs to download urls -testData = { - "MT007544": "https://raw.githubusercontent.com/artic-network/fieldbioinformatics/master/test-data/MT007544/MT007544.fastq", - "CVR1": "https://artic.s3.climb.ac.uk/validation-sets/CVR1.tgz", - "NRW01": "http://artic.s3.climb.ac.uk/validation-teams/NRW01.tgz", - "SP1": "http://artic.s3.climb.ac.uk/validation-teams/SP1.tgz" -} - -# refConsensuses is a nested dict of sample IDs and their expected consensus sequences from the nanopolish workflow -refConsensuses = { - "CVR1": dataDir + "consensus-sequences/CVR1.consensus.fasta", - "NRW01": dataDir + "consensus-sequences/NRW01.consensus.fasta", - "SP1": dataDir + "consensus-sequences/SP1.consensus.fasta" -} - -# refMedakaConsensuses is a nested dict of sample IDs and their expected consensus sequences from the medaka workflow -refMedakaConsensuses = { - "MT007544": dataDir + "consensus-sequences/MT007544.consensus.medaka.fasta", - "CVR1": dataDir + "consensus-sequences/CVR1.consensus.medaka.fasta", - "NRW01": dataDir + "consensus-sequences/NRW01.consensus.medaka.fasta", - "SP1": dataDir + "consensus-sequences/SP1.consensus.medaka.fasta" -} - -# nanopolishTestVariants is a nested dict of sample IDs and their expected variants when using the nanopolish workflow -nanopolishTestVariants = { - "CVR1": - { - # pos: (ref, alt, type, count) - 241: ['C', 'T', "snp", 1], - 3037: ['C', 'T', "snp", 1], - 12733: ['C', 'T', "snp", 2], # count 2 indicates this is reported twice in the VCF (due to pool groups) - 14408: ['C', 'T', "snp", 1], - 23403: ['A', 'G', "snp", 1], - 27752: ['C', 'T', "snp", 1], - 28881: ['G', 'A', "snp", 1], - 28882: ['G', 'A', "snp", 1], - 28883: ['G', 'C', "snp", 1] - }, - "NRW01": - { - 1440: ['G', 'A', "snp", 1], - 2891: ['G', 'A', "snp", 1], - 4655: ['C', 'T', "snp", 1], - 8422: ['G', 'A', "snp", 1], - 22323: ['C', 'T', "snp", 2], - 29546: ['C', 'A', "snp", 2] - }, - "SP1": - { - 241: ['C', 'T', "snp", 1], - 3037: ['C', 'T', "snp", 1], - 14408: ['C', 'T', "snp", 1], - 23403: ['A', 'G', "snp", 1] - }, -} - -# medakaTestVariants is a nested dict of sample IDs and their expected variants when using the medaka workflow -medakaTestVariants = { - "MT007544": - { - # pos: (ref, alt, type, count) - 29749: ["ACGATCGAGTG", 'A', "del", 1] - }, - "CVR1": - { - 241: ['C', 'T', "snp", 1], - 3037: ['C', 'T', "snp", 1], - 12733: ['C', 'T', "snp", 2], - 14408: ['C', 'T', "snp", 1], - 23403: ['A', 'G', "snp", 1], - 27752: ['C', 'T', "snp", 1], - 28881: ['GGG', 'AAC', "indel", 1], - }, - "NRW01": - { - 1440: ['G', 'A', "snp", 1], - 2891: ['G', 'A', "snp", 1], - 4655: ['C', 'T', "snp", 1], - 8422: ['G', 'A', "snp", 1], - 22323: ['C', 'T', "snp", 2], - 29546: ['C', 'A', "snp", 2] - }, - "SP1": - { - 241: ['C', 'T', "snp", 1], - 3037: ['C', 'T', "snp", 1], - 14408: ['C', 'T', "snp", 1], - 23403: ['A', 'G', "snp", 1] - }, -} - -# extraFlags is a way to add extra sample-specific commands to the validation test cmd -extraFlags = { - "medaka": - { - "SP1": ["--no-frameshifts"], - }, - "nanopolish": - { - }, -} - -# dataChecker will run before the tests and download all the test datasets if not present -@pytest.fixture(scope="session", autouse=True) 
-def dataChecker(numValidations): - print("checking for validation datasets...") - for sampleID, url in testData.items(): - targetPath = dataDir + sampleID - if os.path.exists(targetPath) == False: - print("\tno data for {}" .format(sampleID)) - print("\tmaking dir at {}" .format(targetPath)) - os.mkdir(targetPath) - print("\tdownloading from {}" .format(url)) - try: - download(url, dataDir, sampleID) - except Exception as e: - print("download failed: ", e) - sys.exit(1) - else: - print("\tfound data dir for {}" .format(sampleID)) - print("validation datasets ready\n") - -# download will download and untar a test dataset -def download(url, dataDir, sampleID): - filename = url.rsplit('/', 1)[1] - with open(f'{dataDir}/{filename}', 'wb+') as f: - response = requests.get(url, stream=True) - total = int(response.headers.get('content-length')) - if total is None: - f.write(response.content) - else: - with tqdm(total=total, unit='B', unit_scale=True, desc=filename) as pbar: - for data in tqdm(response.iter_content(chunk_size=1024)): - f.write(data) - pbar.update(1024) - - tar = tarfile.open(dataDir + "/" + filename, "r:gz") - tar.extractall(dataDir) - tar.close() - os.remove(dataDir + "/" + filename) - -# genCommand will create the minion command for the requested workflow (nanopolish or medaka) -def genCommand(sampleID, workflow): - cmd = [ - "minion", - "--threads", - "2", - "--read-file", - dataDir + sampleID + "/" + sampleID + ".fastq", - "--scheme-directory", - dataDir + "primer-schemes", - "--fast5-directory", - dataDir + sampleID + "/fast5", - "--sequencing-summary", - dataDir + sampleID + "/" + sampleID + "_sequencing_summary.txt", - ] - if workflow=="medaka": - cmd.append("--medaka") - cmd.append("--medaka-model") - cmd.append("r941_min_high_g351") - if sampleID in extraFlags[workflow]: - for flag in extraFlags[workflow][sampleID]: - cmd.append(flag) - cmd.append("nCoV-2019/V1") - cmd.append(sampleID) - return cmd - -# cleanUp will remove all files generated by the pipeline for a given sampleID -def cleanUp(sampleID): - fileList = glob.glob("{}*" .format(sampleID)) - for filePath in fileList: - try: - os.remove(filePath) - except: - print("Error while deleting file : ", filePath) - -# checkConsensus will return 1 if a subsequence is present in a consensus file, or 0 if absent -def checkConsensus(consensusFile, subSeq): - for record in SeqIO.parse(open(consensusFile, 'r'), 'fasta'): - if subSeq in record.seq: - return 1 - return 0 - -# runner is the test runner -def runner(workflow, numValidations): - - # get the workflow data - if workflow == "nanopolish": - data = nanopolishTestVariants - elif workflow == "medaka": - data = medakaTestVariants - else: - print("error setting up test runner") - assert False - - # check the number of validation datasets requested - counter = numValidations - if (numValidations < 0) or (numValidations > len(testData)): - counter = len(testData) - - # run the data through the requested workflow, then validate the output - print("running the {} workflow with {} of the validation datasets...\n" .format(workflow, counter)) - for sampleID, expVariants in data.items(): - - # break when the requested number of datasets have been run - if counter == 0: - break - - # generate the command - cmd = genCommand(sampleID, workflow) - - # setup and run the minion parser - parser = pipeline.init_pipeline_parser() - - # parse the arguments - try: - args = parser.parse_args(cmd) - except SystemExit: - print("failed to parse valid command for `artic minion`") - assert False - 
- # run the minion pipeline - try: - args.func(parser, args) - except SystemExit: - print("artic minion exited early with an error") - assert False - - # check the ARTIC consensus was created - consensusFile = "%s.consensus.fasta" % sampleID - assert os.path.exists(consensusFile) == True, "no consensus produced for {}" .format(sampleID) - testSeqs = SeqIO.parse(open(consensusFile, 'r'), 'fasta') - testConsensus = next(testSeqs) - - # check the ARTIC consensus sequence matches the one on record - if workflow == "medaka": - refSeqs = SeqIO.parse(open(refMedakaConsensuses[sampleID], 'r'), 'fasta') - else: - refSeqs = SeqIO.parse(open(refConsensuses[sampleID], 'r'), 'fasta') - refConsensus = next(refSeqs) - assert testConsensus.seq == refConsensus.seq, "produced ARTIC consensus does not match expected consensus for {}" .format(sampleID) - - # check the ARTIC VCF was created - vcfFile = "%s.pass.vcf.gz" % sampleID - assert os.path.exists( - vcfFile) == True, "no VCF produced for {}" .format(sampleID) - - # open the VCF and check the reported variants match the expected - for record in vcf.Reader(filename=vcfFile): - if record.POS in expVariants: - assert record.REF == expVariants[record.POS][0], "incorrect REF reported in VCF for {} at position {}" .format(sampleID, record.POS) - assert str(record.ALT[0]) == expVariants[record.POS][1], "incorrect ALT reported in VCF for {} at position {}" .format(sampleID, record.POS) - - # if this is an expected deletion, check the consensus sequence for it's absence - if expVariants[record.POS][2] == "del": - assert checkConsensus(consensusFile, record.REF) == 0, "expected deletion for {} was reported but was left in consensus" .format(sampleID) - - # also check that the VCF record is correctly labelled as DEL - assert record.is_deletion, "deletion for {} not formatted correctly in VCF" .format(sampleID) - - # if this is an expected indel, check that the VCF record is correctly labelled as INDEL - if expVariants[record.POS][2] == "indel": - assert record.is_indel, "indel for {} not formatted correctly in VCF" .format(sampleID) - - # else, check that the VCF record is correctly labelled as SNP - if expVariants[record.POS][2] == "snp": - assert record.is_snp, "snp for {} not formatted correctly in VCF" .format(sampleID) - - # decrement/remove the variant from the expected list, so we can keep track of checked variants - expVariants[record.POS][3] -= 1 - if (expVariants[record.POS][3] == 0): - del expVariants[record.POS] - else: - print("unexpected variant found for {}: {} at {}" .format(sampleID, str(record.ALT[0]), record.POS)) - assert False - - # check we've confirmed all the expected variants - if len(expVariants) != 0: - print("variants missed during test for {}" .format(sampleID)) - for key, val in expVariants.items(): - print("\t{}: {}" .format(key, val)) - assert False - - # clean up pipeline files - cleanUp(sampleID) - counter -= 1 - -# test_NanopolishMinion is the unit test runner to test the minion pipeline with the nanopolish workflow -@pytest.mark.env("nanopolish") -def test_NanopolishMinion(numValidations): - runner("nanopolish", numValidations) - -# test_MedakaMinion is the unit test runner to test the minion pipeline with the medaka workflow -@pytest.mark.env("medaka") -def test_MedakaMinion(numValidations): - runner("medaka", numValidations) diff --git a/artic/pipeline.py b/artic/pipeline.py index f8a82506..76f56656 100755 --- a/artic/pipeline.py +++ b/artic/pipeline.py @@ -5,30 +5,23 @@ import argparse import sys +import os from . 
import version def run_subtool(parser, args): - if args.command == 'extract': - from . import extract as submodule - if args.command == 'basecaller': - from . import basecaller as submodule - if args.command == 'demultiplex': - from . import demultiplex as submodule - if args.command == 'minion': + if args.command == "minion": from . import minion as submodule - if args.command == 'gather': - from . import gather as submodule - if args.command == 'guppyplex': + if args.command == "guppyplex": from . import guppyplex as submodule - if args.command == 'rampart': + if args.command == "rampart": from . import rampart as submodule - if args.command == 'filter': + if args.command == "filter": from . import filter_reads as submodule - if args.command == 'run': + if args.command == "run": from . import run as submodule - if args.command == 'export': + if args.command == "export": from . import export as submodule # run the chosen submodule. @@ -38,9 +31,13 @@ def run_subtool(parser, args): class ArgumentParserWithDefaults(argparse.ArgumentParser): def __init__(self, *args, **kwargs): super(ArgumentParserWithDefaults, self).__init__(*args, **kwargs) - self.add_argument("-q", "--quiet", help="Do not output warnings to stderr", - action="store_true", - dest="quiet") + self.add_argument( + "-q", + "--quiet", + help="Do not output warnings to stderr", + action="store_true", + dest="quiet", + ) def init_pipeline_parser(): @@ -52,154 +49,244 @@ def init_pipeline_parser(): The initialised argparse Argument Parser for the pipeline """ parser = argparse.ArgumentParser( - prog='artic', formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument("-v", "--version", help="Installed Artic version", - action="version", - version="%(prog)s " + str(version.__version__)) + prog="artic", formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "-v", + "--version", + help="Installed Artic version", + action="version", + version="%(prog)s " + str(version.__version__), + ) subparsers = parser.add_subparsers( - title='[sub-commands]', dest='command', parser_class=ArgumentParserWithDefaults) - - # extract - parser_extract = subparsers.add_parser('extract', - help='Create an empty poredb database') - parser_extract.add_argument('directory', metavar='directory', - help='The name of the database.') - parser_extract.add_argument('--basecaller', metavar='basecaller', - default='ONT Albacore Sequencing Software', - help='The name of the basecaller') - parser_extract.set_defaults(func=run_subtool) - - # callers - parser_extract = subparsers.add_parser( - 'basecaller', help='Display basecallers in files') - parser_extract.add_argument( - 'directory', metavar='directory', help='Directory of FAST5 files.') - parser_extract.set_defaults(func=run_subtool) - - # demultiplex - parser_demultiplex = subparsers.add_parser( - 'demultiplex', help='Run demultiplex') - parser_demultiplex.add_argument( - 'fasta', metavar='fasta', help='Undemultiplexed FASTA file.') - parser_demultiplex.add_argument( - '--threads', type=int, default=8, help='Number of threads') - parser_demultiplex.add_argument( - '--prefix', help='Prefix for demultiplexed files') - parser_demultiplex.add_argument( - '--no-remove-directory', dest='no_remove_directory', action='store_true') - parser_demultiplex.set_defaults(func=run_subtool) + title="[sub-commands]", dest="command", parser_class=ArgumentParserWithDefaults + ) # minion - parser_minion = subparsers.add_parser('minion', help='Run the alignment/variant-call/consensus 
pipeline') + parser_minion = subparsers.add_parser( + "minion", help="Run the alignment/variant-call/consensus pipeline" + ) + # parser_minion.add_argument( + # "scheme", metavar="scheme", help="The name of the scheme" + # ) parser_minion.add_argument( - 'scheme', metavar='scheme', help='The name of the scheme') + "sample", metavar="sample", help="The name of the sample" + ) parser_minion.add_argument( - 'sample', metavar='sample', help='The name of the sample') - parser_minion.add_argument('--medaka', dest='medaka', action='store_true', - help='Use medaka instead of nanopolish for variants') - parser_minion.add_argument('--medaka-model', metavar='medaka_model', help='The model to use for medaka (required if using --medaka)') - parser_minion.add_argument('--no-longshot', dest='no_longshot', action='store_true', help='Do not use Longshot for variant filtering after medaka') - parser_minion.add_argument('--minimap2', dest='minimap2', default=True, - action='store_true', help='Use minimap2 (default)') + "--model", + help="The model to use for medaka or clair3 (must be the specific model for the variant caller in question)", + required=True, + ) parser_minion.add_argument( - '--bwa', dest='bwa', action='store_true', help='Use bwa instead of minimap2') - parser_minion.add_argument('--normalise', dest='normalise', type=int, - default=100, help='Normalise down to moderate coverage to save runtime (default: %(default)d, deactivate with `--normalise 0`)') + "--clair3", + action="store_true", + help="Use Clair3 for variant calling, currently experimental (default: medaka)", + ) parser_minion.add_argument( - '--threads', type=int, default=8, help='Number of threads (default: %(default)d)') - parser_minion.add_argument('--scheme-directory', metavar='scheme_directory', - default='./primer-schemes', help='Default scheme directory') - parser_minion.add_argument('--scheme-version', metavar='scheme_version', - default=1, help='Primer scheme version (default: %(default)d)') - parser_minion.add_argument('--max-haplotypes', type=int, default=1000000, - metavar='max_haplotypes', help='max-haplotypes value for nanopolish') - parser_minion.add_argument('--read-file', metavar='read_file', - help='Use alternative FASTA/FASTQ file to .fasta') - parser_minion.add_argument('--fast5-directory', help='FAST5 Directory') + "--model-path", + metavar="model_path", + help="Path containing clair3 models, defaults to models packaged with conda installation by default", + type=str, + ) parser_minion.add_argument( - '--sequencing-summary', help='Path to Guppy sequencing summary') - parser_minion.add_argument('--skip-nanopolish', action='store_true') - parser_minion.add_argument('--no-indels', action='store_true', help='Do not report InDels (uses SNP-only mode of nanopolish/medaka)') - parser_minion.add_argument('--no-frameshifts', action='store_true', help='Remove variants which induce frameshifts (ignored when --no-indels set)') - parser_minion.add_argument('--dry-run', action='store_true') - parser_minion.add_argument('--strict', action='store_true', help='Run with strict filtering of variants against primer scheme') - parser_minion.set_defaults(func=run_subtool) + "--no-longshot", + dest="no_longshot", + action="store_true", + help="Do not use Longshot for variant filtering after medaka", + ) + parser_minion.add_argument( + "--min-mapq", + type=int, + default=20, + help="Minimum mapping quality to consider (default: %(default)d)", + ) + parser_minion.add_argument( + "--normalise", + dest="normalise", + type=int, + 
default=100, + help="Normalise down to moderate coverage to save runtime (default: %(default)d, deactivate with `--normalise 0`)", + ) + parser_minion.add_argument( + "--primer-match-threshold", + type=int, + default=35, + help="Allow fuzzy primer matching within this threshold (default: %(default)d)", + ) + parser_minion.add_argument( + "--threads", + type=int, + default=8, + help="Number of threads (default: %(default)d)", + ) + + remote_scheme_options = parser_minion.add_argument_group( + "Remote Scheme Options", + "Options for automagically fetching primer schemes from the scheme repository (https://github.com/quick-lab/primerschemes)", + ) + remote_scheme_options.add_argument( + "--scheme-directory", + metavar="scheme_directory", + default=f"{os.getcwd()}/primer-schemes", + help="Default scheme directory (default: %(default)s)", + ) + remote_scheme_options.add_argument( + "--scheme-name", + metavar="scheme_name", + type=str, + help="Name of the scheme, e.g. sars-cov-2, mpxv to fetch from the scheme repository (https://github.com/quick-lab/primerschemes)", + ) + remote_scheme_options.add_argument( + "--scheme-length", + type=int, + metavar="scheme_length", + default=False, + help="Length of the scheme to fetch from the scheme repository (https://github.com/quick-lab/primerschemes). This is only required if the --scheme-name has multiple possible lengths.", + ) + remote_scheme_options.add_argument( + "--scheme-version", + metavar="vX.X.X", + type=str, + help="Primer scheme version", + ) - # gather - parser_gather = subparsers.add_parser( - 'gather', help='Gather up demultiplexed files') - parser_gather.add_argument('--directory', nargs='+', metavar='directory', - help='Basecalled (guppy) results directory or directories.') - parser_gather.add_argument( - '--max-length', type=int, metavar='max_length', help='remove reads greater than read length') - parser_gather.add_argument( - '--min-length', type=int, metavar='min_length', help='remove reads less than read length') - parser_gather.add_argument('--prefix', help='Prefix for gathered files') - parser_gather.add_argument('--prompt-directory', metavar='run_directory', - help='The run directory for interactive prompts', default='/var/lib/minknown/data') - parser_gather.add_argument( - '--fast5-directory', metavar='fast5_directory', help='The directory with fast5 files') - parser_gather.add_argument('--no-fast5s', action='store_true', - help='Do not use fast5s and nanopolish', default=0) - parser_gather.add_argument('--limit', type=int, help='Only gather n reads') - parser_gather.set_defaults(func=run_subtool) + local_scheme_options = parser_minion.add_argument_group( + "Local Scheme Options", + "Options for providing a local primer scheme and reference sequence (BED file and FASTA file) directly to the pipeline", + ) + local_scheme_options.add_argument( + "--bed", + metavar="bed", + help="BED file containing primer scheme", + ) + local_scheme_options.add_argument( + "--ref", + help="Reference sequence for the scheme", + ) + + parser_minion.add_argument( + "--read-file", + metavar="read_file", + help="FASTQ file containing reads", + required=True, + ) + parser_minion.add_argument( + "--no-indels", + action="store_true", + help="Do not report InDels (uses SNP-only mode of nanopolish/medaka)", + ) + parser_minion.add_argument( + "--no-frameshifts", + action="store_true", + help="Remove variants which induce frameshifts (ignored when --no-indels set)", + ) + parser_minion.add_argument( + "--use-muscle", + action="store_true", + help="Run muscle 
alignment of consensus to reference", + ) + parser_minion.add_argument("--dry-run", action="store_true") + parser_minion.set_defaults(func=run_subtool) # guppyplex # This is a workflow that aggregates the previous gather and demultiplex steps into a single task. # This is making an assumption that the results from MinKnow demultiplex are good-enough. parser_guppyplex = subparsers.add_parser( - 'guppyplex', help='Aggregate pre-demultiplexed reads from MinKNOW/Guppy') - parser_guppyplex.add_argument('--directory', metavar='directory', - help='Basecalled and demultiplexed (guppy) results directory', required=True) + "guppyplex", help="Aggregate pre-demultiplexed reads from MinKNOW/Guppy" + ) parser_guppyplex.add_argument( - '--max-length', type=int, metavar='max_length', help='remove reads greater than read length') + "--directory", + metavar="directory", + help="Basecalled and demultiplexed (guppy) results directory", + required=True, + ) parser_guppyplex.add_argument( - '--min-length', type=int, metavar='min_length', help='remove reads less than read length') - parser_guppyplex.add_argument('--quality', type=float, metavar='quality', - default=7, help='remove reads against this quality filter') - parser_guppyplex.add_argument('--sample', type=float, metavar='sample', default=1, - help='sampling frequency for random sample of sequence to reduce excess') + "--max-length", + type=int, + metavar="max_length", + help="remove reads greater than read length", + ) parser_guppyplex.add_argument( - '--skip-quality-check', action='store_true', help='Do not filter on quality score (speeds up)') + "--min-length", + type=int, + metavar="min_length", + help="remove reads less than read length", + ) parser_guppyplex.add_argument( - '--prefix', help='Prefix for guppyplex files') - parser_guppyplex.add_argument('--output', metavar='output', - help='FASTQ file to write') + "--quality", + type=float, + metavar="quality", + default=7, + help="remove reads against this quality filter", + ) + parser_guppyplex.add_argument( + "--sample", + type=float, + metavar="sample", + default=1, + help="sampling frequency for random sample of sequence to reduce excess", + ) + parser_guppyplex.add_argument( + "--skip-quality-check", + action="store_true", + help="Do not filter on quality score (speeds up)", + ) + parser_guppyplex.add_argument("--prefix", help="Prefix for guppyplex files") + parser_guppyplex.add_argument( + "--output", metavar="output", help="FASTQ file to write" + ) parser_guppyplex.set_defaults(func=run_subtool) # filter - parser_filter = subparsers.add_parser( - 'filter', help='Filter FASTQ files by length') - parser_filter.add_argument( - 'filename', metavar='filename', help='FASTQ file.') + parser_filter = subparsers.add_parser("filter", help="Filter FASTQ files by length") + parser_filter.add_argument("filename", metavar="filename", help="FASTQ file.") parser_filter.add_argument( - '--max-length', type=int, metavar='max_length', help='remove reads greater than read length') + "--max-length", + type=int, + metavar="max_length", + help="remove reads greater than read length", + ) parser_filter.add_argument( - '--min-length', type=int, metavar='min_length', help='remove reads less than read length') + "--min-length", + type=int, + metavar="min_length", + help="remove reads less than read length", + ) parser_filter.set_defaults(func=run_subtool) # rampart parser_rampart = subparsers.add_parser( - 'rampart', help='Interactive prompts to start RAMPART') - parser_rampart.add_argument('--protocol-directory', 
metavar='protocol_directory', - help='The RAMPART protocols directory.', default='/home/artic/artic/artic-ebov/rampart') - parser_rampart.add_argument('--run-directory', metavar='run_directory', - help='The run directory', default='/var/lib/MinKNOW/data') + "rampart", help="Interactive prompts to start RAMPART" + ) + parser_rampart.add_argument( + "--protocol-directory", + metavar="protocol_directory", + help="The RAMPART protocols directory.", + default="/home/artic/artic/artic-ebov/rampart", + ) + parser_rampart.add_argument( + "--run-directory", + metavar="run_directory", + help="The run directory", + default="/var/lib/MinKNOW/data", + ) parser_rampart.set_defaults(func=run_subtool) # export parser_export = subparsers.add_parser( - 'export', help='Export reads and fAST5 into a neat archive') - parser_export.add_argument('prefix') - parser_export.add_argument('bamfile') - parser_export.add_argument('sequencing_summary') - parser_export.add_argument('fast5_directory') - parser_export.add_argument('output_directory') + "export", help="Export reads and fAST5 into a neat archive" + ) + parser_export.add_argument("prefix") + parser_export.add_argument("bamfile") + parser_export.add_argument("sequencing_summary") + parser_export.add_argument("fast5_directory") + parser_export.add_argument("output_directory") parser_export.set_defaults(func=run_subtool) # run parser_run = subparsers.add_parser( - 'run', help='Process an entire run folder interactively') + "run", help="Process an entire run folder interactively" + ) parser_run.set_defaults(func=run_subtool) # return the parser @@ -212,7 +299,7 @@ def main(): parser = init_pipeline_parser() # collect the args - args = parser.parse_args(sys.argv[1:]) + args = parser.parse_args() # if args.quiet: # logger.setLevel(logging.ERROR) diff --git a/artic/rampart.py b/artic/rampart.py index b851db22..754bb40d 100755 --- a/artic/rampart.py +++ b/artic/rampart.py @@ -1,79 +1,91 @@ -#Written by Nick Loman (@pathogenomenick) +# Written by Nick Loman (@pathogenomenick) import os import sys from Bio import SeqIO from clint.textui import colored, puts, indent + def chooser(directories): - print ("Found the following directories:") - for n, d in enumerate(directories): - print (" [%d]: %s" % (n+1, d)) - print ("Choose 0 to quit") - - while True: - choice = input("=> ") - try: - choice_number = int(choice) - if choice_number == 0: - raise SystemExit(1) - path = directories[choice_number-1] - break - except Exception: - print ("Invalid choice, please select from the list above.") - - return (path) + print("Found the following directories:") + for n, d in enumerate(directories): + print(" [%d]: %s" % (n + 1, d)) + print("Choose 0 to quit") + + while True: + choice = input("=> ") + try: + choice_number = int(choice) + if choice_number == 0: + raise SystemExit(1) + path = directories[choice_number - 1] + break + except Exception: + print("Invalid choice, please select from the list above.") + + return path + def run(parser, args): - directories = [] + directories = [] - for root, dirs, files in os.walk(args.run_directory, topdown=False): - for d in dirs: - if d == 'fastq_pass': - directories.append(root+'/'+d) + for root, dirs, files in os.walk(args.run_directory, topdown=False): + for d in dirs: + if d == "fastq_pass": + directories.append(root + "/" + d) - basecalledPath = chooser(directories) + basecalledPath = chooser(directories) - directories = os.listdir(args.protocol_directory) - directories = [d for d in directories if 
os.path.isdir(args.protocol_directory+'/'+d)] + directories = os.listdir(args.protocol_directory) + directories = [ + d for d in directories if os.path.isdir(args.protocol_directory + "/" + d) + ] - protocolPath = chooser(directories) + protocolPath = chooser(directories) - print (basecalledPath, protocolPath) + print(basecalledPath, protocolPath) - skip_barcoding = False - if os.path.exists('run_configuration.json'): - reenter = input ("Do you want to enter sample names again? (Y/N): ") - if reenter.lower().startswith('n'): - skip_barcoding = True + skip_barcoding = False + if os.path.exists("run_configuration.json"): + reenter = input("Do you want to enter sample names again? (Y/N): ") + if reenter.lower().startswith("n"): + skip_barcoding = True - if not skip_barcoding: - barcodes = [] - print ("Enter sample names:") - for barcode in range(1,25): - print (" NB%02d: " % (barcode,), end = "") - barcodes.append(input("> ")) + if not skip_barcoding: + barcodes = [] + print("Enter sample names:") + for barcode in range(1, 25): + print(" NB%02d: " % (barcode,), end="") + barcodes.append(input("> ")) - fh = open("run_configuration.json", "w") - fh.write("""{ + fh = open("run_configuration.json", "w") + fh.write( + """{ \"samples\": [ - """) - first = True - for n, b in enumerate(barcodes): - if b: - if not first: - fh.write(",\n") - first = False - fh.write("""{ + """ + ) + first = True + for n, b in enumerate(barcodes): + if b: + if not first: + fh.write(",\n") + first = False + fh.write( + """{ "name": "%s", "description": "", "barcodes": [ "NB%02d" ] - }""" % (barcodes[n].replace("\"", "\\\""), n+1)) - fh.write("] }") - fh.close() - - cmd = "rampart --basecalledPath %s --protocol %s/%s --clearAnnotated" % (basecalledPath, args.protocol_directory, protocolPath) - print (cmd) + }""" + % (barcodes[n].replace('"', '\\"'), n + 1) + ) + fh.write("] }") + fh.close() - os.system(cmd) + cmd = "rampart --basecalledPath %s --protocol %s/%s --clearAnnotated" % ( + basecalledPath, + args.protocol_directory, + protocolPath, + ) + print(cmd) + os.system(cmd) diff --git a/artic/runs.py b/artic/runs.py index cab54e44..5e733570 100644 --- a/artic/runs.py +++ b/artic/runs.py @@ -1,13 +1,13 @@ -import sys import csv + def load_runs(fn): - rows = [] - skipcomments = (row for row in open(fn) if not row.startswith('#')) - for row in csv.DictReader(skipcomments, dialect='excel-tab'): - if 'Included' not in row: - rows.append(row) - elif int(row['Included']): - rows.append(row) - - return rows + rows = [] + skipcomments = (row for row in open(fn) if not row.startswith("#")) + for row in csv.DictReader(skipcomments, dialect="excel-tab"): + if "Included" not in row: + rows.append(row) + elif int(row["Included"]): + rows.append(row) + + return rows diff --git a/artic/utils.py b/artic/utils.py new file mode 100644 index 00000000..86dc5ffe --- /dev/null +++ b/artic/utils.py @@ -0,0 +1,625 @@ +#!/usr/bin/env python + +import sys +import os +import requests +import hashlib +import re +import pandas as pd +from clint.textui import colored + + +def getPrimerDirection(primerID): + """Infer the primer direction based on it's ID containing LEFT/RIGHT + + Parameters + ---------- + primerID : string + The primer ID from the 4th field of the primer scheme + """ + if "LEFT" in primerID: + return "+" + elif "RIGHT" in primerID: + return "-" + else: + print("LEFT/RIGHT must be specified in Primer ID", file=sys.stderr) + raise SystemExit(1) + + +def merge_sites(canonical, alt): + """Merges a canonical primer site with an alt site, 
producing an interval that encompasses both + + Parameters + ---------- + canonical : dict + The canonical primer site, provided as a dictionary of the bed file row + alt : dict + The alt primer site, provided as a dictionary of the bed file row + + Returns + ------- + dict + A dictionary of the merged site, where the dict represents a bed file row + """ + # base the merged site on the canonical + mergedSite = canonical + + # check the both the canonical and alt are the same direction + if canonical["direction"] != alt["direction"]: + print( + "could not merge alt with different orientation to canonical", + file=sys.stderr, + ) + raise SystemExit(1) + + # merge the start/ends of the alt with the canonical to get the largest window possible + if alt["start"] < canonical["start"]: + mergedSite["start"] = alt["start"] + if alt["end"] > canonical["end"]: + mergedSite["end"] = alt["end"] + return mergedSite + + +def identify_bed_file(bed_file): + """Identifies the version of the primer ID format used in the bed file + + Parameters + ---------- + bed_file : str + The bed file to identify the primer ID format + + Returns + ------- + int + The version of the primer ID format used in the bed file + """ + + V1_pattern = re.compile( + r"[a-zA-Z0-9\-]+_[0-9]+_(LEFT|RIGHT)(_ALT[0-9]*|_alt[0-9]*)*" + ) + + V2_pattern = re.compile(r"[a-zA-Z0-9\-]+_[0-9]+_(LEFT|RIGHT)_[0-9]+") + + version = False + + with open(bed_file, "r") as bed_fh: + bed = bed_fh.readlines() + + for line in bed: + if line.startswith("#"): + continue + + splits = line.strip().split("\t") + + if len(splits) < 6: + print( + colored.red( + f"Invalid bed file format, only found {len(splits)} columns. For valid formats please see https://github.com/quick-lab/primerschemes" + ) + ) + raise SystemExit(1) + + if not V1_pattern.match(splits[3]) and not V2_pattern.match(splits[3]): + print( + colored.red( + f"Invalid primer ID format, {splits[3]} does not match the expected format" + ) + ) + raise SystemExit(1) + + if V2_pattern.match(splits[3]): + if len(splits) < 7: + print( + colored.red( + f"Invalid bed file format, only found {len(splits)} columns. For valid formats please see https://github.com/ChrisgKent/primal-page" + ) + ) + + if not version: + version = 3 + + if version != 3: + print( + colored.red( + f"Scheme BED does not appear to be a consistent scheme version, for primer scheme formats please see https://github.com/ChrisgKent/primal-page" + ) + ) + raise SystemExit(1) + + elif V1_pattern.match(splits[3]): + + if len(splits) == 7: + if not version: + version = 2 + elif len(splits) == 6: + if not version: + version = 1 + else: + print( + colored.red( + f"Invalid bed file format, found {len(splits)} columns with V1 primer names. For valid formats please see https://github.com/ChrisgKent/primal-page" + ) + ) + + if version == 3: + print( + colored.red( + f"Scheme BED mixed primer ID formats, please ensure your BED file is consistent" + ) + ) + raise SystemExit(1) + + return version + + +def read_bed_file(fn): + """Parses a V2/V3 bed file and collapses primers into canonical primer sites + + Parameters + ---------- + fn : str + The bedfile to parse + + Returns + ------- + list + A list of dictionaries, where each dictionary contains a row of the parsed bedfile. 
+ The available dictionary keys are - Primer_ID, direction, start, end + """ + + # read the primer scheme into a pandas dataframe and run type, length and null checks + version = identify_bed_file(fn) + + if version in (1, 2): + return read_bed_file_legacy(fn) + + primers = pd.read_csv( + fn, + sep="\t", + comment="#", + header=None, + names=["chrom", "start", "end", "Primer_ID", "PoolName", "direction"], + dtype={ + "chrom": str, + "start": int, + "end": int, + "Primer_ID": str, + "PoolName": str, + }, + usecols=(0, 1, 2, 3, 4, 5), + skiprows=0, + ) + if len(primers.index) < 1: + print("primer scheme file is empty", file=sys.stderr) + raise SystemExit(1) + if primers.isnull().sum().sum(): + print("malformed primer scheme file", file=sys.stderr) + raise SystemExit(1) + + canonical_primers = {} + for _, row in primers.iterrows(): + scheme_name, primer_id, direction, primer_n = row["Primer_ID"].split("_") + + if (primer_id, direction) not in canonical_primers: + canonical_primers[(primer_id, direction)] = row.to_dict() + continue + + canonical_primers[(primer_id, direction)] = merge_sites( + canonical_primers[(primer_id, direction)], row + ) + + # return the bedFile as a list + return [value for value in canonical_primers.values()] + + +def read_bed_file_legacy(fn): + """Parses a bed file and collapses alts into canonical primer sites + + Parameters + ---------- + fn : str + The bedfile to parse + + Returns + ------- + list + A list of dictionaries, where each dictionary contains a row of the parsed bedfile. + The available dictionary keys are - Primer_ID, direction, start, end + """ + + # read the primer scheme into a pandas dataframe and run type, length and null checks + primers = pd.read_csv( + fn, + sep="\t", + header=None, + names=["chrom", "start", "end", "Primer_ID", "PoolName", "direction"], + dtype={ + "chrom": str, + "start": int, + "end": int, + "Primer_ID": str, + "PoolName": str, + }, + usecols=(0, 1, 2, 3, 4, 5), + skiprows=0, + ) + if len(primers.index) < 1: + print("primer scheme file is empty", file=sys.stderr) + raise SystemExit(1) + if primers.isnull().sum().sum(): + print("malformed primer scheme file", file=sys.stderr) + raise SystemExit(1) + + # separate alt primers into a new dataframe + altFilter = primers["Primer_ID"].str.contains("_alt") + alts = pd.DataFrame( + columns=("chrom", "start", "end", "Primer_ID", "PoolName", "direction") + ) + alts = pd.concat([alts, primers[altFilter]]) + primers = primers.drop(primers[altFilter].index.values) + + # convert the primers dataframe to dictionary, indexed by Primer_ID + # - verify_integrity is used to prevent duplicate Primer_IDs being processed + bedFile = primers.set_index( + "Primer_ID", drop=False, verify_integrity=True + ).T.to_dict() + + # if there were no alts, return the bedfile as a list of dicts + if len(alts.index) == 0: + return list(bedFile.values()) + + # merge alts + for _, row in alts.iterrows(): + primerID = row["Primer_ID"].split("_alt")[0] + + # check the bedFile if another version of this primer exists + if primerID not in bedFile: + + # add to the bed file and continue + bedFile[primerID] = row.to_dict() + continue + + # otherwise, we've got a primer ID we've already seen so merge the alt + mergedSite = merge_sites(bedFile[primerID], row) + + # update the bedFile + bedFile[primerID] = mergedSite + + # return the bedFile as a list + return [value for value in bedFile.values()] + + +def overlaps(coords, pos): + for v in coords: + if pos >= v["start"] and pos <= v["end"]: + return v + return False + + 
+def check_hash(filepath, manifest_hash): + with open(filepath, "rb") as fh: + data = fh.read() + hash_md5 = hashlib.md5(data).hexdigest() + if hash_md5 != manifest_hash: + print( + colored.yellow( + f"md5 hash for {str(filepath)} does not match manifest, if this scheme has been downloaded previously, please ensure the file is not corrupted or has been tampered with. If this is a new download, please raise this as an issue here: https://github.com/quick-lab/primerschemes/issues" + ), + file=sys.stderr, + ) + raise SystemExit(1) + + +def get_scheme( + scheme_name: str, + scheme_version: str, + scheme_directory: str, + scheme_length: int = False, +): + """Get the primer scheme and reference fasta file from the manifest + + Args: + scheme_name (str): Name of the scheme + scheme_version (str): Version of the scheme + scheme_directory (str): Directory where schemes are stored + scheme_length (int, optional): Length of the scheme. Defaults to False. + + Returns: + str: Path to the primer bed file + str: Path to the reference fasta file + str: Version of the scheme + """ + + try: + response = requests.get( + "https://raw.githubusercontent.com/quick-lab/primerschemes/main/index.json" + ) + except requests.exceptions.RequestException as error: + print(colored.red(f"Failed to fetch manifest with Exception: {error}")) + raise SystemExit(1) + + if response.status_code != 200: + print( + colored.red( + f"Failed to fetch primer schemes manifest with status code: {response.status_code}" + ) + ) + raise SystemExit(1) + + manifest = response.json() + + try: + response = requests.get( + "https://raw.githubusercontent.com/quick-lab/primerschemes/main/aliases.json" + ) + except requests.exceptions.RequestException as error: + print(colored.red(f"Failed to fetch scheme aliases with Exception: {error}")) + raise SystemExit(1) + + if response.status_code != 200: + print( + colored.red( + f"Failed to fetch primer schemes alias file with status code: {response.status_code}" + ) + ) + raise SystemExit(1) + + aliases = response.json() + + scheme_name = scheme_name.lower().rstrip() + + if not manifest["primerschemes"].get(scheme_name): + if not aliases.get(scheme_name): + print( + colored.red( + f"Requested scheme: {scheme_name} could not be found, please check https://github.com/quick-lab/primerschemes for available schemes or provide a scheme bed and fasta reference directly using --bed and --ref" + ) + ) + raise SystemExit(1) + + scheme_name = aliases[scheme_name] + + if not manifest["primerschemes"].get(scheme_name): + print( + colored.red( + f"Scheme name alias does not exist in manifest, this should never happen, please create an issue on https://github.com/quick-lab/primerschemes/issues or https://github.com/artic-network/fieldbioinformatics/issues if you see this message" + ) + ) + raise SystemExit(1) + + scheme = manifest["primerschemes"][scheme_name] + + if len(scheme.keys()) > 1: + if not scheme_length: + print( + colored.red( + f"Multiple lengths of the scheme {scheme_name} found, please provide a scheme length using --scheme-length, available lengths are: {', '.join(scheme.keys())}" + ) + ) + raise SystemExit(1) + + else: + scheme_length = list(scheme.keys())[0] + + if not scheme.get(scheme_length): + print( + colored.red( + f"Provided length: {scheme_length} of the scheme {scheme_name} not found, please provide one of the following lengths using --scheme-length: {', '.join(scheme.keys())}" + ) + ) + raise SystemExit(1) + + scheme_version = scheme_version.lower().rstrip() + + version_pattern = 
re.compile(r"v\d+\.\d+\.\d+") + + if not version_pattern.match(scheme_version): + print( + colored.red( + f"Invalid scheme version format, please provide a version in the format 'vX.X.X', e.g. v1.0.0" + ) + ) + raise SystemExit(1) + + if not scheme[scheme_length].get(scheme_version): + print( + colored.red( + f"Requested scheme version: {scheme_version} not found, available versions are: {', '.join(scheme[scheme_length].keys())}" + ) + ) + raise SystemExit(1) + + scheme = scheme[scheme_length][scheme_version] + + scheme_path = os.path.join( + scheme_directory, scheme_name, scheme_length, scheme_version + ) + + if not os.path.exists(scheme_path): + os.makedirs(scheme_path, exist_ok=True) + + bed_url = scheme["primer_bed_url"] + + bed_name = bed_url.split("/")[-1] + + bed_path = os.path.join(scheme_path, bed_name) + + if not os.path.exists(bed_path): + try: + response = requests.get(bed_url) + except requests.exceptions.RequestException as error: + print( + colored.red(f"Failed to fetch primer bed file with Exception: {error}") + ) + raise SystemExit(1) + + if response.status_code != 200: + print( + colored.red( + f"Failed to fetch primer bed file with status code: {response.status_code}" + ) + ) + raise SystemExit(1) + + with open(bed_path, "w") as bed_file: + bed_file.write(response.text) + + check_hash(bed_path, scheme["primer_bed_md5"]) + + ref_url = scheme["reference_fasta_url"] + + ref_name = ref_url.split("/")[-1] + + ref_path = os.path.join(scheme_path, ref_name) + + if not os.path.exists(ref_path): + try: + response = requests.get(ref_url) + except requests.exceptions.RequestException as error: + print( + colored.red( + f"Failed to fetch reference fasta file with Exception: {error}" + ) + ) + raise SystemExit(1) + + if response.status_code != 200: + print( + colored.red( + f"Failed to fetch reference fasta file with status code: {response.status_code}" + ) + ) + raise SystemExit(1) + + with open(ref_path, "w") as ref_file: + ref_file.write(response.text) + + check_hash(ref_path, scheme["reference_fasta_md5"]) + + return bed_path, ref_path, scheme_version + + +def get_scheme_legacy(scheme_name, scheme_directory, scheme_version="1"): + """Get and check the ARTIC primer scheme. + When determining a version, the original behaviour (parsing the scheme_name and + separating on /V ) is used over a specified scheme_version. If neither are + provided, the version defaults to 1. + If 0 is provided as version, the latest scheme will be downloaded. 
+ + Parameters + ---------- + scheme_name : str + The primer scheme name + scheme_directory : str + The directory containing the primer scheme and reference sequence + scheme_version : str + The primer scheme version (optional) + Returns + ------- + str + The location of the checked primer scheme + str + The location of the checked reference sequence + str + The version being used + """ + # try getting the version from the scheme name (old behaviour) + if scheme_name.find("/V") != -1: + scheme_name, scheme_version = scheme_name.split("/V") + + # create the filenames and check they exist + bed = "%s/%s/V%s/%s.scheme.bed" % ( + scheme_directory, + scheme_name, + scheme_version, + scheme_name, + ) + ref = "%s/%s/V%s/%s.reference.fasta" % ( + scheme_directory, + scheme_name, + scheme_version, + scheme_name, + ) + if os.path.exists(bed) and os.path.exists(ref): + return bed, ref, scheme_version + + # if they don't exist, try downloading them to the current directory + print( + colored.yellow( + "could not find primer scheme and reference sequence, downloading" + ), + file=sys.stderr, + ) + + try: + manifest = requests.get( + "https://raw.githubusercontent.com/artic-network/primer-schemes/master/schemes_manifest.json" + ).json() + except requests.exceptions.RequestException as error: + print("Manifest Exception:", error) + raise SystemExit(2) + + for scheme, scheme_contents in dict(manifest["schemes"]).items(): + if ( + scheme == scheme_name.lower() + or scheme_name.lower() in scheme_contents["aliases"] + ): + print( + colored.yellow( + f"\tfound requested scheme:\t{scheme} (using alias {scheme_name})" + ), + file=sys.stderr, + ) + if scheme_version == 0: + print( + colored.yellow( + f"Latest version for scheme {scheme} is -> {scheme_contents['latest_version']}" + ), + file=sys.stderr, + ) + scheme_version = scheme_contents["latest_version"] + elif scheme_version not in dict(scheme_contents["primer_urls"]).keys(): + print( + colored.yellow( + f"Requested scheme version {scheme_version} not found; using latest version ({scheme_contents['latest_version']}) instead" + ), + file=sys.stderr, + ) + scheme_version = scheme_contents["latest_version"] + bed = "%s/%s/V%s/%s.scheme.bed" % ( + scheme_directory, + scheme_name, + scheme_version, + scheme_name, + ) + ref = "%s/%s/V%s/%s.reference.fasta" % ( + scheme_directory, + scheme_name, + scheme_version, + scheme_name, + ) + + os.makedirs(os.path.dirname(bed), exist_ok=True) + with requests.get(scheme_contents["primer_urls"][scheme_version]) as fh: + open(bed, "wt").write(fh.text) + + os.makedirs(os.path.dirname(ref), exist_ok=True) + with requests.get(scheme_contents["reference_urls"][scheme_version]) as fh: + open(ref, "wt").write(fh.text) + + # check_scheme_hashes( + # bed, scheme_contents["primer_sha256_checksums"][scheme_version] + # ) + # check_scheme_hashes( + # ref, scheme_contents["reference_sha256_checksums"][scheme_version] + # ) + + return bed, ref, scheme_version + + print( + colored.yellow( + f"\tRequested scheme:\t{scheme_name} could not be found, exiting" + ), + file=sys.stderr, + ) + raise SystemExit(1) diff --git a/artic/vcf_filter.py b/artic/vcf_filter.py index 1ea2d474..91e2a648 100644 --- a/artic/vcf_filter.py +++ b/artic/vcf_filter.py @@ -1,31 +1,30 @@ -import vcf -import sys -from operator import attrgetter +from cyvcf2 import VCF, Writer from collections import defaultdict -from .vcftagprimersites import read_bed_file + def in_frame(v): if len(v.ALT) > 1: - print ("This code does not support multiple genotypes!") - raise SystemExit 
+ print("This code does not support multiple genotypes!") + raise SystemExit ref = v.REF alt = v.ALT[0] bases = len(alt) - len(ref) if not bases: - return True + return True if bases % 3 == 0: - return True + return True return False + class NanoporeFilter: def __init__(self, no_frameshifts): self.no_frameshifts = no_frameshifts pass def check_filter(self, v): - total_reads = float(v.INFO['TotalReads']) + total_reads = float(v.INFO["TotalReads"]) qual = v.QUAL - strandbias = float(v.INFO['StrandFisherTest']) + # strandbias = float(v.INFO["StrandFisherTest"]) if qual / total_reads < 3: return False @@ -34,8 +33,8 @@ def check_filter(self, v): return False if v.is_indel: - strand_fraction_by_strand = v.INFO['SupportFractionByStrand'] - if float(strand_fraction_by_strand[0]) < 0.5: + strand_fraction_by_strand = v.INFO["SupportFractionByStrand"] + if float(strand_fraction_by_strand[0]) < 0.5: return False if float(strand_fraction_by_strand[1]) < 0.5: @@ -46,12 +45,13 @@ def check_filter(self, v): return True + class MedakaFilter: def __init__(self, no_frameshifts): self.no_frameshifts = no_frameshifts def check_filter(self, v): - depth = v.INFO['DP'] + depth = v.INFO["DP"] if depth < 20: return False @@ -62,17 +62,14 @@ def check_filter(self, v): return False return True + def go(args): - vcf_reader = vcf.Reader(filename=args.inputvcf) - vcf_writer = vcf.Writer(open(args.output_pass_vcf, 'w'), vcf_reader) - vcf_writer_filtered = vcf.Writer(open(args.output_fail_vcf, 'w'), vcf_reader) - if args.nanopolish: - filter = NanoporeFilter(args.no_frameshifts) - elif args.medaka: - filter = MedakaFilter(args.no_frameshifts) - else: - print("Please specify a VCF type, i.e. --nanopolish or --medaka\n") - raise SystemExit + vcf_reader = VCF(args.inputvcf) + vcf_writer = Writer(args.output_pass_vcf, vcf_reader, "w") + vcf_writer.write_header() + vcf_writer_filtered = Writer(args.output_fail_vcf, vcf_reader, "w") + vcf_writer_filtered.write_header() + filter = MedakaFilter(args.no_frameshifts) variants = [v for v in vcf_reader] @@ -80,15 +77,15 @@ def go(args): for v in variants: indx = "%s-%s" % (v.CHROM, v.POS) group_variants[indx].append(v) - + for v in variants: - # if using medaka, we need to do a quick pre-filter to remove rubbish that we don't want adding to the mask - if args.medaka: - if v.INFO['DP'] <= 1: - continue - if v.QUAL < 20: - continue + # quick pre-filter to remove rubbish that we don't want adding to the mask + # if v.INFO["DP"] <= 1: + # continue + + if v.QUAL < 20: + continue # now apply the filter to send variants to PASS or FAIL file if filter.check_filter(v): @@ -100,29 +97,27 @@ def go(args): if len(group_variants[indx]) > 1: for check_variant in group_variants[indx]: if filter.check_filter(check_variant): - variant_passes = True - + variant_passes = True + if not variant_passes: vcf_writer_filtered.write_record(v) else: - print ("Suppress variant %s\n" % (v.POS)) + print("Suppress variant %s\n" % (v.POS)) + def main(): import argparse parser = argparse.ArgumentParser() - parser.add_argument('--nanopolish', action='store_true') - parser.add_argument('--medaka', action='store_true') - parser.add_argument('--no-frameshifts', action='store_true') - parser.add_argument('inputvcf') - parser.add_argument('output_pass_vcf') - parser.add_argument('output_fail_vcf') + parser.add_argument("--no-frameshifts", action="store_true") + parser.add_argument("inputvcf") + parser.add_argument("output_pass_vcf") + parser.add_argument("output_fail_vcf") args = parser.parse_args() go(args) + if 
__name__ == "__main__": main() - - diff --git a/artic/vcf_merge.py b/artic/vcf_merge.py index a900d697..57346a3d 100644 --- a/artic/vcf_merge.py +++ b/artic/vcf_merge.py @@ -1,59 +1,90 @@ -import vcf +from cyvcf2 import VCF, Writer import sys from operator import attrgetter from collections import defaultdict -from .vcftagprimersites import read_bed_file +from artic.utils import read_bed_file + def vcf_merge(args): - bed = read_bed_file(args.bedfile) + bed = read_bed_file(args.bedfile) + + primer_map = defaultdict(dict) + + for p in bed: + for n in range(p["start"], p["end"] + 1): + primer_map[p["PoolName"]][n] = p["Primer_ID"] - primer_map = defaultdict(dict) + template_vcf = None - for p in bed: - for n in range(p['start'], p['end']+1): - primer_map[p['PoolName']][n] = p['Primer_ID'] + pool_map = {} + for param in args.vcflist: + pool_name, file_name = param.split(":") + pool_map[file_name] = pool_name + if not template_vcf: + vcf_reader = VCF(file_name) + if vcf_reader.seqnames: + template_vcf = vcf_reader + else: + print( + f"Not using {file_name} as VCF template since it has no variants", + file=sys.stderr, + ) - first_vcf = None + # vcf_reader.infos["Pool"] = vcf.parser._Format("Pool", 1, "String", "The pool name") + vcf_reader.add_info_to_header( + {"ID": "Pool", "Number": 1, "Type": "String", "Description": "The pool name"} + ) + vcf_writer = Writer(f"{args.prefix}.merged.vcf", template_vcf, "w") + vcf_writer.write_header() + vcf_writer_primers = Writer(f"{args.prefix}.primers.vcf", template_vcf, "w") + vcf_writer_primers.write_header() - pool_map = {} - for param in args.vcflist: - pool_name, file_name = param.split(":") - pool_map[file_name] = pool_name - if not first_vcf: - first_vcf = file_name + variants = [] + for file_name, pool_name in pool_map.items(): + vcf_reader = VCF(file_name) + if not vcf_reader.seqnames: + print(f"Skipping {file_name} as it has no variants", file=sys.stderr) + continue - vcf_reader = vcf.Reader(filename=first_vcf) - vcf_reader.infos["Pool"] = vcf.parser._Format("Pool", 1, "String", "The pool name") - vcf_writer = vcf.Writer(open(args.prefix+'.merged.vcf', 'w'), vcf_reader) - vcf_writer_primers = vcf.Writer(open(args.prefix+'.primers.vcf', 'w'), vcf_reader) + vcf_reader.add_info_to_header( + { + "ID": "Pool", + "Number": 1, + "Type": "String", + "Description": "The pool name", + } + ) + for v in vcf_reader: + v.INFO["Pool"] = pool_name + variants.append(v) - variants = [] - for file_name, pool_name in pool_map.items(): - vcf_reader = vcf.Reader(filename=file_name) - for v in vcf_reader: - v.INFO['Pool'] = pool_name - variants.append(v) + variants.sort(key=attrgetter("CHROM", "POS")) - variants.sort(key=attrgetter('CHROM', 'POS')) + for v in variants: + if v.POS in primer_map[v.INFO["Pool"]]: + vcf_writer_primers.write_record(v) + print( + "found primer binding site mismatch: %s" + % (primer_map[v.INFO["Pool"]][v.POS]), + file=sys.stderr, + ) + else: + vcf_writer.write_record(v) - for v in variants: - if v.POS in primer_map[v.INFO['Pool']]: - vcf_writer_primers.write_record(v) - print("found primer binding site mismatch: %s" % (primer_map[v.INFO['Pool']][v.POS]), file=sys.stderr) - else: - vcf_writer.write_record(v) def main(): import argparse - parser = argparse.ArgumentParser(description='Trim alignments from an amplicon scheme.') - parser.add_argument('prefix') - parser.add_argument('bedfile') - parser.add_argument('vcflist', nargs='+') + parser = argparse.ArgumentParser( + description="Trim alignments from an amplicon scheme." 
+ ) + parser.add_argument("prefix") + parser.add_argument("bedfile") + parser.add_argument("vcflist", nargs="+") args = parser.parse_args() vcf_merge(args) + if __name__ == "__main__": main() - diff --git a/artic/vcfextract.py b/artic/vcfextract.py index a10140da..0c18ef26 100755 --- a/artic/vcfextract.py +++ b/artic/vcfextract.py @@ -1,58 +1,58 @@ #!/usr/bin/env python -import vcf import sys import subprocess -import csv import os from collections import defaultdict -from operator import attrgetter +from cyvcf2 import VCF + def read_vcf(fn): vcfinfo = {} - vcf_reader = vcf.Reader(open(fn, 'r')) + vcf_reader = VCF(fn) for record in vcf_reader: vcfinfo[record.POS] = record return vcfinfo + def collect_depths(bamfile): if not os.path.exists(bamfile): raise SystemExit("bamfile %s doesn't exist" % (bamfile,)) - p = subprocess.Popen(['samtools', 'depth', bamfile], stdout=subprocess.PIPE) + p = subprocess.Popen(["samtools", "depth", bamfile], stdout=subprocess.PIPE) out, err = p.communicate() depths = defaultdict(dict) - for ln in out.decode('utf-8').split("\n"): + for ln in out.decode("utf-8").split("\n"): if ln: contig, pos, depth = ln.split("\t") depths[int(pos)] = int(depth) return depths + def main(): positions = {} for sample_tag in sys.argv[1:]: - for vcfset in ['', '.primertrimmed']: + for vcfset in ["", ".primertrimmed"]: vcffn = "%s%s.vcf" % (sample_tag, vcfset) if not os.path.exists(vcffn): continue print(vcffn, file=sys.stderr) - vcf_reader = vcf.Reader(filename=vcffn) + vcf_reader = VCF(vcffn) for record in vcf_reader: if len(record.ALT[0]) == 1 and len(record.REF) == 1: - positions[record.POS] = 'snp' + positions[record.POS] = "snp" else: - positions[record.POS] = 'indel' - + positions[record.POS] = "indel" print("pos\tset\tsample\tvartype\tdepth\tsupportfraction\tbasecalledfrequency") - #for run, samples in runs.iteritems(): + # for run, samples in runs.iteritems(): # for sample_tag in samples.keys(): for sample_tag in sys.argv[1:]: - for vcfset in ['', '.primertrimmed']: + for vcfset in ["", ".primertrimmed"]: vcffn = "%s%s.vcf" % (sample_tag, vcfset) if not os.path.exists(vcffn): print("%s does not exist" % (vcffn)) @@ -62,19 +62,33 @@ def main(): bamfn = "%s.primertrimmed.sorted.bam" % (sample_tag) depths = collect_depths(bamfn) - #1-based pyvcf + # 1-based pyvcf for pos, variant_type in positions.items(): - if pos-1 in depths: - depth = depths[pos-1] + if pos - 1 in depths: + depth = depths[pos - 1] else: depth = 0 if pos in vcffile: info = vcffile[pos].INFO - print("%s\t%s\t%s\t%s\t%s\t%s\t%s" % (pos, vcfset, sample_tag, variant_type, depth, info['SupportFraction'], info['BaseCalledFraction'])) + print( + "%s\t%s\t%s\t%s\t%s\t%s\t%s" + % ( + pos, + vcfset, + sample_tag, + variant_type, + depth, + info["SupportFraction"], + info["BaseCalledFraction"], + ) + ) else: - print("%s\t%s\t%s\tinvariant\t%s\t0\t0" % (pos, vcfset, sample_tag, depth)) + print( + "%s\t%s\t%s\tinvariant\t%s\t0\t0" + % (pos, vcfset, sample_tag, depth) + ) + if __name__ == "__main__": main() - diff --git a/artic/vcftagprimersites.py b/artic/vcftagprimersites.py deleted file mode 100755 index 914afb2d..00000000 --- a/artic/vcftagprimersites.py +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env python - -import pandas as pd -import vcf -import sys -import subprocess -import csv -from collections import defaultdict - - -def getPrimerDirection(primerID): - """Infer the primer direction based on it's ID containing LEFT/RIGHT - - Parameters - ---------- - primerID : string - The primer ID from the 4th field of the 
primer scheme - """ - if 'LEFT' in primerID: - return '+' - elif 'RIGHT' in primerID: - return '-' - else: - print("LEFT/RIGHT must be specified in Primer ID", file=sys.stderr) - raise SystemExit(1) - - -def merge_sites(canonical, alt): - """Merges a canonical primer site with an alt site, producing an interval that encompasses both - - Parameters - ---------- - canonical : dict - The canonical primer site, provided as a dictionary of the bed file row - alt : dict - The alt primer site, provided as a dictionary of the bed file row - - Returns - ------- - dict - A dictionary of the merged site, where the dict represents a bed file row - """ - # base the merged site on the canonical - mergedSite = canonical - - # check the both the canonical and alt are the same direction - if canonical['direction'] != alt['direction']: - print( - "could not merge alt with different orientation to canonical", file=sys.stderr) - raise SystemExit(1) - - # merge the start/ends of the alt with the canonical to get the largest window possible - if alt['start'] < canonical['start']: - mergedSite['start'] = alt['start'] - if alt['end'] > canonical['end']: - mergedSite['end'] = alt['end'] - return mergedSite - - -def read_bed_file(fn): - """Parses a bed file and collapses alts into canonical primer sites - - Parameters - ---------- - fn : str - The bedfile to parse - - Returns - ------- - list - A list of dictionaries, where each dictionary contains a row of the parsed bedfile. - The available dictionary keys are - Primer_ID, direction, start, end - """ - - # read the primer scheme into a pandas dataframe and run type, length and null checks - primers = pd.read_csv(fn, sep='\t', header=None, - names=['chrom', 'start', 'end', - 'Primer_ID', 'PoolName'], - dtype={'chrom': str, 'start': int, 'end': int, - 'Primer_ID': str, 'PoolName': str}, - usecols=(0, 1, 2, 3, 4), - skiprows=0) - if len(primers.index) < 1: - print("primer scheme file is empty", file=sys.stderr) - raise SystemExit(1) - if primers.isnull().sum().sum(): - print("malformed primer scheme file", file=sys.stderr) - raise SystemExit(1) - - # compute the direction - primers['direction'] = primers.apply( - lambda row: getPrimerDirection(row.Primer_ID), axis=1) - - # separate alt primers into a new dataframe - altFilter = primers['Primer_ID'].str.contains('_alt') - alts = pd.DataFrame( - columns=('chrom', 'start', 'end', 'Primer_ID', 'PoolName', 'direction')) - alts = pd.concat([alts, primers[altFilter]]) - primers = primers.drop(primers[altFilter].index.values) - - # convert the primers dataframe to dictionary, indexed by Primer_ID - # - verify_integrity is used to prevent duplicate Primer_IDs being processed - bedFile = primers.set_index('Primer_ID', drop=False, - verify_integrity=True).T.to_dict() - - # if there were no alts, return the bedfile as a list of dicts - if len(alts.index) == 0: - return list(bedFile.values()) - - # merge alts - for _, row in alts.iterrows(): - primerID = row['Primer_ID'].split('_alt')[0] - - # check the bedFile if another version of this primer exists - if primerID not in bedFile: - - # add to the bed file and continue - bedFile[primerID] = row.to_dict() - continue - - # otherwise, we've got a primer ID we've already seen so merge the alt - mergedSite = merge_sites(bedFile[primerID], row) - - # update the bedFile - bedFile[primerID] = mergedSite - - # return the bedFile as a list - return [value for value in bedFile.values()] - - -def overlaps(coords, pos): - for v in coords: - if pos >= v['start'] and pos <= v['end']: - return 
v - return False - - -if __name__ == "__main__": - if sys.argv[1] not in sets: - print("Invalid set") - raise SystemExit(1) - - bedfile = read_bed_file(sys.argv[1]) - - vcf_reader = vcf.Reader(filename=sys.argv[2]) - vcf_writer = vcf.Writer(sys.stdout, vcf_reader) - for record in vcf_reader: - v = overlaps(bedfile, record.POS) - if v: - record.INFO['PRIMER'] = v["Sequence_(5-3')"] - -# PP = list(record.INFO) -# record.INFO = {} -# record.INFO['PP'] = PP -# record.INFO['DEPTH'] = depths[record.CHROM][record.POS] - - vcf_writer.write_record(record) diff --git a/artic/version.py b/artic/version.py index b3f9ac7f..3e8d9f94 100644 --- a/artic/version.py +++ b/artic/version.py @@ -1 +1 @@ -__version__ = "1.2.4" +__version__ = "1.4.0" diff --git a/docs/commands.md b/docs/commands.md index 68c5a5bc..ef1338c7 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -2,35 +2,16 @@ title: commands summary: The available artic pipeline commands. authors: + - Sam Wilkinson - Will Rowe - Nick Loman -date: 2020-03-30 +date: 2024-08-16 --- # Commands This page documents the available commands via the `artic` command line interface. ---- - -## basecaller - -### Overview - -Display basecallers in files - -### Input - -### Output - -### Usage example - -```bash -artic basecaller -``` - ---- - ## demultiplex ### Overview @@ -145,38 +126,6 @@ artic filter --max-length 500 --min-length 50 --- -## gather - -### Overview - -Gather up demultiplexed files - -### Input - -- director[y/ies] to gather from - -### Output - -- directory of gathered files - -### Usage example - -```bash -artic gather --directory ./ -``` - -| Argument name(s) | Required | Default value | Description | -| :----------------- | :------- | :------------ | :---------------------------------------- | -| --directory | Y | NA | The director[y/ies] to gather files from | -| prefix | Y | NA | Prefix for gathered files | -| --max-length | N | NA | Remove reads greater than max-length | -| --min-length | N | NA | Remove reads less than than min-length | -| --prompt-directory | N | NA | The run directory for interactive prompts | -| --fast5-directory | N | NA | The directory with fast5 files | -| --no-fast5s | N | NA | Do not use fast5s and nanopolish | - ---- - ## guppyplex ### Overview @@ -233,23 +182,20 @@ artic minion | :------------------- | :------- | :------------- | :------------------------------------------------------------------------------------------- | | scheme | Y | NA | The name of the primer scheme | | sample | Y | NA | The name of the sample | -| --medaka | N | False | Use medaka instead of nanopolish for variants | -| --medaka-model | * | NA | Medaka model to use (required if --medaka set) | -| --minimap2 | N | True | Use minimap2 | -| --bwa | N | False | Use bwa instead of minimap2 | +| --clair3 | N | False | Use clair3 instead of medaka for variants (experimental feature from v1.4.0) | +| --model | Y | NA | Medaka or Clair3 model to use | | --normalise | N | 100 | Normalise down to moderate coverage to save runtime | | --threads | N | 8 | Number of threads | | --scheme-directory | N | /artic/schemes | Default scheme directory | | --max-haplotypes | N | 1000000 | Max-haplotypes value for nanopolish | | --read-file | N | NA | Use alternative FASTA/FASTQ file to .fasta | -| --fast5-directory | N | NA | FAST5 Directory | -| --sequencing-summary | N | NA | Path to Guppy sequencing summary | -| --skip-nanopolish | N | False | Skip nanopolish | | --no-longshot | N | False | Use medaka variant instead of longshot (experimental feautre 
from v1.2.0) |
-| --strict             | N        | False          | Enables experimental features (from v1.2.0), including VFC overlap checks and stats          |
+| --min-mapq           | N        | 20             | Remove reads which map to the reference with a lower mapping quality than this               |
+| --no-indels          | N        | False          | Ignore insertions and deletions during variant calling, maintaining the co-ordinates of the reference |
+| --no-frameshifts     | N        | False          | Do not allow frameshift variants (indels whose length is not divisible by 3) to be added to the consensus |
+| --use-muscle         | N        | False          | Use muscle to align the produced consensus sequence against the reference                    |
 | --dry-run            | N        | False          | Perform a dry run of the minion pipeline, outputing commands to a log but not executing them |
 
-* `--medaka-model` is required if `--medaka` is set.
 
 ---
 
diff --git a/docs/faq.md b/docs/faq.md
index 7cf5f954..25cc14a9 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -13,9 +13,9 @@ date: 2020-03-30
 
 The standard operating proceedure for the ARTIC Network SARS-SoV-2 bioinformatics can be found [here](https://artic.network/ncov-2019/ncov2019-bioinformatics-sop.html).
 
-## Should I use the nanopolish or medaka workflow
+## Should I use the medaka or clair3 workflow
 
-We currently recommend the nanopolish workflow (if you have signal data available) as we have spent more time validating and supporting this workflow. That being said, both tend to give consistent results with our test datasets so the choice is yours.
+We currently recommend the medaka workflow as we have spent more time validating and supporting this workflow. That being said, both tend to give consistent results with our test datasets so the choice is yours.
 
 ## Lab-on-an-SSD
 
diff --git a/docs/installation.md b/docs/installation.md
index f5c96200..ce41f45d 100644
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -9,7 +9,7 @@ date: 2020-03-30
 
 # Installation
 
-As of [release 1.1.0](https://github.com/artic-network/fieldbioinformatics/releases/tag/1.1.0), it is probably easiest to install the `artic pipeline` using conda. Alternatively, you can install the pipeline itself via source/pip but you will have to also satisfy the pipeline dependencies.
+As of [release 1.4.0](https://github.com/artic-network/fieldbioinformatics/releases/tag/1.4.0), conda installation of fieldbioinformatics will become difficult due to the mutually exclusive requirements of medaka and clair3. For this reason we recommend either using the docker image [available here](https://quay.io/repository/artic/fieldbioinformatics) or building the package from source after installing the dependencies via conda.
 
 ## Via conda
 
@@ -19,21 +19,21 @@ conda install -c bioconda artic
 
 ## Via source
 
-### 1. installing the pipeline
+### 1. installing dependencies
+
+The `artic pipeline` has several [software dependencies](https://github.com/artic-network/fieldbioinformatics/blob/master/environment.yml). You can solve these dependencies using the minimal conda environment we have provided; we strongly recommend that you use either the mamba solver or conda version >= 23.10.0, since the libmamba solver is now the default conda solver:
 
 ```sh
 git clone https://github.com/artic-network/fieldbioinformatics
 cd fieldbioinformatics
-python setup.py install
+conda env create -f environment.yml
 ```
 
-### 2. installing dependencies
-
-The `artic pipeline` has several [software dependencies](https://github.com/artic-network/fieldbioinformatics/blob/master/environment.yml). 
You can solve these dependencies using the minimal conda environment we have provided: +### 2. installing the pipeline ```sh -conda env create -f environment.yml conda activate artic +pip install . ``` ### 3. testing the pipeline @@ -47,7 +47,7 @@ artic -v To check that you have all the required dependencies, you can try the pipeline tests with both workflows: ``` -./test-runner.sh nanopolish +./test-runner.sh clair3 ./test-runner.sh medaka ``` diff --git a/environment.yml b/environment.yml index 1b4b891f..3f0c52d1 100644 --- a/environment.yml +++ b/environment.yml @@ -4,27 +4,25 @@ channels: - bioconda - defaults dependencies: - - artic-porechop =0.3.2pre - - artic-tools =0.2.6 - - longshot =0.4.5 - - bcftools =1.17 - - biopython =1.81 - - bwa = 0.7.17 - - clint =0.5.1 - - htslib =1.17 - - medaka >=1.8.0 - - minimap2 =2.26 - - multiqc =1.15 - - muscle =3.8 - - nanopolish =0.14.0 - - pandas =2.0.3 + - longshot + - bcftools + - biopython + - bwa + - clint + - htslib + - minimap2 + - clair3 + - multiqc + - muscle<5.1 + - pandas - pip - - pysam =0.21.0 + - pysam - pytest - - python =3.9.15 - - pyvcf =0.6.8 - - pyfaidx =0.6.0 - - requests =2.31.0 - - samtools =1.17 - - tqdm =4.66.1 - - keras-preprocessing =1.1.2 + - python + - cyvcf2 + - pyfaidx + - requests + - samtools + - tqdm + - pip: + - medaka \ No newline at end of file diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 9dc60131..00000000 --- a/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ -biopython -clint -pandas -pysam -pytest -pyvcf -requests -tqdm \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..9afe6d2e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,39 @@ +[metadata] +name = artic +version = 1.4.0 +author = Nick Loman +author_email = n.j.loman@bham.ac.uk +maintainer = Sam Wilkinson +maintainer_email = s.a.j.wilkinson@bham.ac.uk +description = ``artic`` is a pipeline for working with viral amplicon sequencing data +long_description = file: README.md +long_description_content_type = text/markdown +license_files = LICENSE +classifiers = + Programming Language :: Python :: 3 + Development Status :: 4 - Beta + Intended Audience :: Science/Research + License :: OSI Approved :: MIT + Topic :: Scientific/Engineering :: Bio-Informatics + +[options] +zip_safe = False +include_package_data = True +packages = find: +python_requires = <3.11 + +[options.entry_points] +console_scripts = + artic=artic.pipeline:main + align_trim=artic.align_trim:main + align_trim_n=artic.align_trim_n:main + margin_cons=artic.margin_cons:main + margin_cons_medaka=artic.margin_cons_medaka:main + vcfextract=artic.vcfextract:main + artic_vcf_merge=artic.vcf_merge:main + artic_vcf_filter=artic.vcf_filter:main + artic_make_depth_mask=artic.make_depth_mask:main + artic_fasta_header=artic.fasta_header:main + artic_mask=artic.mask:main + artic_get_stats=artic.artic_mqc:main + diff --git a/setup.py b/setup.py index 236f0679..60684932 100644 --- a/setup.py +++ b/setup.py @@ -1,53 +1,3 @@ -import os from setuptools import setup -version_py = os.path.join(os.path.dirname(__file__), 'artic', 'version.py') -version = open(version_py).read().strip().split( - '=')[-1].replace('"', '').strip() -long_description = """ -``artic`` is a pipeline for working with virus sequencing data sequenced with nanopore -""" - -HERE = os.path.dirname(__file__) - -with open(os.path.join(HERE, "requirements.txt"), "r") as f: - install_requires = [x.strip() for x in f.readlines()] - -setup( - name="artic", - version=version, - 
install_requires=install_requires, - requires=['python (>=3.5)'], - packages=['artic'], - author="Nick Loman", - description='A toolset for working with nanopore sequencing data', - long_description=long_description, - url="https://artic.network/ncov-2019/ncov2019-bioinformatics-sop.html", - package_dir={'artic': "artic"}, - package_data={'artic': []}, - zip_safe=False, - include_package_data=True, - entry_points={ - 'console_scripts': [ - 'artic=artic.pipeline:main', - 'align_trim=artic.align_trim:main', - 'align_trim_n=artic.align_trim_n:main', - 'margin_cons=artic.margin_cons:main', - 'margin_cons_medaka=artic.margin_cons_medaka:main', - 'vcfextract=artic.vcfextract:main', - 'artic_vcf_merge=artic.vcf_merge:main', - 'artic_vcf_filter=artic.vcf_filter:main', - 'artic_make_depth_mask=artic.make_depth_mask:main', - 'artic_fasta_header=artic.fasta_header:main', - 'artic_mask=artic.mask:main', - 'artic_get_stats=artic.artic_mqc:main', - ], - }, - author_email="n.j.loman@bham.ac.uk", - classifiers=[ - 'Development Status :: 4 - Beta', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT', - 'Topic :: Scientific/Engineering :: Bio-Informatics' - ] -) +setup() diff --git a/test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.scheme.bed b/test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.scheme.bed index 612bdee4..0127cb9e 100644 --- a/test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.scheme.bed +++ b/test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.scheme.bed @@ -1,122 +1,122 @@ -BTB20484 27 55 Ebov-DRC_1_LEFT Ebov-DRC_1 -BTB20484 383 407 Ebov-DRC_1_RIGHT Ebov-DRC_1 -BTB20484 350 376 Ebov-DRC_2_LEFT Ebov-DRC_2 -BTB20484 707 730 Ebov-DRC_2_RIGHT Ebov-DRC_2 -BTB20484 665 687 Ebov-DRC_3_LEFT Ebov-DRC_1 -BTB20484 1033 1055 Ebov-DRC_3_RIGHT Ebov-DRC_1 -BTB20484 994 1017 Ebov-DRC_4_LEFT Ebov-DRC_2 -BTB20484 1354 1376 Ebov-DRC_4_RIGHT Ebov-DRC_2 -BTB20484 1294 1316 Ebov-DRC_5_LEFT Ebov-DRC_1 -BTB20484 1680 1702 Ebov-DRC_5_RIGHT Ebov-DRC_1 -BTB20484 1625 1647 Ebov-DRC_6_LEFT Ebov-DRC_2 -BTB20484 1995 2017 Ebov-DRC_6_RIGHT Ebov-DRC_2 -BTB20484 1926 1948 Ebov-DRC_7_LEFT Ebov-DRC_1 -BTB20484 2317 2339 Ebov-DRC_7_RIGHT Ebov-DRC_1 -BTB20484 2238 2260 Ebov-DRC_8_LEFT Ebov-DRC_2 -BTB20484 2630 2652 Ebov-DRC_8_RIGHT Ebov-DRC_2 -BTB20484 2540 2565 Ebov-DRC_9_LEFT Ebov-DRC_1 -BTB20484 2895 2920 Ebov-DRC_9_RIGHT Ebov-DRC_1 -BTB20484 2843 2872 Ebov-DRC_10_LEFT Ebov-DRC_2 -BTB20484 3215 3237 Ebov-DRC_10_RIGHT Ebov-DRC_2 -BTB20484 3155 3177 Ebov-DRC_11_LEFT Ebov-DRC_1 -BTB20484 3524 3546 Ebov-DRC_11_RIGHT Ebov-DRC_1 -BTB20484 3460 3485 Ebov-DRC_12_LEFT Ebov-DRC_2 -BTB20484 3821 3843 Ebov-DRC_12_RIGHT Ebov-DRC_2 -BTB20484 3769 3792 Ebov-DRC_13_LEFT Ebov-DRC_1 -BTB20484 4121 4149 Ebov-DRC_13_RIGHT Ebov-DRC_1 -BTB20484 4014 4036 Ebov-DRC_14_LEFT Ebov-DRC_2 -BTB20484 4369 4395 Ebov-DRC_14_RIGHT Ebov-DRC_2 -BTB20484 4287 4311 Ebov-DRC_15_LEFT Ebov-DRC_1 -BTB20484 4674 4696 Ebov-DRC_15_RIGHT Ebov-DRC_1 -BTB20484 4606 4628 Ebov-DRC_16_LEFT Ebov-DRC_2 -BTB20484 4990 5014 Ebov-DRC_16_RIGHT Ebov-DRC_2 -BTB20484 4945 4967 Ebov-DRC_17_LEFT Ebov-DRC_1 -BTB20484 5315 5338 Ebov-DRC_17_RIGHT Ebov-DRC_1 -BTB20484 5235 5260 Ebov-DRC_18_LEFT Ebov-DRC_2 -BTB20484 5585 5615 Ebov-DRC_18_RIGHT Ebov-DRC_2 -BTB20484 5528 5556 Ebov-DRC_19_LEFT Ebov-DRC_1 -BTB20484 5895 5918 Ebov-DRC_19_RIGHT Ebov-DRC_1 -BTB20484 5827 5849 Ebov-DRC_20_LEFT Ebov-DRC_2 -BTB20484 6211 6237 Ebov-DRC_20_RIGHT Ebov-DRC_2 -BTB20484 6164 6188 Ebov-DRC_21_LEFT Ebov-DRC_1 -BTB20484 6535 6558 Ebov-DRC_21_RIGHT Ebov-DRC_1 -BTB20484 6479 6501 
Ebov-DRC_22_LEFT Ebov-DRC_2 -BTB20484 6859 6881 Ebov-DRC_22_RIGHT Ebov-DRC_2 -BTB20484 6791 6813 Ebov-DRC_23_LEFT Ebov-DRC_1 -BTB20484 7184 7207 Ebov-DRC_23_RIGHT Ebov-DRC_1 -BTB20484 7140 7162 Ebov-DRC_24_LEFT Ebov-DRC_2 -BTB20484 7510 7532 Ebov-DRC_24_RIGHT Ebov-DRC_2 -BTB20484 7437 7459 Ebov-DRC_25_LEFT Ebov-DRC_1 -BTB20484 7805 7827 Ebov-DRC_25_RIGHT Ebov-DRC_1 -BTB20484 7741 7763 Ebov-DRC_26_LEFT Ebov-DRC_2 -BTB20484 8097 8121 Ebov-DRC_26_RIGHT Ebov-DRC_2 -BTB20484 8056 8086 Ebov-DRC_27_LEFT Ebov-DRC_1 -BTB20484 8417 8440 Ebov-DRC_27_RIGHT Ebov-DRC_1 -BTB20484 8364 8387 Ebov-DRC_28_LEFT Ebov-DRC_2 -BTB20484 8737 8764 Ebov-DRC_28_RIGHT Ebov-DRC_2 -BTB20484 8694 8716 Ebov-DRC_29_LEFT Ebov-DRC_1 -BTB20484 9056 9081 Ebov-DRC_29_RIGHT Ebov-DRC_1 -BTB20484 9024 9046 Ebov-DRC_30_LEFT Ebov-DRC_2 -BTB20484 9375 9404 Ebov-DRC_30_RIGHT Ebov-DRC_2 -BTB20484 9307 9332 Ebov-DRC_31_LEFT Ebov-DRC_1 -BTB20484 9666 9688 Ebov-DRC_31_RIGHT Ebov-DRC_1 -BTB20484 9602 9624 Ebov-DRC_32_LEFT Ebov-DRC_2 -BTB20484 9982 10006 Ebov-DRC_32_RIGHT Ebov-DRC_2 -BTB20484 9937 9961 Ebov-DRC_33_LEFT Ebov-DRC_1 -BTB20484 10305 10330 Ebov-DRC_33_RIGHT Ebov-DRC_1 -BTB20484 10237 10259 Ebov-DRC_34_LEFT Ebov-DRC_2 -BTB20484 10617 10639 Ebov-DRC_34_RIGHT Ebov-DRC_2 -BTB20484 10539 10561 Ebov-DRC_35_LEFT Ebov-DRC_1 -BTB20484 10923 10945 Ebov-DRC_35_RIGHT Ebov-DRC_1 -BTB20484 10860 10887 Ebov-DRC_36_LEFT Ebov-DRC_2 -BTB20484 11247 11270 Ebov-DRC_36_RIGHT Ebov-DRC_2 -BTB20484 11152 11180 Ebov-DRC_37_LEFT Ebov-DRC_1 -BTB20484 11517 11539 Ebov-DRC_37_RIGHT Ebov-DRC_1 -BTB20484 11385 11409 Ebov-DRC_38_LEFT Ebov-DRC_2 -BTB20484 11770 11792 Ebov-DRC_38_RIGHT Ebov-DRC_2 -BTB20484 11700 11722 Ebov-DRC_39_LEFT Ebov-DRC_1 -BTB20484 12073 12098 Ebov-DRC_39_RIGHT Ebov-DRC_1 -BTB20484 12040 12063 Ebov-DRC_40_LEFT Ebov-DRC_2 -BTB20484 12397 12420 Ebov-DRC_40_RIGHT Ebov-DRC_2 -BTB20484 12346 12370 Ebov-DRC_41_LEFT Ebov-DRC_1 -BTB20484 12709 12733 Ebov-DRC_41_RIGHT Ebov-DRC_1 -BTB20484 12645 12667 Ebov-DRC_42_LEFT Ebov-DRC_2 -BTB20484 13032 13054 Ebov-DRC_42_RIGHT Ebov-DRC_2 -BTB20484 12961 12983 Ebov-DRC_43_LEFT Ebov-DRC_1 -BTB20484 13358 13380 Ebov-DRC_43_RIGHT Ebov-DRC_1 -BTB20484 13313 13336 Ebov-DRC_44_LEFT Ebov-DRC_2 -BTB20484 13681 13704 Ebov-DRC_44_RIGHT Ebov-DRC_2 -BTB20484 13639 13661 Ebov-DRC_45_LEFT Ebov-DRC_1 -BTB20484 14004 14026 Ebov-DRC_45_RIGHT Ebov-DRC_1 -BTB20484 13941 13969 Ebov-DRC_46_LEFT Ebov-DRC_2 -BTB20484 14325 14352 Ebov-DRC_46_RIGHT Ebov-DRC_2 -BTB20484 14274 14296 Ebov-DRC_47_LEFT Ebov-DRC_1 -BTB20484 14646 14671 Ebov-DRC_47_RIGHT Ebov-DRC_1 -BTB20484 14590 14612 Ebov-DRC_48_LEFT Ebov-DRC_2 -BTB20484 14951 14974 Ebov-DRC_48_RIGHT Ebov-DRC_2 -BTB20484 14913 14935 Ebov-DRC_49_LEFT Ebov-DRC_1 -BTB20484 15272 15294 Ebov-DRC_49_RIGHT Ebov-DRC_1 -BTB20484 15202 15224 Ebov-DRC_50_LEFT Ebov-DRC_2 -BTB20484 15575 15597 Ebov-DRC_50_RIGHT Ebov-DRC_2 -BTB20484 15504 15529 Ebov-DRC_51_LEFT Ebov-DRC_1 -BTB20484 15899 15922 Ebov-DRC_51_RIGHT Ebov-DRC_1 -BTB20484 15844 15866 Ebov-DRC_52_LEFT Ebov-DRC_2 -BTB20484 16207 16229 Ebov-DRC_52_RIGHT Ebov-DRC_2 -BTB20484 16171 16193 Ebov-DRC_53_LEFT Ebov-DRC_1 -BTB20484 16530 16553 Ebov-DRC_53_RIGHT Ebov-DRC_1 -BTB20484 16470 16492 Ebov-DRC_54_LEFT Ebov-DRC_2 -BTB20484 16848 16870 Ebov-DRC_54_RIGHT Ebov-DRC_2 -BTB20484 16777 16800 Ebov-DRC_55_LEFT Ebov-DRC_1 -BTB20484 17144 17166 Ebov-DRC_55_RIGHT Ebov-DRC_1 -BTB20484 17085 17107 Ebov-DRC_56_LEFT Ebov-DRC_2 -BTB20484 17447 17472 Ebov-DRC_56_RIGHT Ebov-DRC_2 -BTB20484 17381 17406 Ebov-DRC_57_LEFT Ebov-DRC_1 -BTB20484 17748 17772 
Ebov-DRC_57_RIGHT Ebov-DRC_1 -BTB20484 17649 17675 Ebov-DRC_58_LEFT Ebov-DRC_2 -BTB20484 18010 18032 Ebov-DRC_58_RIGHT Ebov-DRC_2 -BTB20484 17940 17967 Ebov-DRC_59_LEFT Ebov-DRC_1 -BTB20484 18324 18353 Ebov-DRC_59_RIGHT Ebov-DRC_1 -BTB20484 18209 18237 Ebov-DRC_60_LEFT Ebov-DRC_2 -BTB20484 18561 18589 Ebov-DRC_60_RIGHT Ebov-DRC_2 -BTB20484 18539 18565 Ebov-DRC_61_LEFT Ebov-DRC_1 -BTB20484 18898 18920 Ebov-DRC_61_RIGHT Ebov-DRC_1 +BTB20484 27 55 Ebov-DRC_1_LEFT Ebov-DRC_1 + +BTB20484 383 407 Ebov-DRC_1_RIGHT Ebov-DRC_1 - +BTB20484 350 376 Ebov-DRC_2_LEFT Ebov-DRC_2 + +BTB20484 707 730 Ebov-DRC_2_RIGHT Ebov-DRC_2 - +BTB20484 665 687 Ebov-DRC_3_LEFT Ebov-DRC_1 + +BTB20484 1033 1055 Ebov-DRC_3_RIGHT Ebov-DRC_1 - +BTB20484 994 1017 Ebov-DRC_4_LEFT Ebov-DRC_2 + +BTB20484 1354 1376 Ebov-DRC_4_RIGHT Ebov-DRC_2 - +BTB20484 1294 1316 Ebov-DRC_5_LEFT Ebov-DRC_1 + +BTB20484 1680 1702 Ebov-DRC_5_RIGHT Ebov-DRC_1 - +BTB20484 1625 1647 Ebov-DRC_6_LEFT Ebov-DRC_2 + +BTB20484 1995 2017 Ebov-DRC_6_RIGHT Ebov-DRC_2 - +BTB20484 1926 1948 Ebov-DRC_7_LEFT Ebov-DRC_1 + +BTB20484 2317 2339 Ebov-DRC_7_RIGHT Ebov-DRC_1 - +BTB20484 2238 2260 Ebov-DRC_8_LEFT Ebov-DRC_2 + +BTB20484 2630 2652 Ebov-DRC_8_RIGHT Ebov-DRC_2 - +BTB20484 2540 2565 Ebov-DRC_9_LEFT Ebov-DRC_1 + +BTB20484 2895 2920 Ebov-DRC_9_RIGHT Ebov-DRC_1 - +BTB20484 2843 2872 Ebov-DRC_10_LEFT Ebov-DRC_2 + +BTB20484 3215 3237 Ebov-DRC_10_RIGHT Ebov-DRC_2 - +BTB20484 3155 3177 Ebov-DRC_11_LEFT Ebov-DRC_1 + +BTB20484 3524 3546 Ebov-DRC_11_RIGHT Ebov-DRC_1 - +BTB20484 3460 3485 Ebov-DRC_12_LEFT Ebov-DRC_2 + +BTB20484 3821 3843 Ebov-DRC_12_RIGHT Ebov-DRC_2 - +BTB20484 3769 3792 Ebov-DRC_13_LEFT Ebov-DRC_1 + +BTB20484 4121 4149 Ebov-DRC_13_RIGHT Ebov-DRC_1 - +BTB20484 4014 4036 Ebov-DRC_14_LEFT Ebov-DRC_2 + +BTB20484 4369 4395 Ebov-DRC_14_RIGHT Ebov-DRC_2 - +BTB20484 4287 4311 Ebov-DRC_15_LEFT Ebov-DRC_1 + +BTB20484 4674 4696 Ebov-DRC_15_RIGHT Ebov-DRC_1 - +BTB20484 4606 4628 Ebov-DRC_16_LEFT Ebov-DRC_2 + +BTB20484 4990 5014 Ebov-DRC_16_RIGHT Ebov-DRC_2 - +BTB20484 4945 4967 Ebov-DRC_17_LEFT Ebov-DRC_1 + +BTB20484 5315 5338 Ebov-DRC_17_RIGHT Ebov-DRC_1 - +BTB20484 5235 5260 Ebov-DRC_18_LEFT Ebov-DRC_2 + +BTB20484 5585 5615 Ebov-DRC_18_RIGHT Ebov-DRC_2 - +BTB20484 5528 5556 Ebov-DRC_19_LEFT Ebov-DRC_1 + +BTB20484 5895 5918 Ebov-DRC_19_RIGHT Ebov-DRC_1 - +BTB20484 5827 5849 Ebov-DRC_20_LEFT Ebov-DRC_2 + +BTB20484 6211 6237 Ebov-DRC_20_RIGHT Ebov-DRC_2 - +BTB20484 6164 6188 Ebov-DRC_21_LEFT Ebov-DRC_1 + +BTB20484 6535 6558 Ebov-DRC_21_RIGHT Ebov-DRC_1 - +BTB20484 6479 6501 Ebov-DRC_22_LEFT Ebov-DRC_2 + +BTB20484 6859 6881 Ebov-DRC_22_RIGHT Ebov-DRC_2 - +BTB20484 6791 6813 Ebov-DRC_23_LEFT Ebov-DRC_1 + +BTB20484 7184 7207 Ebov-DRC_23_RIGHT Ebov-DRC_1 - +BTB20484 7140 7162 Ebov-DRC_24_LEFT Ebov-DRC_2 + +BTB20484 7510 7532 Ebov-DRC_24_RIGHT Ebov-DRC_2 - +BTB20484 7437 7459 Ebov-DRC_25_LEFT Ebov-DRC_1 + +BTB20484 7805 7827 Ebov-DRC_25_RIGHT Ebov-DRC_1 - +BTB20484 7741 7763 Ebov-DRC_26_LEFT Ebov-DRC_2 + +BTB20484 8097 8121 Ebov-DRC_26_RIGHT Ebov-DRC_2 - +BTB20484 8056 8086 Ebov-DRC_27_LEFT Ebov-DRC_1 + +BTB20484 8417 8440 Ebov-DRC_27_RIGHT Ebov-DRC_1 - +BTB20484 8364 8387 Ebov-DRC_28_LEFT Ebov-DRC_2 + +BTB20484 8737 8764 Ebov-DRC_28_RIGHT Ebov-DRC_2 - +BTB20484 8694 8716 Ebov-DRC_29_LEFT Ebov-DRC_1 + +BTB20484 9056 9081 Ebov-DRC_29_RIGHT Ebov-DRC_1 - +BTB20484 9024 9046 Ebov-DRC_30_LEFT Ebov-DRC_2 + +BTB20484 9375 9404 Ebov-DRC_30_RIGHT Ebov-DRC_2 - +BTB20484 9307 9332 Ebov-DRC_31_LEFT Ebov-DRC_1 + +BTB20484 9666 9688 Ebov-DRC_31_RIGHT Ebov-DRC_1 - +BTB20484 9602 9624 
Ebov-DRC_32_LEFT Ebov-DRC_2 + +BTB20484 9982 10006 Ebov-DRC_32_RIGHT Ebov-DRC_2 - +BTB20484 9937 9961 Ebov-DRC_33_LEFT Ebov-DRC_1 + +BTB20484 10305 10330 Ebov-DRC_33_RIGHT Ebov-DRC_1 - +BTB20484 10237 10259 Ebov-DRC_34_LEFT Ebov-DRC_2 + +BTB20484 10617 10639 Ebov-DRC_34_RIGHT Ebov-DRC_2 - +BTB20484 10539 10561 Ebov-DRC_35_LEFT Ebov-DRC_1 + +BTB20484 10923 10945 Ebov-DRC_35_RIGHT Ebov-DRC_1 - +BTB20484 10860 10887 Ebov-DRC_36_LEFT Ebov-DRC_2 + +BTB20484 11247 11270 Ebov-DRC_36_RIGHT Ebov-DRC_2 - +BTB20484 11152 11180 Ebov-DRC_37_LEFT Ebov-DRC_1 + +BTB20484 11517 11539 Ebov-DRC_37_RIGHT Ebov-DRC_1 - +BTB20484 11385 11409 Ebov-DRC_38_LEFT Ebov-DRC_2 + +BTB20484 11770 11792 Ebov-DRC_38_RIGHT Ebov-DRC_2 - +BTB20484 11700 11722 Ebov-DRC_39_LEFT Ebov-DRC_1 + +BTB20484 12073 12098 Ebov-DRC_39_RIGHT Ebov-DRC_1 - +BTB20484 12040 12063 Ebov-DRC_40_LEFT Ebov-DRC_2 + +BTB20484 12397 12420 Ebov-DRC_40_RIGHT Ebov-DRC_2 - +BTB20484 12346 12370 Ebov-DRC_41_LEFT Ebov-DRC_1 + +BTB20484 12709 12733 Ebov-DRC_41_RIGHT Ebov-DRC_1 - +BTB20484 12645 12667 Ebov-DRC_42_LEFT Ebov-DRC_2 + +BTB20484 13032 13054 Ebov-DRC_42_RIGHT Ebov-DRC_2 - +BTB20484 12961 12983 Ebov-DRC_43_LEFT Ebov-DRC_1 + +BTB20484 13358 13380 Ebov-DRC_43_RIGHT Ebov-DRC_1 - +BTB20484 13313 13336 Ebov-DRC_44_LEFT Ebov-DRC_2 + +BTB20484 13681 13704 Ebov-DRC_44_RIGHT Ebov-DRC_2 - +BTB20484 13639 13661 Ebov-DRC_45_LEFT Ebov-DRC_1 + +BTB20484 14004 14026 Ebov-DRC_45_RIGHT Ebov-DRC_1 - +BTB20484 13941 13969 Ebov-DRC_46_LEFT Ebov-DRC_2 + +BTB20484 14325 14352 Ebov-DRC_46_RIGHT Ebov-DRC_2 - +BTB20484 14274 14296 Ebov-DRC_47_LEFT Ebov-DRC_1 + +BTB20484 14646 14671 Ebov-DRC_47_RIGHT Ebov-DRC_1 - +BTB20484 14590 14612 Ebov-DRC_48_LEFT Ebov-DRC_2 + +BTB20484 14951 14974 Ebov-DRC_48_RIGHT Ebov-DRC_2 - +BTB20484 14913 14935 Ebov-DRC_49_LEFT Ebov-DRC_1 + +BTB20484 15272 15294 Ebov-DRC_49_RIGHT Ebov-DRC_1 - +BTB20484 15202 15224 Ebov-DRC_50_LEFT Ebov-DRC_2 + +BTB20484 15575 15597 Ebov-DRC_50_RIGHT Ebov-DRC_2 - +BTB20484 15504 15529 Ebov-DRC_51_LEFT Ebov-DRC_1 + +BTB20484 15899 15922 Ebov-DRC_51_RIGHT Ebov-DRC_1 - +BTB20484 15844 15866 Ebov-DRC_52_LEFT Ebov-DRC_2 + +BTB20484 16207 16229 Ebov-DRC_52_RIGHT Ebov-DRC_2 - +BTB20484 16171 16193 Ebov-DRC_53_LEFT Ebov-DRC_1 + +BTB20484 16530 16553 Ebov-DRC_53_RIGHT Ebov-DRC_1 - +BTB20484 16470 16492 Ebov-DRC_54_LEFT Ebov-DRC_2 + +BTB20484 16848 16870 Ebov-DRC_54_RIGHT Ebov-DRC_2 - +BTB20484 16777 16800 Ebov-DRC_55_LEFT Ebov-DRC_1 + +BTB20484 17144 17166 Ebov-DRC_55_RIGHT Ebov-DRC_1 - +BTB20484 17085 17107 Ebov-DRC_56_LEFT Ebov-DRC_2 + +BTB20484 17447 17472 Ebov-DRC_56_RIGHT Ebov-DRC_2 - +BTB20484 17381 17406 Ebov-DRC_57_LEFT Ebov-DRC_1 + +BTB20484 17748 17772 Ebov-DRC_57_RIGHT Ebov-DRC_1 - +BTB20484 17649 17675 Ebov-DRC_58_LEFT Ebov-DRC_2 + +BTB20484 18010 18032 Ebov-DRC_58_RIGHT Ebov-DRC_2 - +BTB20484 17940 17967 Ebov-DRC_59_LEFT Ebov-DRC_1 + +BTB20484 18324 18353 Ebov-DRC_59_RIGHT Ebov-DRC_1 - +BTB20484 18209 18237 Ebov-DRC_60_LEFT Ebov-DRC_2 + +BTB20484 18561 18589 Ebov-DRC_60_RIGHT Ebov-DRC_2 - +BTB20484 18539 18565 Ebov-DRC_61_LEFT Ebov-DRC_1 + +BTB20484 18898 18920 Ebov-DRC_61_RIGHT Ebov-DRC_1 - diff --git a/test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed b/test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed index cffbba29..4284729b 100644 --- a/test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed +++ b/test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed @@ -1,196 +1,196 @@ -MN908947.3 30 54 nCoV-2019_1_LEFT nCoV-2019_1 -MN908947.3 385 410 nCoV-2019_1_RIGHT nCoV-2019_1 -MN908947.3 320 
342 nCoV-2019_2_LEFT nCoV-2019_2 -MN908947.3 704 726 nCoV-2019_2_RIGHT nCoV-2019_2 -MN908947.3 642 664 nCoV-2019_3_LEFT nCoV-2019_1 -MN908947.3 1004 1028 nCoV-2019_3_RIGHT nCoV-2019_1 -MN908947.3 943 965 nCoV-2019_4_LEFT nCoV-2019_2 -MN908947.3 1312 1337 nCoV-2019_4_RIGHT nCoV-2019_2 -MN908947.3 1242 1264 nCoV-2019_5_LEFT nCoV-2019_1 -MN908947.3 1623 1651 nCoV-2019_5_RIGHT nCoV-2019_1 -MN908947.3 1573 1595 nCoV-2019_6_LEFT nCoV-2019_2 -MN908947.3 1942 1964 nCoV-2019_6_RIGHT nCoV-2019_2 -MN908947.3 1875 1897 nCoV-2019_7_LEFT nCoV-2019_1 -MN908947.3 2247 2269 nCoV-2019_7_RIGHT nCoV-2019_1 -MN908947.3 2181 2205 nCoV-2019_8_LEFT nCoV-2019_2 -MN908947.3 2568 2592 nCoV-2019_8_RIGHT nCoV-2019_2 -MN908947.3 2505 2529 nCoV-2019_9_LEFT nCoV-2019_1 -MN908947.3 2882 2904 nCoV-2019_9_RIGHT nCoV-2019_1 -MN908947.3 2826 2850 nCoV-2019_10_LEFT nCoV-2019_2 -MN908947.3 3183 3210 nCoV-2019_10_RIGHT nCoV-2019_2 -MN908947.3 3144 3166 nCoV-2019_11_LEFT nCoV-2019_1 -MN908947.3 3507 3531 nCoV-2019_11_RIGHT nCoV-2019_1 -MN908947.3 3460 3482 nCoV-2019_12_LEFT nCoV-2019_2 -MN908947.3 3826 3853 nCoV-2019_12_RIGHT nCoV-2019_2 -MN908947.3 3771 3795 nCoV-2019_13_LEFT nCoV-2019_1 -MN908947.3 4142 4164 nCoV-2019_13_RIGHT nCoV-2019_1 -MN908947.3 4054 4077 nCoV-2019_14_LEFT nCoV-2019_2 -MN908947.3 4428 4450 nCoV-2019_14_RIGHT nCoV-2019_2 -MN908947.3 4294 4321 nCoV-2019_15_LEFT nCoV-2019_1 -MN908947.3 4674 4696 nCoV-2019_15_RIGHT nCoV-2019_1 -MN908947.3 4636 4658 nCoV-2019_16_LEFT nCoV-2019_2 -MN908947.3 4995 5017 nCoV-2019_16_RIGHT nCoV-2019_2 -MN908947.3 4939 4966 nCoV-2019_17_LEFT nCoV-2019_1 -MN908947.3 5296 5321 nCoV-2019_17_RIGHT nCoV-2019_1 -MN908947.3 5230 5259 nCoV-2019_18_LEFT nCoV-2019_2 -MN908947.3 5620 5644 nCoV-2019_18_RIGHT nCoV-2019_2 -MN908947.3 5563 5586 nCoV-2019_19_LEFT nCoV-2019_1 -MN908947.3 5932 5957 nCoV-2019_19_RIGHT nCoV-2019_1 -MN908947.3 5867 5894 nCoV-2019_20_LEFT nCoV-2019_2 -MN908947.3 6247 6272 nCoV-2019_20_RIGHT nCoV-2019_2 -MN908947.3 6167 6196 nCoV-2019_21_LEFT nCoV-2019_1 -MN908947.3 6528 6550 nCoV-2019_21_RIGHT nCoV-2019_1 -MN908947.3 6466 6495 nCoV-2019_22_LEFT nCoV-2019_2 -MN908947.3 6846 6873 nCoV-2019_22_RIGHT nCoV-2019_2 -MN908947.3 6718 6745 nCoV-2019_23_LEFT nCoV-2019_1 -MN908947.3 7092 7117 nCoV-2019_23_RIGHT nCoV-2019_1 -MN908947.3 7035 7058 nCoV-2019_24_LEFT nCoV-2019_2 -MN908947.3 7389 7415 nCoV-2019_24_RIGHT nCoV-2019_2 -MN908947.3 7305 7332 nCoV-2019_25_LEFT nCoV-2019_1 -MN908947.3 7671 7694 nCoV-2019_25_RIGHT nCoV-2019_1 -MN908947.3 7626 7651 nCoV-2019_26_LEFT nCoV-2019_2 -MN908947.3 7997 8019 nCoV-2019_26_RIGHT nCoV-2019_2 -MN908947.3 7943 7968 nCoV-2019_27_LEFT nCoV-2019_1 -MN908947.3 8319 8341 nCoV-2019_27_RIGHT nCoV-2019_1 -MN908947.3 8249 8275 nCoV-2019_28_LEFT nCoV-2019_2 -MN908947.3 8635 8661 nCoV-2019_28_RIGHT nCoV-2019_2 -MN908947.3 8595 8619 nCoV-2019_29_LEFT nCoV-2019_1 -MN908947.3 8954 8983 nCoV-2019_29_RIGHT nCoV-2019_1 -MN908947.3 8888 8913 nCoV-2019_30_LEFT nCoV-2019_2 -MN908947.3 9245 9271 nCoV-2019_30_RIGHT nCoV-2019_2 -MN908947.3 9204 9226 nCoV-2019_31_LEFT nCoV-2019_1 -MN908947.3 9557 9585 nCoV-2019_31_RIGHT nCoV-2019_1 -MN908947.3 9477 9502 nCoV-2019_32_LEFT nCoV-2019_2 -MN908947.3 9834 9858 nCoV-2019_32_RIGHT nCoV-2019_2 -MN908947.3 9784 9806 nCoV-2019_33_LEFT nCoV-2019_1 -MN908947.3 10146 10171 nCoV-2019_33_RIGHT nCoV-2019_1 -MN908947.3 10076 10099 nCoV-2019_34_LEFT nCoV-2019_2 -MN908947.3 10437 10459 nCoV-2019_34_RIGHT nCoV-2019_2 -MN908947.3 10362 10384 nCoV-2019_35_LEFT nCoV-2019_1 -MN908947.3 10737 10763 nCoV-2019_35_RIGHT nCoV-2019_1 -MN908947.3 
10666 10688 nCoV-2019_36_LEFT nCoV-2019_2 -MN908947.3 11048 11074 nCoV-2019_36_RIGHT nCoV-2019_2 -MN908947.3 10999 11022 nCoV-2019_37_LEFT nCoV-2019_1 -MN908947.3 11372 11394 nCoV-2019_37_RIGHT nCoV-2019_1 -MN908947.3 11306 11331 nCoV-2019_38_LEFT nCoV-2019_2 -MN908947.3 11668 11693 nCoV-2019_38_RIGHT nCoV-2019_2 -MN908947.3 11555 11584 nCoV-2019_39_LEFT nCoV-2019_1 -MN908947.3 11927 11949 nCoV-2019_39_RIGHT nCoV-2019_1 -MN908947.3 11863 11889 nCoV-2019_40_LEFT nCoV-2019_2 -MN908947.3 12234 12256 nCoV-2019_40_RIGHT nCoV-2019_2 -MN908947.3 12110 12133 nCoV-2019_41_LEFT nCoV-2019_1 -MN908947.3 12465 12490 nCoV-2019_41_RIGHT nCoV-2019_1 -MN908947.3 12417 12439 nCoV-2019_42_LEFT nCoV-2019_2 -MN908947.3 12779 12802 nCoV-2019_42_RIGHT nCoV-2019_2 -MN908947.3 12710 12732 nCoV-2019_43_LEFT nCoV-2019_1 -MN908947.3 13074 13096 nCoV-2019_43_RIGHT nCoV-2019_1 -MN908947.3 13005 13027 nCoV-2019_44_LEFT nCoV-2019_2 -MN908947.3 13378 13400 nCoV-2019_44_RIGHT nCoV-2019_2 -MN908947.3 13319 13344 nCoV-2019_45_LEFT nCoV-2019_1 -MN908947.3 13669 13699 nCoV-2019_45_RIGHT nCoV-2019_1 -MN908947.3 13599 13621 nCoV-2019_46_LEFT nCoV-2019_2 -MN908947.3 13962 13984 nCoV-2019_46_RIGHT nCoV-2019_2 -MN908947.3 13918 13946 nCoV-2019_47_LEFT nCoV-2019_1 -MN908947.3 14271 14299 nCoV-2019_47_RIGHT nCoV-2019_1 -MN908947.3 14207 14232 nCoV-2019_48_LEFT nCoV-2019_2 -MN908947.3 14579 14601 nCoV-2019_48_RIGHT nCoV-2019_2 -MN908947.3 14545 14570 nCoV-2019_49_LEFT nCoV-2019_1 -MN908947.3 14898 14926 nCoV-2019_49_RIGHT nCoV-2019_1 -MN908947.3 14865 14895 nCoV-2019_50_LEFT nCoV-2019_2 -MN908947.3 15224 15246 nCoV-2019_50_RIGHT nCoV-2019_2 -MN908947.3 15171 15193 nCoV-2019_51_LEFT nCoV-2019_1 -MN908947.3 15538 15560 nCoV-2019_51_RIGHT nCoV-2019_1 -MN908947.3 15481 15503 nCoV-2019_52_LEFT nCoV-2019_2 -MN908947.3 15861 15886 nCoV-2019_52_RIGHT nCoV-2019_2 -MN908947.3 15827 15851 nCoV-2019_53_LEFT nCoV-2019_1 -MN908947.3 16186 16209 nCoV-2019_53_RIGHT nCoV-2019_1 -MN908947.3 16118 16144 nCoV-2019_54_LEFT nCoV-2019_2 -MN908947.3 16485 16510 nCoV-2019_54_RIGHT nCoV-2019_2 -MN908947.3 16416 16444 nCoV-2019_55_LEFT nCoV-2019_1 -MN908947.3 16804 16833 nCoV-2019_55_RIGHT nCoV-2019_1 -MN908947.3 16748 16770 nCoV-2019_56_LEFT nCoV-2019_2 -MN908947.3 17130 17152 nCoV-2019_56_RIGHT nCoV-2019_2 -MN908947.3 17065 17087 nCoV-2019_57_LEFT nCoV-2019_1 -MN908947.3 17430 17452 nCoV-2019_57_RIGHT nCoV-2019_1 -MN908947.3 17381 17406 nCoV-2019_58_LEFT nCoV-2019_2 -MN908947.3 17738 17761 nCoV-2019_58_RIGHT nCoV-2019_2 -MN908947.3 17674 17697 nCoV-2019_59_LEFT nCoV-2019_1 -MN908947.3 18036 18062 nCoV-2019_59_RIGHT nCoV-2019_1 -MN908947.3 17966 17993 nCoV-2019_60_LEFT nCoV-2019_2 -MN908947.3 18324 18348 nCoV-2019_60_RIGHT nCoV-2019_2 -MN908947.3 18253 18275 nCoV-2019_61_LEFT nCoV-2019_1 -MN908947.3 18650 18672 nCoV-2019_61_RIGHT nCoV-2019_1 -MN908947.3 18596 18618 nCoV-2019_62_LEFT nCoV-2019_2 -MN908947.3 18957 18979 nCoV-2019_62_RIGHT nCoV-2019_2 -MN908947.3 18896 18918 nCoV-2019_63_LEFT nCoV-2019_1 -MN908947.3 19275 19297 nCoV-2019_63_RIGHT nCoV-2019_1 -MN908947.3 19204 19232 nCoV-2019_64_LEFT nCoV-2019_2 -MN908947.3 19591 19616 nCoV-2019_64_RIGHT nCoV-2019_2 -MN908947.3 19548 19570 nCoV-2019_65_LEFT nCoV-2019_1 -MN908947.3 19911 19939 nCoV-2019_65_RIGHT nCoV-2019_1 -MN908947.3 19844 19866 nCoV-2019_66_LEFT nCoV-2019_2 -MN908947.3 20231 20255 nCoV-2019_66_RIGHT nCoV-2019_2 -MN908947.3 20172 20200 nCoV-2019_67_LEFT nCoV-2019_1 -MN908947.3 20542 20572 nCoV-2019_67_RIGHT nCoV-2019_1 -MN908947.3 20472 20496 nCoV-2019_68_LEFT nCoV-2019_2 -MN908947.3 20867 20890 
nCoV-2019_68_RIGHT nCoV-2019_2 -MN908947.3 20786 20813 nCoV-2019_69_LEFT nCoV-2019_1 -MN908947.3 21146 21169 nCoV-2019_69_RIGHT nCoV-2019_1 -MN908947.3 21075 21104 nCoV-2019_70_LEFT nCoV-2019_2 -MN908947.3 21427 21455 nCoV-2019_70_RIGHT nCoV-2019_2 -MN908947.3 21357 21386 nCoV-2019_71_LEFT nCoV-2019_1 -MN908947.3 21716 21743 nCoV-2019_71_RIGHT nCoV-2019_1 -MN908947.3 21658 21682 nCoV-2019_72_LEFT nCoV-2019_2 -MN908947.3 22013 22038 nCoV-2019_72_RIGHT nCoV-2019_2 -MN908947.3 21961 21990 nCoV-2019_73_LEFT nCoV-2019_1 -MN908947.3 22324 22346 nCoV-2019_73_RIGHT nCoV-2019_1 -MN908947.3 22262 22290 nCoV-2019_74_LEFT nCoV-2019_2 -MN908947.3 22626 22650 nCoV-2019_74_RIGHT nCoV-2019_2 -MN908947.3 22516 22542 nCoV-2019_75_LEFT nCoV-2019_1 -MN908947.3 22877 22903 nCoV-2019_75_RIGHT nCoV-2019_1 -MN908947.3 22797 22819 nCoV-2019_76_LEFT nCoV-2019_2 -MN908947.3 23192 23214 nCoV-2019_76_RIGHT nCoV-2019_2 -MN908947.3 23122 23144 nCoV-2019_77_LEFT nCoV-2019_1 -MN908947.3 23500 23522 nCoV-2019_77_RIGHT nCoV-2019_1 -MN908947.3 23443 23466 nCoV-2019_78_LEFT nCoV-2019_2 -MN908947.3 23822 23847 nCoV-2019_78_RIGHT nCoV-2019_2 -MN908947.3 23789 23812 nCoV-2019_79_LEFT nCoV-2019_1 -MN908947.3 24145 24169 nCoV-2019_79_RIGHT nCoV-2019_1 -MN908947.3 24078 24100 nCoV-2019_80_LEFT nCoV-2019_2 -MN908947.3 24443 24467 nCoV-2019_80_RIGHT nCoV-2019_2 -MN908947.3 24391 24416 nCoV-2019_81_LEFT nCoV-2019_1 -MN908947.3 24765 24789 nCoV-2019_81_RIGHT nCoV-2019_1 -MN908947.3 24696 24721 nCoV-2019_82_LEFT nCoV-2019_2 -MN908947.3 25052 25076 nCoV-2019_82_RIGHT nCoV-2019_2 -MN908947.3 24978 25003 nCoV-2019_83_LEFT nCoV-2019_1 -MN908947.3 25347 25369 nCoV-2019_83_RIGHT nCoV-2019_1 -MN908947.3 25279 25301 nCoV-2019_84_LEFT nCoV-2019_2 -MN908947.3 25646 25673 nCoV-2019_84_RIGHT nCoV-2019_2 -MN908947.3 25601 25623 nCoV-2019_85_LEFT nCoV-2019_1 -MN908947.3 25969 25994 nCoV-2019_85_RIGHT nCoV-2019_1 -MN908947.3 25902 25924 nCoV-2019_86_LEFT nCoV-2019_2 -MN908947.3 26290 26315 nCoV-2019_86_RIGHT nCoV-2019_2 -MN908947.3 26197 26219 nCoV-2019_87_LEFT nCoV-2019_1 -MN908947.3 26566 26590 nCoV-2019_87_RIGHT nCoV-2019_1 -MN908947.3 26520 26542 nCoV-2019_88_LEFT nCoV-2019_2 -MN908947.3 26890 26913 nCoV-2019_88_RIGHT nCoV-2019_2 -MN908947.3 26835 26857 nCoV-2019_89_LEFT nCoV-2019_1 -MN908947.3 27202 27227 nCoV-2019_89_RIGHT nCoV-2019_1 -MN908947.3 27141 27164 nCoV-2019_90_LEFT nCoV-2019_2 -MN908947.3 27511 27533 nCoV-2019_90_RIGHT nCoV-2019_2 -MN908947.3 27446 27471 nCoV-2019_91_LEFT nCoV-2019_1 -MN908947.3 27825 27854 nCoV-2019_91_RIGHT nCoV-2019_1 -MN908947.3 27784 27808 nCoV-2019_92_LEFT nCoV-2019_2 -MN908947.3 28145 28172 nCoV-2019_92_RIGHT nCoV-2019_2 -MN908947.3 28081 28104 nCoV-2019_93_LEFT nCoV-2019_1 -MN908947.3 28442 28464 nCoV-2019_93_RIGHT nCoV-2019_1 -MN908947.3 28394 28416 nCoV-2019_94_LEFT nCoV-2019_2 -MN908947.3 28756 28779 nCoV-2019_94_RIGHT nCoV-2019_2 -MN908947.3 28677 28699 nCoV-2019_95_LEFT nCoV-2019_1 -MN908947.3 29041 29063 nCoV-2019_95_RIGHT nCoV-2019_1 -MN908947.3 28985 29007 nCoV-2019_96_LEFT nCoV-2019_2 -MN908947.3 29356 29378 nCoV-2019_96_RIGHT nCoV-2019_2 -MN908947.3 29288 29316 nCoV-2019_97_LEFT nCoV-2019_1 -MN908947.3 29665 29693 nCoV-2019_97_RIGHT nCoV-2019_1 -MN908947.3 29486 29510 nCoV-2019_98_LEFT nCoV-2019_2 -MN908947.3 29836 29866 nCoV-2019_98_RIGHT nCoV-2019_2 +MN908947.3 30 54 nCoV-2019_1_LEFT_1 1 + ACCAACCAACTTTCGATCTCTTGT +MN908947.3 385 410 nCoV-2019_1_RIGHT_1 1 - CATCTTTAAGATGTTGACGTGCCTC +MN908947.3 320 342 nCoV-2019_2_LEFT_1 2 + CTGTTTTACAGGTTCGCGACGT +MN908947.3 704 726 nCoV-2019_2_RIGHT_1 2 - 
TAAGGATCAGTGCCAAGCTCGT +MN908947.3 642 664 nCoV-2019_3_LEFT_1 1 + CGGTAATAAAGGAGCTGGTGGC +MN908947.3 1004 1028 nCoV-2019_3_RIGHT_1 1 - AAGGTGTCTGCAATTCATAGCTCT +MN908947.3 943 965 nCoV-2019_4_LEFT_1 2 + GGTGTATACTGCTGCCGTGAAC +MN908947.3 1312 1337 nCoV-2019_4_RIGHT_1 2 - CACAAGTAGTGGCACCTTCTTTAGT +MN908947.3 1242 1264 nCoV-2019_5_LEFT_1 1 + TGGTGAAACTTCATGGCAGACG +MN908947.3 1623 1651 nCoV-2019_5_RIGHT_1 1 - ATTGATGTTGACTTTCTCTTTTTGGAGT +MN908947.3 1573 1595 nCoV-2019_6_LEFT_1 2 + GGTGTTGTTGGAGAAGGTTCCG +MN908947.3 1942 1964 nCoV-2019_6_RIGHT_1 2 - TAGCGGCCTTCTGTAAAACACG +MN908947.3 1875 1897 nCoV-2019_7_LEFT_1 1 + ATCAGAGGCTGCTCGTGTTGTA +MN908947.3 2247 2269 nCoV-2019_7_RIGHT_1 1 - TGCACAGGTGACAATTTGTCCA +MN908947.3 2181 2205 nCoV-2019_8_LEFT_1 2 + AGAGTTTCTTAGAGACGGTTGGGA +MN908947.3 2568 2592 nCoV-2019_8_RIGHT_1 2 - GCTTCAACAGCTTCACTAGTAGGT +MN908947.3 2505 2529 nCoV-2019_9_LEFT_1 1 + TCCCACAGAAGTGTTAACAGAGGA +MN908947.3 2882 2904 nCoV-2019_9_RIGHT_1 1 - ATGACAGCATCTGCCACAACAC +MN908947.3 2826 2850 nCoV-2019_10_LEFT_1 2 + TGAGAAGTGCTCTGCCTATACAGT +MN908947.3 3183 3210 nCoV-2019_10_RIGHT_1 2 - TCATCTAACCAATCTTCTTCTTGCTCT +MN908947.3 3144 3166 nCoV-2019_11_LEFT_1 1 + GGAATTTGGTGCCACTTCTGCT +MN908947.3 3507 3531 nCoV-2019_11_RIGHT_1 1 - TCATCAGATTCAACTTGCATGGCA +MN908947.3 3460 3482 nCoV-2019_12_LEFT_1 2 + AAACATGGAGGAGGTGTTGCAG +MN908947.3 3826 3853 nCoV-2019_12_RIGHT_1 2 - TTCACTCTTCATTTCCAAAAAGCTTGA +MN908947.3 3771 3795 nCoV-2019_13_LEFT_1 1 + TCGCACAAATGTCTACTTAGCTGT +MN908947.3 4142 4164 nCoV-2019_13_RIGHT_1 1 - ACCACAGCAGTTAAAACACCCT +MN908947.3 4054 4077 nCoV-2019_14_LEFT_1 2 + CATCCAGATTCTGCCACTCTTGT +MN908947.3 4428 4450 nCoV-2019_14_RIGHT_1 2 - AGTTTCCACACAGACAGGCATT +MN908947.3 4294 4321 nCoV-2019_15_LEFT_1 1 + ACAGTGCTTAAAAAGTGTAAAAGTGCC +MN908947.3 4674 4696 nCoV-2019_15_RIGHT_1 1 - AACAGAAACTGTAGCTGGCACT +MN908947.3 4636 4658 nCoV-2019_16_LEFT_1 2 + AATTTGGAAGAAGCTGCTCGGT +MN908947.3 4995 5017 nCoV-2019_16_RIGHT_1 2 - CACAACTTGCGTGTGGAGGTTA +MN908947.3 4939 4966 nCoV-2019_17_LEFT_1 1 + CTTCTTTCTTTGAGAGAAGTGAGGACT +MN908947.3 5296 5321 nCoV-2019_17_RIGHT_1 1 - TTTGTTGGAGTGTTAACAATGCAGT +MN908947.3 5230 5259 nCoV-2019_18_LEFT_1 2 + TGGAAATACCCACAAGTTAATGGTTTAAC +MN908947.3 5620 5644 nCoV-2019_18_RIGHT_1 2 - AGCTTGTTTACCACACGTACAAGG +MN908947.3 5563 5586 nCoV-2019_19_LEFT_1 1 + GCTGTTATGTACATGGGCACACT +MN908947.3 5932 5957 nCoV-2019_19_RIGHT_1 1 - TGTCCAACTTAGGGTCAATTTCTGT +MN908947.3 5867 5894 nCoV-2019_20_LEFT_1 2 + ACAAAGAAAACAGTTACACAACAACCA +MN908947.3 6247 6272 nCoV-2019_20_RIGHT_1 2 - ACGTGGCTTTATTAGTTGCATTGTT +MN908947.3 6167 6196 nCoV-2019_21_LEFT_1 1 + TGGCTATTGATTATAAACACTACACACCC +MN908947.3 6528 6550 nCoV-2019_21_RIGHT_1 1 - TAGATCTGTGTGGCCAACCTCT +MN908947.3 6466 6495 nCoV-2019_22_LEFT_1 2 + ACTACCGAAGTTGTAGGAGACATTATACT +MN908947.3 6846 6873 nCoV-2019_22_RIGHT_1 2 - ACAGTATTCTTTGCTATAGTAGTCGGC +MN908947.3 6718 6745 nCoV-2019_23_LEFT_1 1 + ACAACTACTAACATAGTTACACGGTGT +MN908947.3 7092 7117 nCoV-2019_23_RIGHT_1 1 - ACCAGTACAGTAGGTTGCAATAGTG +MN908947.3 7035 7058 nCoV-2019_24_LEFT_1 2 + AGGCATGCCTTCTTACTGTACTG +MN908947.3 7389 7415 nCoV-2019_24_RIGHT_1 2 - ACATTCTAACCATAGCTGAAATCGGG +MN908947.3 7305 7332 nCoV-2019_25_LEFT_1 1 + GCAATTGTTTTTCAGCTATTTTGCAGT +MN908947.3 7671 7694 nCoV-2019_25_RIGHT_1 1 - ACTGTAGTGACAAGTCTCTCGCA +MN908947.3 7626 7651 nCoV-2019_26_LEFT_1 2 + TTGTGATACATTCTGTGCTGGTAGT +MN908947.3 7997 8019 nCoV-2019_26_RIGHT_1 2 - TCCGCACTATCACCAACATCAG +MN908947.3 7943 7968 nCoV-2019_27_LEFT_1 1 + ACTACAGTCAGCTTATGTGTCAACC +MN908947.3 8319 8341 
nCoV-2019_27_RIGHT_1 1 - AATACAAGCACCAAGGTCACGG +MN908947.3 8249 8275 nCoV-2019_28_LEFT_1 2 + ACATAGAAGTTACTGGCGATAGTTGT +MN908947.3 8635 8661 nCoV-2019_28_RIGHT_1 2 - TGTTTAGACATGACATGAACAGGTGT +MN908947.3 8595 8619 nCoV-2019_29_LEFT_1 1 + ACTTGTGTTCCTTTTTGTTGCTGC +MN908947.3 8954 8983 nCoV-2019_29_RIGHT_1 1 - AGTGTACTCTATAAGTTTTGATGGTGTGT +MN908947.3 8888 8913 nCoV-2019_30_LEFT_1 2 + GCACAACTAATGGTGACTTTTTGCA +MN908947.3 9245 9271 nCoV-2019_30_RIGHT_1 2 - ACCACTAGTAGATACACAAACACCAG +MN908947.3 9204 9226 nCoV-2019_31_LEFT_1 1 + TTCTGAGTACTGTAGGCACGGC +MN908947.3 9557 9585 nCoV-2019_31_RIGHT_1 1 - ACAGAATAAACACCAGGTAAGAATGAGT +MN908947.3 9477 9502 nCoV-2019_32_LEFT_1 2 + TGGTGAATACAGTCATGTAGTTGCC +MN908947.3 9834 9858 nCoV-2019_32_RIGHT_1 2 - AGCACATCACTACGCAACTTTAGA +MN908947.3 9784 9806 nCoV-2019_33_LEFT_1 1 + ACTTTTGAAGAAGCTGCGCTGT +MN908947.3 10146 10171 nCoV-2019_33_RIGHT_1 1 - TGGACAGTAAACTACGTCATCAAGC +MN908947.3 10076 10099 nCoV-2019_34_LEFT_1 2 + TCCCATCTGGTAAAGTTGAGGGT +MN908947.3 10437 10459 nCoV-2019_34_RIGHT_1 2 - AGTGAAATTGGGCCTCATAGCA +MN908947.3 10362 10384 nCoV-2019_35_LEFT_1 1 + TGTTCGCATTCAACCAGGACAG +MN908947.3 10737 10763 nCoV-2019_35_RIGHT_1 1 - ACTTCATAGCCACAAGGTTAAAGTCA +MN908947.3 10666 10688 nCoV-2019_36_LEFT_1 2 + TTAGCTTGGTTGTACGCTGCTG +MN908947.3 11048 11074 nCoV-2019_36_RIGHT_1 2 - GAACAAAGACCATTGAGTACTCTGGA +MN908947.3 10999 11022 nCoV-2019_37_LEFT_1 1 + ACACACCACTGGTTGTTACTCAC +MN908947.3 11372 11394 nCoV-2019_37_RIGHT_1 1 - GTCCACACTCTCCTAGCACCAT +MN908947.3 11306 11331 nCoV-2019_38_LEFT_1 2 + ACTGTGTTATGTATGCATCAGCTGT +MN908947.3 11668 11693 nCoV-2019_38_RIGHT_1 2 - CACCAAGAGTCAGTCTAAAGTAGCG +MN908947.3 11555 11584 nCoV-2019_39_LEFT_1 1 + AGTATTGCCCTATTTTCTTCATAACTGGT +MN908947.3 11927 11949 nCoV-2019_39_RIGHT_1 1 - TGTAACTGGACACATTGAGCCC +MN908947.3 11863 11889 nCoV-2019_40_LEFT_1 2 + TGCACATCAGTAGTCTTACTCTCAGT +MN908947.3 12234 12256 nCoV-2019_40_RIGHT_1 2 - CATGGCTGCATCACGGTCAAAT +MN908947.3 12110 12133 nCoV-2019_41_LEFT_1 1 + GTTCCCTTCCATCATATGCAGCT +MN908947.3 12465 12490 nCoV-2019_41_RIGHT_1 1 - TGGTATGACAACCATTAGTTTGGCT +MN908947.3 12417 12439 nCoV-2019_42_LEFT_1 2 + TGCAAGAGATGGTTGTGTTCCC +MN908947.3 12779 12802 nCoV-2019_42_RIGHT_1 2 - CCTACCTCCCTTTGTTGTGTTGT +MN908947.3 12710 12732 nCoV-2019_43_LEFT_1 1 + TACGACAGATGTCTTGTGCTGC +MN908947.3 13074 13096 nCoV-2019_43_RIGHT_1 1 - AGCAGCATCTACAGCAAAAGCA +MN908947.3 13005 13027 nCoV-2019_44_LEFT_1 2 + TGCCACAGTACGTCTACAAGCT +MN908947.3 13378 13400 nCoV-2019_44_RIGHT_1 2 - AACCTTTCCACATACCGCAGAC +MN908947.3 13319 13344 nCoV-2019_45_LEFT_1 1 + TACCTACAACTTGTGCTAATGACCC +MN908947.3 13669 13699 nCoV-2019_45_RIGHT_1 1 - AAATTGTTTCTTCATGTTGGTAGTTAGAGA +MN908947.3 13599 13621 nCoV-2019_46_LEFT_1 2 + TGTCGCTTCCAAGAAAAGGACG +MN908947.3 13962 13984 nCoV-2019_46_RIGHT_1 2 - CACGTTCACCTAAGTTGGCGTA +MN908947.3 13918 13946 nCoV-2019_47_LEFT_1 1 + AGGACTGGTATGATTTTGTAGAAAACCC +MN908947.3 14271 14299 nCoV-2019_47_RIGHT_1 1 - AATAACGGTCAAAGAGTTTTAACCTCTC +MN908947.3 14207 14232 nCoV-2019_48_LEFT_1 2 + TGTTGACACTGACTTAACAAAGCCT +MN908947.3 14579 14601 nCoV-2019_48_RIGHT_1 2 - TAGATTACCAGAAGCAGCGTGC +MN908947.3 14545 14570 nCoV-2019_49_LEFT_1 1 + AGGAATTACTTGTGTATGCTGCTGA +MN908947.3 14898 14926 nCoV-2019_49_RIGHT_1 1 - TGACGATGACTTGGTTAGCATTAATACA +MN908947.3 14865 14895 nCoV-2019_50_LEFT_1 2 + GTTGATAAGTACTTTGATTGTTACGATGGT +MN908947.3 15224 15246 nCoV-2019_50_RIGHT_1 2 - TAACATGTTGTGCCAACCACCA +MN908947.3 15171 15193 nCoV-2019_51_LEFT_1 1 + TCAATAGCCGCCACTAGAGGAG +MN908947.3 15538 15560 nCoV-2019_51_RIGHT_1 1 - 
AGTGCATTAACATTGGCCGTGA +MN908947.3 15481 15503 nCoV-2019_52_LEFT_1 2 + CATCAGGAGATGCCACAACTGC +MN908947.3 15861 15886 nCoV-2019_52_RIGHT_1 2 - GTTGAGAGCAAAATTCATGAGGTCC +MN908947.3 15827 15851 nCoV-2019_53_LEFT_1 1 + AGCAAAATGTTGGACTGAGACTGA +MN908947.3 16186 16209 nCoV-2019_53_RIGHT_1 1 - AGCCTCATAAAACTCAGGTTCCC +MN908947.3 16118 16144 nCoV-2019_54_LEFT_1 2 + TGAGTTAACAGGACACATGTTAGACA +MN908947.3 16485 16510 nCoV-2019_54_RIGHT_1 2 - AACCAAAAACTTGTCCATTAGCACA +MN908947.3 16416 16444 nCoV-2019_55_LEFT_1 1 + ACTCAACTTTACTTAGGAGGTATGAGCT +MN908947.3 16804 16833 nCoV-2019_55_RIGHT_1 1 - GGTGTACTCTCCTATTTGTACTTTACTGT +MN908947.3 16748 16770 nCoV-2019_56_LEFT_1 2 + ACCTAGACCACCACTTAACCGA +MN908947.3 17130 17152 nCoV-2019_56_RIGHT_1 2 - ACACTATGCGAGCAGAAGGGTA +MN908947.3 17065 17087 nCoV-2019_57_LEFT_1 1 + ATTCTACACTCCAGGGACCACC +MN908947.3 17430 17452 nCoV-2019_57_RIGHT_1 1 - GTAATTGAGCAGGGTCGCCAAT +MN908947.3 17381 17406 nCoV-2019_58_LEFT_1 2 + TGATTTGAGTGTTGTCAATGCCAGA +MN908947.3 17738 17761 nCoV-2019_58_RIGHT_1 2 - CTTTTCTCCAAGCAGGGTTACGT +MN908947.3 17674 17697 nCoV-2019_59_LEFT_1 1 + TCACGCATGATGTTTCATCTGCA +MN908947.3 18036 18062 nCoV-2019_59_RIGHT_1 1 - AAGAGTCCTGTTACATTTTCAGCTTG +MN908947.3 17966 17993 nCoV-2019_60_LEFT_1 2 + TGATAGAGACCTTTATGACAAGTTGCA +MN908947.3 18324 18348 nCoV-2019_60_RIGHT_1 2 - GGTACCAACAGCTTCTCTAGTAGC +MN908947.3 18253 18275 nCoV-2019_61_LEFT_1 1 + TGTTTATCACCCGCGAAGAAGC +MN908947.3 18650 18672 nCoV-2019_61_RIGHT_1 1 - ATCACATAGACAACAGGTGCGC +MN908947.3 18596 18618 nCoV-2019_62_LEFT_1 2 + GGCACATGGCTTTGAGTTGACA +MN908947.3 18957 18979 nCoV-2019_62_RIGHT_1 2 - GTTGAACCTTTCTACAAGCCGC +MN908947.3 18896 18918 nCoV-2019_63_LEFT_1 1 + TGTTAAGCGTGTTGACTGGACT +MN908947.3 19275 19297 nCoV-2019_63_RIGHT_1 1 - ACAAACTGCCACCATCACAACC +MN908947.3 19204 19232 nCoV-2019_64_LEFT_1 2 + TCGATAGATATCCTGCTAATTCCATTGT +MN908947.3 19591 19616 nCoV-2019_64_RIGHT_1 2 - AGTCTTGTAAAAGTGTTCCAGAGGT +MN908947.3 19548 19570 nCoV-2019_65_LEFT_1 1 + GCTGGCTTTAGCTTGTGGGTTT +MN908947.3 19911 19939 nCoV-2019_65_RIGHT_1 1 - TGTCAGTCATAGAACAAACACCAATAGT +MN908947.3 19844 19866 nCoV-2019_66_LEFT_1 2 + GGGTGTGGACATTGCTGCTAAT +MN908947.3 20231 20255 nCoV-2019_66_RIGHT_1 2 - TCAATTTCCATTTGACTCCTGGGT +MN908947.3 20172 20200 nCoV-2019_67_LEFT_1 1 + GTTGTCCAACAATTACCTGAAACTTACT +MN908947.3 20542 20572 nCoV-2019_67_RIGHT_1 1 - CAACCTTAGAAACTACAGATAAATCTTGGG +MN908947.3 20472 20496 nCoV-2019_68_LEFT_1 2 + ACAGGTTCATCTAAGTGTGTGTGT +MN908947.3 20867 20890 nCoV-2019_68_RIGHT_1 2 - CTCCTTTATCAGAACCAGCACCA +MN908947.3 20786 20813 nCoV-2019_69_LEFT_1 1 + TGTCGCAAAATATACTCAACTGTGTCA +MN908947.3 21146 21169 nCoV-2019_69_RIGHT_1 1 - TCTTTATAGCCACGGAACCTCCA +MN908947.3 21075 21104 nCoV-2019_70_LEFT_1 2 + ACAAAAGAAAATGACTCTAAAGAGGGTTT +MN908947.3 21427 21455 nCoV-2019_70_RIGHT_1 2 - TGACCTTCTTTTAAAGACATAACAGCAG +MN908947.3 21357 21386 nCoV-2019_71_LEFT_1 1 + ACAAATCCAATTCAGTTGTCTTCCTATTC +MN908947.3 21716 21743 nCoV-2019_71_RIGHT_1 1 - TGGAAAAGAAAGGTAAGAACAAGTCCT +MN908947.3 21658 21682 nCoV-2019_72_LEFT_1 2 + ACACGTGGTGTTTATTACCCTGAC +MN908947.3 22013 22038 nCoV-2019_72_RIGHT_1 2 - ACTCTGAACTCACTTTCCATCCAAC +MN908947.3 21961 21990 nCoV-2019_73_LEFT_1 1 + CAATTTTGTAATGATCCATTTTTGGGTGT +MN908947.3 22324 22346 nCoV-2019_73_RIGHT_1 1 - CACCAGCTGTCCAACCTGAAGA +MN908947.3 22262 22290 nCoV-2019_74_LEFT_1 2 + ACATCACTAGGTTTCAAACTTTACTTGC +MN908947.3 22626 22650 nCoV-2019_74_RIGHT_1 2 - GCAACACAGTTGCTGATTCTCTTC +MN908947.3 22516 22542 nCoV-2019_75_LEFT_1 1 + AGAGTCCAACCAACAGAATCTATTGT +MN908947.3 22877 22903 
nCoV-2019_75_RIGHT_1 1 - ACCACCAACCTTAGAATCAAGATTGT +MN908947.3 22797 22819 nCoV-2019_76_LEFT_1 2 + AGGGCAAACTGGAAAGATTGCT +MN908947.3 23192 23214 nCoV-2019_76_RIGHT_1 2 - ACACCTGTGCCTGTTAAACCAT +MN908947.3 23122 23144 nCoV-2019_77_LEFT_1 1 + CCAGCAACTGTTTGTGGACCTA +MN908947.3 23500 23522 nCoV-2019_77_RIGHT_1 1 - CAGCCCCTATTAAACAGCCTGC +MN908947.3 23443 23466 nCoV-2019_78_LEFT_1 2 + CAACTTACTCCTACTTGGCGTGT +MN908947.3 23822 23847 nCoV-2019_78_RIGHT_1 2 - TGTGTACAAAAACTGCCATATTGCA +MN908947.3 23789 23812 nCoV-2019_79_LEFT_1 1 + GTGGTGATTCAACTGAATGCAGC +MN908947.3 24145 24169 nCoV-2019_79_RIGHT_1 1 - CATTTCATCTGTGAGCAAAGGTGG +MN908947.3 24078 24100 nCoV-2019_80_LEFT_1 2 + TTGCCTTGGTGATATTGCTGCT +MN908947.3 24443 24467 nCoV-2019_80_RIGHT_1 2 - TGGAGCTAAGTTGTTTAACAAGCG +MN908947.3 24391 24416 nCoV-2019_81_LEFT_1 1 + GCACTTGGAAAACTTCAAGATGTGG +MN908947.3 24765 24789 nCoV-2019_81_RIGHT_1 1 - GTGAAGTTCTTTTCTTGTGCAGGG +MN908947.3 24696 24721 nCoV-2019_82_LEFT_1 2 + GGGCTATCATCTTATGTCCTTCCCT +MN908947.3 25052 25076 nCoV-2019_82_RIGHT_1 2 - TGCCAGAGATGTCACCTAAATCAA +MN908947.3 24978 25003 nCoV-2019_83_LEFT_1 1 + TCCTTTGCAACCTGAATTAGACTCA +MN908947.3 25347 25369 nCoV-2019_83_RIGHT_1 1 - TTTGACTCCTTTGAGCACTGGC +MN908947.3 25279 25301 nCoV-2019_84_LEFT_1 2 + TGCTGTAGTTGTCTCAAGGGCT +MN908947.3 25646 25673 nCoV-2019_84_RIGHT_1 2 - AGGTGTGAGTAAACTGTTACAAACAAC +MN908947.3 25601 25623 nCoV-2019_85_LEFT_1 1 + ACTAGCACTCTCCAAGGGTGTT +MN908947.3 25969 25994 nCoV-2019_85_RIGHT_1 1 - ACACAGTCTTTTACTCCAGATTCCC +MN908947.3 25902 25924 nCoV-2019_86_LEFT_1 2 + TCAGGTGATGGCACAACAAGTC +MN908947.3 26290 26315 nCoV-2019_86_RIGHT_1 2 - ACGAAAGCAAGAAAAAGAAGTACGC +MN908947.3 26197 26219 nCoV-2019_87_LEFT_1 1 + CGACTACTAGCGTGCCTTTGTA +MN908947.3 26566 26590 nCoV-2019_87_RIGHT_1 1 - ACTAGGTTCCATTGTTCAAGGAGC +MN908947.3 26520 26542 nCoV-2019_88_LEFT_1 2 + CCATGGCAGATTCCAACGGTAC +MN908947.3 26890 26913 nCoV-2019_88_RIGHT_1 2 - TGGTCAGAATAGTGCCATGGAGT +MN908947.3 26835 26857 nCoV-2019_89_LEFT_1 1 + GTACGCGTTCCATGTGGTCATT +MN908947.3 27202 27227 nCoV-2019_89_RIGHT_1 1 - ACCTGAAAGTCAACGAGATGAAACA +MN908947.3 27141 27164 nCoV-2019_90_LEFT_1 2 + ACACAGACCATTCCAGTAGCAGT +MN908947.3 27511 27533 nCoV-2019_90_RIGHT_1 2 - TGAAATGGTGAATTGCCCTCGT +MN908947.3 27446 27471 nCoV-2019_91_LEFT_1 1 + TCACTACCAAGAGTGTGTTAGAGGT +MN908947.3 27825 27854 nCoV-2019_91_RIGHT_1 1 - TTCAAGTGAGAACCAAAAGATAATAAGCA +MN908947.3 27784 27808 nCoV-2019_92_LEFT_1 2 + TTTGTGCTTTTTAGCCTTTCTGCT +MN908947.3 28145 28172 nCoV-2019_92_RIGHT_1 2 - AGGTTCCTGGCAATTAATTGTAAAAGG +MN908947.3 28081 28104 nCoV-2019_93_LEFT_1 1 + TGAGGCTGGTTCTAAATCACCCA +MN908947.3 28442 28464 nCoV-2019_93_RIGHT_1 1 - AGGTCTTCCTTGCCATGTTGAG +MN908947.3 28394 28416 nCoV-2019_94_LEFT_1 2 + GGCCCCAAGGTTTACCCAATAA +MN908947.3 28756 28779 nCoV-2019_94_RIGHT_1 2 - TTTGGCAATGTTGTTCCTTGAGG +MN908947.3 28677 28699 nCoV-2019_95_LEFT_1 1 + TGAGGGAGCCTTGAATACACCA +MN908947.3 29041 29063 nCoV-2019_95_RIGHT_1 1 - CAGTACGTTTTTGCCGAGGCTT +MN908947.3 28985 29007 nCoV-2019_96_LEFT_1 2 + GCCAACAACAACAAGGCCAAAC +MN908947.3 29356 29378 nCoV-2019_96_RIGHT_1 2 - TAGGCTCTGTTGGTGGGAATGT +MN908947.3 29288 29316 nCoV-2019_97_LEFT_1 1 + TGGATGACAAAGATCCAAATTTCAAAGA +MN908947.3 29665 29693 nCoV-2019_97_RIGHT_1 1 - ACACACTGATTAAAGATTGCTATGTGAG +MN908947.3 29486 29510 nCoV-2019_98_LEFT_1 2 + AACAATTGCAACAATCCATGAGCA +MN908947.3 29836 29866 nCoV-2019_98_RIGHT_1 2 - TTCTCCTAAGAAGCTATTAAAATCACATGG \ No newline at end of file diff --git a/test-runner.sh b/test-runner.sh index 802c9ba0..044083b7 100755 --- a/test-runner.sh +++ 
b/test-runner.sh @@ -7,7 +7,7 @@ set -e # full data available: http://artic.s3.climb.ac.uk/run-folders/EBOV_Amplicons_flongle.tar.gz # # usage: -# ./test-runner.sh [medaka|nanopolish] +# ./test-runner.sh [medaka|clair3] # # specify either medaka or nanopolish to run the respective workflow of the pipeline # @@ -15,65 +15,44 @@ set -e # Setup the data, commands and the testing function. # data -inputData="./20190830_1509_MN22126_AAQ411_9efc5448" +inputData="./20190830_1509_MN22126_AAQ411_9efc5448_barcoded" primerSchemes="../test-data/primer-schemes" primerScheme="IturiEBOV/V1" prefix="ebov-mayinga" +bed="../test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.scheme.bed" +ref="../test-data/primer-schemes/IturiEBOV/V1/IturiEBOV.reference.fasta" barcode="03" threads=2 -downloadCmd="wget http://artic.s3.climb.ac.uk/run-folders/EBOV_Amplicons_flongle.tar.gz" -extractCmd="tar -vxzf EBOV_Amplicons_flongle.tar.gz" +downloadCmd="wget https://loman-labz-public-datasets.s3.climb.ac.uk/EBOV_Amplicons_flongle_barcoded.tar.gz" +extractCmd="tar -vxzf EBOV_Amplicons_flongle_barcoded.tar.gz" -# pipeline commands -## nanopolish workflow specific -gatherCmd_n="artic gather \ - --min-length 400 \ - --max-length 800 \ - --prefix ${prefix} \ - --directory ${inputData} \ - --fast5-directory ${inputData}/fast5_pass" - -demuxCmd_n="artic demultiplex \ - --threads ${threads} \ - ${prefix}_fastq_pass.fastq" - -minionCmd_n="artic minion \ - --normalise 200 \ - --threads ${threads} \ - --scheme-directory ${primerSchemes} \ - --read-file ${prefix}_fastq_pass-NB${barcode}.fastq \ - --fast5-directory ${inputData}/fast5_pass \ - --sequencing-summary ${inputData}/lab-on-an-ssd_20190830_160932_AAQ411_minion_sequencing_run_EBOV_Amplicons_flongle_sequencing_summary.txt \ - ${primerScheme} \ - ${prefix}" - -## medaka workflow specific -gatherCmd_m="artic gather \ - --min-length 400 \ - --max-length 800 \ - --prefix ${prefix} \ - --directory ${inputData} \ - --no-fast5s" -demuxCmd_m="artic demultiplex \ - --threads ${threads} \ - ${prefix}_fastq_pass.fastq" - -guppyplexCmd_m="artic guppyplex \ +guppyplexCmd="artic guppyplex \ --min-length 400 \ --max-length 800 \ --prefix ${prefix} \ - --directory ./ \ + --directory ./${inputData}/pass/barcode${barcode} \ --output ${prefix}_guppyplex_fastq_pass-NB${barcode}.fastq" +## medaka workflow specific minionCmd_m="artic minion \ --normalise 200 \ --threads ${threads} \ - --scheme-directory ${primerSchemes} \ --read-file ${prefix}_guppyplex_fastq_pass-NB${barcode}.fastq \ - --medaka \ - --medaka-model r941_min_high_g351 \ - ${primerScheme} \ + --model r941_e81_hac_g514 \ + --bed ${bed} \ + --ref ${ref} \ + ${prefix}" + +# clair3 workflow specific +minionCmd_c="artic minion \ + --normalise 200 \ + --threads ${threads} \ + --read-file ${prefix}_guppyplex_fastq_pass-NB${barcode}.fastq \ + --clair3 \ + --model r941_prom_hac_g360+g422 \ + --bed ${bed} \ + --ref ${ref} \ ${prefix}" # colours @@ -104,19 +83,20 @@ function cmdTester { ########################################################################################### # Run the tests. 
-# check that nanopolish or medaka is specified -if [ "$1" == "nanopolish" ] || [ "$1" == "medaka" ]; then +# check that clair3 or medaka is specified +if [ "$1" == "clair3" ] || [ "$1" == "medaka" ]; then echo -e "${BLUE}Starting tests...${NC}" echo -e "${BLUE} - using the $1 workflow${NC}" echo else - echo "please specify medaka or nanopolish" - echo "./test-runner.sh [medaka|nanopolish]" + echo "please specify medaka or clair3" + echo "./test-runner.sh [medaka|clair3]" exit 1 fi # setup a tmp directory to work in -mkdir tmp && cd tmp || exit +mkdir tmp || true +cd tmp || exit # download the data echo "downloading the test data..." @@ -125,30 +105,19 @@ cmdTester $extractCmd # run the correct workflow echo "running the pipeline..." -if [ "$1" == "nanopolish" ] +if [ "$1" == "medaka" ] then - - # collect the reads - cmdTester $gatherCmd_n - - # demultiplex - cmdTester $demuxCmd_n - - # run the core pipeline with nanopolish - cmdTester $minionCmd_n -else - # collect the reads - cmdTester $gatherCmd_m - - # demultiplex - cmdTester $demuxCmd_m - - # guppyplex - cmdTester $guppyplexCmd_m + cmdTester $guppyplexCmd # run the core pipeline with medaka cmdTester $minionCmd_m +else + # guppyplex the reads + cmdTester $guppyplexCmd + + # run the core pipeline with clair3 + cmdTester $minionCmd_c fi ########################################################################################### diff --git a/artic/align_trim_unit_test.py b/tests/align_trim_unit_test.py similarity index 60% rename from artic/align_trim_unit_test.py rename to tests/align_trim_unit_test.py index a22c0460..ad298ead 100644 --- a/artic/align_trim_unit_test.py +++ b/tests/align_trim_unit_test.py @@ -3,45 +3,26 @@ import pysam import pytest -from . import align_trim -from . import vcftagprimersites +from artic import align_trim +from artic import utils # help pytest resolve where test data is kept TEST_DIR = os.path.dirname(os.path.abspath(__file__)) # dummy primers (using min required fields) -p1 = { - "start": 0, - "end": 10, - "direction": "+", - "primerID": "primer1_LEFT" -} -p2 = { - "start": 30, - "end": 40, - "direction": "-", - "primerID": "primer1_RIGHT" -} -p3 = { - "start": 10, - "end": 20, - "direction": "+", - "primerID": "primer2_LEFT" -} -p4 = { - "start": 40, - "end": 50, - "direction": "-", - "primerID": "primer2_RIGHT" -} +p1 = {"start": 0, "end": 10, "direction": "+", "primerID": "primer1_LEFT"} +p2 = {"start": 30, "end": 40, "direction": "-", "primerID": "primer1_RIGHT"} +p3 = {"start": 10, "end": 20, "direction": "+", "primerID": "primer2_LEFT"} +p4 = {"start": 40, "end": 50, "direction": "-", "primerID": "primer2_RIGHT"} # primer scheme to hold dummy primers dummyPrimerScheme = [p1, p2, p3, p4] # actual the primer scheme for nCov -primerScheme = vcftagprimersites.read_bed_file( - TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed") +primerScheme = utils.read_bed_file( + TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V1/nCoV-2019.scheme.bed" +) # nCov alignment segment (derived from a real nCov read) seg1 = pysam.AlignedSegment() @@ -61,7 +42,9 @@ seg2.reference_id = 0 seg2.reference_start = 4294 seg2.mapping_quality = 60 -seg2.cigarstring = "41S9M1D17M1D69M5D1M1D40M2I12M1D41M1D117M1I3M1D4M2D11M2I2M1D18M1D18M1I25M56S" +seg2.cigarstring = ( + "41S9M1D17M1D69M5D1M1D40M2I12M1D41M1D117M1I3M1D4M2D11M2I2M1D18M1D18M1I25M56S" +) seg2.query_sequence = 
"CCAGGTTAACACAAAGACACCGACAACTTTCTTCAGCACCTACAGTGCTTAAAAGTGTAAAAGTGCCTTTACATTCTACCATCTATTATCTCTAATGAGAAGCAAGAAATTCTTGGAACTGTTTCTTGGAATTTGCAGCTTGCACATGCAGAAGAAACACGCAAATTAATGCCTGTCTGTGTGTGGAAACTAAGCCATAGTTTCAACTATACAGCGTAAATATAAGGGTATTAAATACAAGAGGGTGTGGTTGATTATGGTGCTAGATTTTACTTTTACACCAGTAAAACAACTGTAGCGTCACTTATCAACACGCTTAACGATCTAAATGAAACTCTTGTTACAATGCACACTGGCTGTAACACATGAAACTAAATTTGGAAGAAGCTGTCGGTATATGAGATCTCTCCAAAGTGCCAGCTACGGTTTCTGTTAGGTGCTGAAAAGAAAGTTGTCGGTGTCTTTGTGTGAACCTTAGCAATACGTAACC" seg2.query_qualities = [30] * 490 @@ -71,45 +54,59 @@ def test_find_primer(): - """test for the find primer function - """ + """test for the find primer function""" + print(dummyPrimerScheme) # test the primer finder on the primers themselves for primer in dummyPrimerScheme: if primer["direction"] == "+": result = align_trim.find_primer( - dummyPrimerScheme, primer["start"], primer["direction"]) + dummyPrimerScheme, primer["start"], primer["direction"] + ) else: result = align_trim.find_primer( - dummyPrimerScheme, primer["end"], primer["direction"]) - assert result[2]["primerID"] == primer["primerID"], "find_primer did not produce the query primer, which should be nearest" + dummyPrimerScheme, primer["end"], primer["direction"] + ) + + assert result + assert ( + result[2]["primerID"] == primer["primerID"] + ), "find_primer did not produce the query primer, which should be nearest" # test against other ref positions - result = align_trim.find_primer( - dummyPrimerScheme, 8, "+") - assert result[2]["primerID"] == "primer2_LEFT", "find_primer returned incorrect primer" - result = align_trim.find_primer( - dummyPrimerScheme, 25, "-") - assert result[2]["primerID"] == "primer1_RIGHT", "find_primer returned incorrect primer" + result = align_trim.find_primer(dummyPrimerScheme, 8, "+") + assert result + assert ( + result[2]["primerID"] == "primer2_LEFT" + ), "find_primer returned incorrect primer" + + result = align_trim.find_primer(dummyPrimerScheme, 25, "-") + assert result + assert ( + result[2]["primerID"] == "primer1_RIGHT" + ), "find_primer returned incorrect primer" def test_trim(): - """test for the trim function - """ + """test for the trim function""" def testRunner(seg, expectedCIGAR): # get the nearest primers to the alignment segment - p1 = align_trim.find_primer(primerScheme, seg.reference_start, '+') - p2 = align_trim.find_primer(primerScheme, seg.reference_end, '-') + p1 = align_trim.find_primer(primerScheme, seg.reference_start, "+") + p2 = align_trim.find_primer(primerScheme, seg.reference_end, "-") # get the primer positions - p1_position = p1[2]['end'] - p2_position = p2[2]['start'] + p1_position = p1[2]["end"] + p2_position = p2[2]["start"] # this segment should need forward and reverse softmasking - assert seg.reference_start < p1_position, "missed a forward soft masking opportunity (read: %s)" % seg.query_name - assert seg.reference_end > p2_position, "missed a reverse soft masking opportunity (read: %s)" % seg.query_name + assert seg.reference_start < p1_position, ( + "missed a forward soft masking opportunity (read: %s)" % seg.query_name + ) + assert seg.reference_end > p2_position, ( + "missed a reverse soft masking opportunity (read: %s)" % seg.query_name + ) # before masking, get the query_alignment_length and the CIGAR to use for testing later originalCigar = seg.cigarstring @@ -120,29 +117,49 @@ def testRunner(seg, expectedCIGAR): align_trim.trim(seg, p1_position, False, False) except Exception as e: raise Exception( - "problem soft masking left primer in {} (error: {})" 
.format(seg.query_name, e)) + "problem soft masking left primer in {} (error: {})".format( + seg.query_name, e + ) + ) # check the CIGAR and query alignment length is updated - assert seg.cigarstring != originalCigar, "cigar was not updated with a softmask (read: %s)" % seg.query_name - assert seg.query_alignment_length != originalQueryAlnLength, "query alignment was not updated after softmask (read: %s)" % seg.query_name + assert seg.cigarstring != originalCigar, ( + "cigar was not updated with a softmask (read: %s)" % seg.query_name + ) + assert seg.query_alignment_length != originalQueryAlnLength, ( + "query alignment was not updated after softmask (read: %s)" % seg.query_name + ) # trim the reverse primer try: align_trim.trim(seg, p2_position, True, False) except Exception as e: - raise Exception("problem soft masking right primer in {} (error: {})" .format( - seg.query_name, e)) + raise Exception( + "problem soft masking right primer in {} (error: {})".format( + seg.query_name, e + ) + ) # check the CIGAR and query alignment length is updated - assert seg.cigarstring != originalCigar, "cigar was not updated with a softmask (read: %s)" % seg.query_name - assert seg.query_alignment_length != originalQueryAlnLength, "query alignment was not updated after softmask (read: %s)" % seg.query_name + assert seg.cigarstring != originalCigar, ( + "cigar was not updated with a softmask (read: %s)" % seg.query_name + ) + assert seg.query_alignment_length != originalQueryAlnLength, ( + "query alignment was not updated after softmask (read: %s)" % seg.query_name + ) # check we have the right CIGAR - assert seg.cigarstring == expectedCIGAR, "cigar does not match expected cigar string (read: %s)" % seg.query_name + assert seg.cigarstring == expectedCIGAR, ( + "cigar does not match expected cigar string (read: %s)" % seg.query_name + ) # check the query alignment now matches the expected primer product - assert seg.reference_start >= p1_position, "left primer not masked corrrectly (read: %s)" % seg.query_name - assert seg.reference_end <= p2_position, "right primer not masked correctly (read: %s)" % seg.query_name + assert seg.reference_start >= p1_position, ( + "left primer not masked corrrectly (read: %s)" % seg.query_name + ) + assert seg.reference_end <= p2_position, ( + "right primer not masked correctly (read: %s)" % seg.query_name + ) # run the test with the two alignment segments testRunner(seg1, seg1expectedCIGAR) diff --git a/tests/get_scheme_unit_test.py b/tests/get_scheme_unit_test.py new file mode 100644 index 00000000..a100b97d --- /dev/null +++ b/tests/get_scheme_unit_test.py @@ -0,0 +1,34 @@ +import unittest +import os + +from artic.utils import get_scheme + + +class get_scheme_unit_test(unittest.TestCase): + def test_get_scheme_no_length(self): + + bed, ref, scheme_version = get_scheme( + scheme_name="nCoV-2019", + scheme_version="v1.0.0", + scheme_directory=f"{os.getcwd()}/primer-schemes", + ) + + self.assertEqual( + bed, f"{os.getcwd()}/primer-schemes/artic-sars-cov-2/400/v1.0.0/primer.bed" + ) + + self.assertEqual( + ref, + f"{os.getcwd()}/primer-schemes/artic-sars-cov-2/400/v1.0.0/reference.fasta", + ) + + def test_get_scheme_unclear_length(self): + + with self.assertRaises(SystemExit) as cm: + bed, ref, scheme_version = get_scheme( + scheme_name="hbv", + scheme_version="v1.0.0", + scheme_directory=f"{os.getcwd()}/primer-schemes", + ) + + self.assertEqual(cm.exception.code, 1) diff --git a/tests/minion_validator.py b/tests/minion_validator.py new file mode 100644 index 
00000000..705f3023 --- /dev/null +++ b/tests/minion_validator.py @@ -0,0 +1,379 @@ +"""minion_validator.py runs the functional tests of the minion pipeline, for both clair3 and medaka workflows. + +For each test dataset, it will make the following checks for each workflow: + + * check for existence of the final consensus sequence + * check the final consensus matches the expected consensus + * check for existence of the final VCF file (generated by artic_vcf_filter) + * check all expected variants are reported + * check no unexpected variants are reported + * if there is an expected deletion, the consensus sequence will be checked for its absence + +NOTE: + + * only a basic grep is used for checking the deletion in the consensus - the test will fail if the deletion sequence is present multiple times (I should improve this part of the test...) + +""" + +from Bio import SeqIO +from tqdm import tqdm +import glob +import os +import pathlib +import unittest +import requests +import sys +import tarfile +from cyvcf2 import VCF + + +from artic import pipeline + +# help pytest resolve where test data is kept +dataDir = str(pathlib.Path(__file__).parent.parent) + "/test-data/" + +# testData is a lookup of sampleIDs to download urls +testData = { + "MT007544": "https://raw.githubusercontent.com/artic-network/fieldbioinformatics/master/test-data/MT007544/MT007544.fastq", + "CVR1": "https://artic.s3.climb.ac.uk/validation-sets/CVR1.tgz", + "NRW01": "http://artic.s3.climb.ac.uk/validation-teams/NRW01.tgz", + "SP1": "http://artic.s3.climb.ac.uk/validation-teams/SP1.tgz", +} + +# refConsensuses is a dict of sample IDs and their expected consensus sequences from the nanopolish workflow +refConsensuses = { + "CVR1": dataDir + "consensus-sequences/CVR1.consensus.fasta", + "NRW01": dataDir + "consensus-sequences/NRW01.consensus.fasta", + "SP1": dataDir + "consensus-sequences/SP1.consensus.fasta", +} + +# refMedakaConsensuses is a dict of sample IDs and their expected consensus sequences from the medaka workflow +refMedakaConsensuses = { + "MT007544": dataDir + "consensus-sequences/MT007544.consensus.medaka.fasta", + "CVR1": dataDir + "consensus-sequences/CVR1.consensus.medaka.fasta", + "NRW01": dataDir + "consensus-sequences/NRW01.consensus.medaka.fasta", + "SP1": dataDir + "consensus-sequences/SP1.consensus.medaka.fasta", +} + +# clair3TestVariants is a nested dict of sample IDs and their expected variants when using the clair3 workflow +clair3TestVariants = { + "CVR1": { + # pos: (ref, alt, type, count) + 241: ["C", "T", "snp", 1], + 3037: ["C", "T", "snp", 1], + 12733: [ + "C", + "T", + "snp", + 1, + ], + 14408: ["C", "T", "snp", 1], + 23403: ["A", "G", "snp", 1], + 27752: ["C", "T", "snp", 1], + 28881: ["G", "A", "snp", 1], + 28882: ["G", "A", "snp", 1], + 28883: ["G", "C", "snp", 1], + }, + "NRW01": { + 1440: ["G", "A", "snp", 1], + 2891: ["G", "A", "snp", 1], + 4655: ["C", "T", "snp", 1], + 8422: ["G", "A", "snp", 1], + 22323: ["C", "T", "snp", 1], + 29546: ["C", "A", "snp", 2], + }, + "SP1": { + 241: ["C", "T", "snp", 1], + 3037: ["C", "T", "snp", 1], + 14408: ["C", "T", "snp", 1], + 23403: ["A", "G", "snp", 1], + }, +} + +# medakaTestVariants is a nested dict of sample IDs and their expected variants when using the medaka workflow +medakaTestVariants = { + "MT007544": { + # pos: (ref, alt, type, count) + 29749: ["ACGATCGAGTG", "A", "del", 1], + }, + "CVR1": { + 241: ["C", "T", "snp", 1], + 3037: ["C", "T", "snp", 1], + 12733: ["C", "T", "snp", 1], + 14408: ["C", "T", "snp", 1], +
23403: ["A", "G", "snp", 1], + 27752: ["C", "T", "snp", 1], + 28881: ["GGG", "AAC", "indel", 1], + }, + "NRW01": { + 1440: ["G", "A", "snp", 1], + 2891: ["G", "A", "snp", 1], + 4655: ["C", "T", "snp", 1], + 8422: ["G", "A", "snp", 1], + 22323: ["C", "T", "snp", 2], + 29546: ["C", "A", "snp", 2], + }, + "SP1": { + 241: ["C", "T", "snp", 1], + 3037: ["C", "T", "snp", 1], + 14408: ["C", "T", "snp", 1], + 23403: ["A", "G", "snp", 1], + }, +} + +# extraFlags is a way to add extra sample-specific commands to the validation test cmd +extraFlags = { + "medaka": { + "SP1": ["--no-frameshifts"], + }, + "clair3": { + "SP1": ["--no-frameshifts"], + "CVR1": ["--no-frameshifts"], + }, +} + + +# dataChecker will run before the tests and download all the test datasets if not present +def dataChecker(): + print("checking for validation datasets...") + for sampleID, url in testData.items(): + targetPath = dataDir + sampleID + if not os.path.exists(targetPath): + print("\tno data for {}".format(sampleID)) + print("\tmaking dir at {}".format(targetPath)) + os.mkdir(targetPath) + print("\tdownloading from {}".format(url)) + try: + download(url, dataDir, sampleID) + except Exception as e: + print("download failed: ", e) + sys.exit(1) + else: + print("\tfound data dir for {}".format(sampleID)) + print("validation datasets ready\n") + + +# download will download and untar a test dataset +def download(url, dataDir, sampleID): + filename = url.rsplit("/", 1)[1] + with open(f"{dataDir}/{filename}", "wb+") as f: + response = requests.get(url, stream=True) + total = int(response.headers.get("content-length")) + if total is None: + f.write(response.content) + else: + with tqdm(total=total, unit="B", unit_scale=True, desc=filename) as pbar: + for data in tqdm(response.iter_content(chunk_size=1024)): + f.write(data) + pbar.update(1024) + + tar = tarfile.open(dataDir + "/" + filename, "r:gz") + tar.extractall(dataDir) + tar.close() + os.remove(dataDir + "/" + filename) + + +# genCommand will create the minion command for the requested workflow (medaka or clair3) +def genCommand(sampleID, workflow): + cmd = [ + "minion", + "--threads", + "2", + "--read-file", + dataDir + sampleID + "/" + sampleID + ".fastq", + "--scheme-directory", + dataDir + "primer-schemes", + ] + if workflow == "medaka": + cmd.append("--model") + cmd.append("r941_min_high_g351") + + if workflow == "clair3": + cmd.append("--clair3") + cmd.append("--model") + cmd.append("r941_prom_hac_g360+g422") + + if sampleID in extraFlags[workflow]: + for flag in extraFlags[workflow][sampleID]: + cmd.append(flag) + + cmd.append("--scheme-name") + cmd.append("SARS-CoV-2") + cmd.append("--scheme-version") + cmd.append("v1.0.0") + cmd.append(sampleID) + return cmd + + +# cleanUp will remove all files generated by the pipeline for a given sampleID
+def cleanUp(sampleID): + fileList = glob.glob("{}*".format(sampleID)) + for filePath in fileList: + try: + os.remove(filePath) + except Exception: + sys.stderr.write(f"Error while deleting file : {filePath}") + + +# checkConsensus will return 1 if a subsequence is present in a consensus file, or 0 if absent + def checkConsensus(consensusFile, subSeq): + for record in SeqIO.parse(open(consensusFile, "r"), "fasta"): + if subSeq in record.seq: + return 1 + return 0 + + +# runner is the test runner +def runner(workflow, sampleID): + + if workflow == "clair3": + data = clair3TestVariants + elif workflow == "medaka": + data = medakaTestVariants + else: + sys.stderr.write("invalid workflow specified") + assert False + + if 
sampleID not in data: + sys.stderr.write("no test data for {}".format(sampleID)) + assert False + + expVariants = data[sampleID] + + # check the number of validation datasets requested + # counter = numValidations + # if (numValidations < 0) or (numValidations > len(testData)): + # counter = len(testData) + + # generate the command + cmd = genCommand(sampleID, workflow) + + # setup and run the minion parser + parser = pipeline.init_pipeline_parser() + + # parse the arguments + try: + args = parser.parse_args(cmd) + except SystemExit: + sys.stderr.write("failed to parse valid command for `artic minion`") + + # run the minion pipeline + try: + args.func(parser, args) + except SystemExit: + sys.stderr.write("artic minion exited early with an error") + assert False + + # check the ARTIC consensus was created + consensusFile = "%s.consensus.fasta" % sampleID + assert os.path.exists(consensusFile), "no consensus produced for {}".format( + sampleID + ) + testSeqs = SeqIO.parse(open(consensusFile, "r"), "fasta") + testConsensus = next(testSeqs) + + # check the ARTIC consensus sequence matches the one on record + # if workflow == "medaka": + # refSeqs = SeqIO.parse(open(refMedakaConsensuses[sampleID], "r"), "fasta") + # else: + # refSeqs = SeqIO.parse(open(refConsensuses[sampleID], "r"), "fasta") + # refConsensus = next(refSeqs) + # assert ( + # testConsensus.seq == refConsensus.seq + # ), "produced ARTIC consensus does not match expected consensus for {}".format( + # sampleID + # ) + + # check the ARTIC VCF was created + vcfFile = "%s.normalised.vcf.gz" % sampleID + assert os.path.exists(vcfFile), "no VCF produced for {}".format(sampleID) + + obs_variants = {} + # open the VCF and check the reported variants match the expected + for record in VCF(vcfFile): + obs_variants[record.POS] = [record.REF, str(record.ALT[0]), record.var_type] + if record.POS in expVariants: + assert ( + record.REF == expVariants[record.POS][0] + ), "incorrect REF reported in VCF for {} at position {}".format( + sampleID, record.POS + ) + assert ( + str(record.ALT[0]) == expVariants[record.POS][1] + ), "incorrect ALT reported in VCF for {} at position {}".format( + sampleID, record.POS + ) + + # if this is an expected deletion, check the consensus sequence for it's absence + if expVariants[record.POS][2] == "del": + assert ( + checkConsensus(consensusFile, record.REF) == 0 + ), "expected deletion for {} was reported but was left in consensus".format( + sampleID + ) + + # also check that the VCF record is correctly labelled as DEL + assert ( + record.is_deletion + ), "deletion for {} not formatted correctly in VCF".format(sampleID) + + # if this is an expected indel, check that the VCF record is correctly labelled as INDEL + if expVariants[record.POS][2] == "indel": + assert ( + record.is_indel + ), "indel for {} not formatted correctly in VCF".format(sampleID) + + # else, check that the VCF record is correctly labelled as SNP + if expVariants[record.POS][2] == "snp": + assert ( + record.is_snp + ), "snp for {} not formatted correctly in VCF".format(sampleID) + + # decrement/remove the variant from the expected list, so we can keep track of checked variants + # expVariants[record.POS][3] -= 1 + # if expVariants[record.POS][3] == 0: + # del expVariants[record.POS] + + else: + sys.stderr.write( + f"unexpected variant found for {sampleID}: {str(record.ALT[0])} at {record.POS}" + ) + assert False + + # check we've confirmed all the expected variants + for key, val in expVariants.items(): + assert obs_variants.get( + key + ), 
f"expected variant not found for {sampleID}: {val[0]} -> {val[1]} at {key}" + assert ( + val[0:2] == obs_variants[key][0:2] + ), f"expected variant not found for {sampleID}: {val[0]} -> {val[1]} at {key}" + + # clean up pipeline files + cleanUp(sampleID) + + +class TestMinion(unittest.TestCase): + def setUp(self): + dataChecker() + + def test_Clair3_CVR1(self): + runner("clair3", "CVR1") + + def test_Clair3_NRW01(self): + runner("clair3", "NRW01") + + def test_Clair3_SP1(self): + runner("clair3", "SP1") + + def test_Medaka_MT007544(self): + runner("medaka", "MT007544") + + def test_Medaka_CVR1(self): + runner("medaka", "CVR1") + + def test_Medaka_NRW01(self): + runner("medaka", "NRW01") + + def test_Medaka_SP1(self): + runner("medaka", "SP1") diff --git a/artic/pipeline_unit_test.py b/tests/pipeline_unit_test.py similarity index 78% rename from artic/pipeline_unit_test.py rename to tests/pipeline_unit_test.py index c8247c07..9a21f14f 100644 --- a/artic/pipeline_unit_test.py +++ b/tests/pipeline_unit_test.py @@ -2,21 +2,22 @@ import argparse import pytest -from . import pipeline +from artic import pipeline def test_pipeline_parser(): - """basic test for the pipeline parser - """ + """basic test for the pipeline parser""" # setup a parser parser = pipeline.init_pipeline_parser() # set up a valid command dummyCLI = [ "minion", - "--medaka", - "some-scheme.bed", - "some-prefix" + "--model", + "some_nonsense_model", + "--read-file", + "some_reads.fastq", + "some-prefix", ] # try with required arguments missing @@ -29,6 +30,7 @@ def test_pipeline_parser(): except SystemExit: print("failed to parse valid command") assert False + assert args.command == dummyCLI[0], "incorrect subcommand registered" # for arg, val in vars(args).items(): diff --git a/artic/vcftagprimersites_unit_test.py b/tests/utils_unit_test.py similarity index 52% rename from artic/vcftagprimersites_unit_test.py rename to tests/utils_unit_test.py index 49ca7908..944bf424 100644 --- a/artic/vcftagprimersites_unit_test.py +++ b/tests/utils_unit_test.py @@ -2,7 +2,7 @@ import os import pytest -from . 
import vcftagprimersites +from artic import utils # help pytest resolve where test data is kept @@ -12,30 +12,40 @@ def test_read_bed_file(): # process the nCoV-2019 V3 primer scheme - primerScheme = vcftagprimersites.read_bed_file( - TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V3/nCoV-2019.scheme.bed") + primerScheme = utils.read_bed_file( + TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V3/nCoV-2019.bed" + ) # check the the alts have been collapsed into a canonical primer site assert len(primerScheme) == 196, "alts were not collapsed" # check that alts are merging by union (not intersection) for entry in primerScheme: - if entry['Primer_ID'] == 'nCoV-2019_45_RIGHT': - assert entry['start'] == 13660, "failed to merge nCov-2019_45_RIGHT alt, bad start" - assert entry['end'] == 13699, "failed to merge nCov-2019_45_RIGHT alt, bad end" - if entry['Primer_ID'] == 'nCoV-2019_89_LEFT': - assert entry['start'] == 26835, "failed to merge nCoV-2019_89_LEFT alt, bad start" - assert entry['end'] == 26860, "failed to merge nCoV-2019_89_LEFT alt, bad end" - - # check access to primer scheme fields + if entry["Primer_ID"] == "nCoV-2019_45_RIGHT": + assert ( + entry["start"] == 13660 + ), "failed to merge nCov-2019_45_RIGHT alt, bad start" + assert ( + entry["end"] == 13699 + ), "failed to merge nCov-2019_45_RIGHT alt, bad end" + if entry["Primer_ID"] == "nCoV-2019_89_LEFT": + assert ( + entry["start"] == 26835 + ), "failed to merge nCoV-2019_89_LEFT alt, bad start" + assert ( + entry["end"] == 26860 + ), "failed to merge nCoV-2019_89_LEFT alt, bad end" + + # check access to primer scheme fields for row in primerScheme: - assert 'Primer_ID' in row, "failed to parse primer scheme for Primer_ID" - assert 'PoolName' in row, "failed to parse primer scheme for PoolName" + assert "Primer_ID" in row, "failed to parse primer scheme for Primer_ID" + assert "PoolName" in row, "failed to parse primer scheme for PoolName" # process the nCoV-2019 V2 primer scheme # this scheme has a single alt that has replaced the original primer so no alts should be collapsed - primerScheme2 = vcftagprimersites.read_bed_file( - TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V2/nCoV-2019.bed") + primerScheme2 = utils.read_bed_file( + TEST_DIR + "/../test-data/primer-schemes/nCoV-2019/V2/nCoV-2019.bed" + ) assert len(primerScheme2) == 196, "no alts should have been collapsed" # TODO: this is a starting point for the unit tests - more to come... 
From ed69b8290fd7656bfe48e4e00e9d3c1eb094d8f7 Mon Sep 17 00:00:00 2001 From: Rachel Colquhoun Date: Thu, 22 Aug 2024 10:59:22 +0100 Subject: [PATCH 02/11] 1.4.0 dev m1 (#136) * mods to Dockerfile * wip * update --- Dockerfile | 5 ++++- environment.yml | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index 272a109a..4bbe0ef7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -21,8 +21,11 @@ USER root RUN python3 -m pip install ./fieldbioinformatics +RUN pip uninstall -y tensorflow keras pyabpoa \ + && micromamba install -y -c conda-forge -c bioconda tensorflow=2.11 keras=2.11 + USER $MAMBA_USER ENTRYPOINT ["/usr/local/bin/_entrypoint.sh"] -CMD ["/bin/bash"] \ No newline at end of file +CMD ["/bin/bash"] diff --git a/environment.yml b/environment.yml index 3f0c52d1..6f90fecc 100644 --- a/environment.yml +++ b/environment.yml @@ -4,14 +4,15 @@ channels: - bioconda - defaults dependencies: + - python==3.9 - longshot - bcftools - biopython - bwa + - clair3 - clint - htslib - minimap2 - - clair3 - multiqc - muscle<5.1 - pandas @@ -25,4 +26,4 @@ dependencies: - samtools - tqdm - pip: - - medaka \ No newline at end of file + - medaka==1.12.0 From 19d1e4e8129a15c3e381aa760ec4325c7c095417 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 22 Aug 2024 11:08:10 +0100 Subject: [PATCH 03/11] 1.4.1 version bump --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 9afe6d2e..a885ca4d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = artic -version = 1.4.0 +version = 1.4.1 author = Nick Loman author_email = n.j.loman@bham.ac.uk maintainer = Sam Wilkinson From 0a7fc7ec7079bad7b29a42c95a7d1c4b926f8a63 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Tue, 3 Sep 2024 09:14:19 +0100 Subject: [PATCH 04/11] Properly name fasta header --- artic/minion.py | 3 ++- environment.yml | 1 - setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/artic/minion.py b/artic/minion.py index 60e065e1..3478e881 100755 --- a/artic/minion.py +++ b/artic/minion.py @@ -242,7 +242,8 @@ def run(parser, args): ) # 11) apply the header to the consensus sequence and run alignment against the reference sequence - fasta_header = "%s/ARTIC/medaka" % (args.sample) + caller = "medaka" if not args.clair3 else "clair3" + fasta_header = f"{args.sample}/ARTIC/{caller}" cmds.append( 'artic_fasta_header %s.consensus.fasta "%s"' % (args.sample, fasta_header) ) diff --git a/environment.yml b/environment.yml index 6f90fecc..5fb54dbf 100644 --- a/environment.yml +++ b/environment.yml @@ -19,7 +19,6 @@ dependencies: - pip - pysam - pytest - - python - cyvcf2 - pyfaidx - requests diff --git a/setup.cfg b/setup.cfg index a885ca4d..93e57943 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = artic -version = 1.4.1 +version = 1.4.2 author = Nick Loman author_email = n.j.loman@bham.ac.uk maintainer = Sam Wilkinson From 6fa6aa81fbf134841a17665081feee2b54481d42 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 19 Sep 2024 15:46:24 +0100 Subject: [PATCH 05/11] Add multi-ref scheme support --- artic/minion.py | 22 +++---- artic/utils.py | 157 +++++++++++++++++++++++++++++++++++++++++++++++- environment.yml | 3 +- 3 files changed, 166 insertions(+), 16 deletions(-) diff --git a/artic/minion.py b/artic/minion.py index 3478e881..45e42b56 100755 --- a/artic/minion.py +++ b/artic/minion.py @@ -51,6 +51,13 @@ def run(parser, args): ) raise SystemExit(1) + if not os.path.exists(args.read_file): + print( + 
colored.red("failed to find read-file: {}".format(args.read_file)), + file=sys.stderr, + ) + raise SystemExit(1) + # check the parameters and set up the filenames ## find the primer scheme, reference sequence and confirm scheme version @@ -63,6 +70,7 @@ def run(parser, args): scheme_version=args.scheme_version, scheme_directory=args.scheme_directory, scheme_length=args.scheme_length, + read_file=args.read_file, ) if not os.path.exists(bed) or not os.path.exists(ref): @@ -76,18 +84,6 @@ def run(parser, args): ) raise SystemExit(1) - ## set up the read file - if args.read_file: - read_file = args.read_file - else: - read_file = "%s.fasta" % (args.sample) - if not os.path.exists(read_file): - print( - colored.red("failed to find read-file: {}".format(read_file)), - file=sys.stderr, - ) - raise SystemExit(1) - ## collect the primer pools pools = set([row["PoolName"] for row in read_bed_file(bed)]) @@ -100,7 +96,7 @@ def run(parser, args): # 3) index the ref & align with minimap cmds.append( - f"minimap2 -a -x map-ont -t {args.threads} {ref} {read_file} | samtools view -bS -F 4 - | samtools sort -o {args.sample}.sorted.bam -" + f"minimap2 -a -x map-ont -t {args.threads} {ref} {args.read_file} | samtools view -bS -F 4 - | samtools sort -o {args.sample}.sorted.bam -" ) cmds.append(f"samtools index {args.sample}.sorted.bam") diff --git a/artic/utils.py b/artic/utils.py index 86dc5ffe..a5c3fa54 100644 --- a/artic/utils.py +++ b/artic/utils.py @@ -6,6 +6,9 @@ import hashlib import re import pandas as pd +from Bio import SeqIO +import subprocess +import csv from clint.textui import colored @@ -315,6 +318,7 @@ def get_scheme( scheme_version: str, scheme_directory: str, scheme_length: int = False, + read_file: str = False ): """Get the primer scheme and reference fasta file from the manifest @@ -382,7 +386,7 @@ def get_scheme( if not manifest["primerschemes"].get(scheme_name): print( colored.red( - f"Scheme name alias does not exist in manifest, this should never happen, please create an issue on https://github.com/quick-lab/primerschemes/issues or https://github.com/artic-network/fieldbioinformatics/issues if you see this message" + "Scheme name alias does not exist in manifest, this should never happen, please create an issue on https://github.com/quick-lab/primerschemes/issues or https://github.com/artic-network/fieldbioinformatics/issues if you see this message" ) ) raise SystemExit(1) @@ -416,7 +420,7 @@ def get_scheme( if not version_pattern.match(scheme_version): print( colored.red( - f"Invalid scheme version format, please provide a version in the format 'vX.X.X', e.g. v1.0.0" + "Invalid scheme version format, please provide a version in the format 'vX.X.X', e.g. v1.0.0" ) ) raise SystemExit(1) @@ -431,6 +435,27 @@ def get_scheme( scheme = scheme[scheme_length][scheme_version] + if scheme.get("ref_selection"): + if not read_file: + print( + colored.red( + "Reference selection is available for this scheme but reads were not provided to 'get_scheme'. This should never happen, please submit an issue to github.com/artic-network/fieldbioinformatics/issues" + ) + ) + raise SystemExit(1) + + print( + colored.yellow( + f"Reference selection is available for scheme {scheme_name}, deciding which reference to use based on your reads. 
If you would prefer to specify the reference to use, provide the same scheme name with the appropriate suffix, choices are: {', '.join(str(x) for x in scheme[scheme_length].keys() if "-" in x)}" + ) + ) + + suffix = pick_best_ref(multi_ref_url=scheme["ref_selection"], multi_ref_md5=scheme["ref_selection_md5"], read_file=read_file, n_reads=10000, scheme_path=scheme_directory, mm2_threads=4) + + scheme_version = f"{scheme_version}-{suffix}" + + print(colored.yellow(f"Selected reference suffix: {suffix}")) + scheme_path = os.path.join( scheme_directory, scheme_name, scheme_length, scheme_version ) @@ -623,3 +648,131 @@ def get_scheme_legacy(scheme_name, scheme_directory, scheme_version="1"): file=sys.stderr, ) raise SystemExit(1) + + +def pick_best_ref( + multi_ref_url: str, + multi_ref_md5: str, + read_file: str, + n_reads: int, + scheme_path: str, + mm2_threads: int, +): + """Pick the best reference from a multi-reference alignment + + Args: + multi_ref_url (str): URL to the multi-reference alignment from quick-lab/primerschemes + multi_ref_md5 (str): MD5 hash of the multi-reference alignment + read_file (str): Path to the read file to test against the references + n_reads (int): How many reads to sample from the read file for testing (default: 10000) + scheme_path (str): Path to the scheme directory + mm2_threads (int): Number of threads to use when aligning using minimap2 + + Raises: + SystemExit: If the ref selection fasta file cannot be fetched + SystemExit: If the ref selection fasta file get returns a status code other than 200 + + Returns: + str: Primer scheme suffix of the most appropriate reference for the reads + """ + + ref_selection_path = os.path.join(scheme_path, "ref_select.fasta") + + if not os.path.exists(scheme_path): + try: + response = requests.get(multi_ref_url) + except requests.exceptions.RequestException as error: + print(colored.red(f"Failed to ref selection fasta with Exception: {error}")) + raise SystemExit(1) + + if response.status_code != 200: + print( + colored.red( + f"Failed to fetch ref selection fasta with status code: {response.status_code}" + ) + ) + raise SystemExit(1) + + with open(ref_selection_path, "w") as bed_file: + bed_file.write(response.text) + + check_hash(ref_selection_path, multi_ref_md5) + + flat_ref_path = os.path.join(scheme_path, "flattened_references.fasta") + + possible_references = {} + + with open(ref_selection_path, "r") as ref_selection_fh: + + for reference in SeqIO.parse(ref_selection_fh, "fasta"): + suffix = reference.Description.split()[1] + possible_references[reference.id] = suffix + + # Flatten out the alignment into flat fasta reference + flattened = str(reference.seq).replace("-", "") + + with open(flat_ref_path, "a") as flat_ref_fh: + flat_ref_fh.write(f">{reference.Description}\n{flattened}\n") + + # fq_it = mappy.fastx_read(fastq_path) + seqtk = subprocess.run( + ["seqtk", "sample", str(read_file), str(n_reads)], stdout=subprocess.PIPE + ) + + result = subprocess.run( + [ + "minimap2", + "-x", + "map-ont", + "-t", + str(mm2_threads), + str(flat_ref_path), + "-", + ], + input=seqtk.stdout, + stdout=subprocess.PIPE, + ) + + reader = csv.DictReader( + result.stdout.decode("utf-8").split("\n"), + delimiter="\t", + fieldnames=[ + "query_name", + "query_len", + "query_start", + "query_end", + "strand", + "ctg_name", + "ctg_len", + "ref_start", + "ref_end", + "n_matches", + "alignment_len", + "mapq", + ], + ) + + read_results = {} + + for alignment in reader: + + if not alignment: + continue + + identity = 
int(alignment["n_matches"]) / int(alignment["alignment_len"]) + + read_results.setdefault(alignment["query_name"], {}) + read_results[alignment["query_name"]].setdefault(alignment["ctg_name"], 0) + + if identity > read_results[alignment["query_name"]][alignment["ctg_name"]]: + read_results[alignment["query_name"]][alignment["ctg_name"]] = identity + + ref_results = {ref: 0 for ref in possible_references.keys()} + + for read, details in read_results.items(): + best_ctg = max(details, key=details.get) + ref_results[best_ctg] += 1 + + best_ref = max(ref_results, key=ref_results.get) + + return possible_references[best_ref] diff --git a/environment.yml b/environment.yml index 5fb54dbf..ebe70482 100644 --- a/environment.yml +++ b/environment.yml @@ -24,5 +24,6 @@ dependencies: - requests - samtools - tqdm + - seqtk - pip: - - medaka==1.12.0 + - medaka-cpu From bc196f5182b0570b242e8ab09ae350a96bceb971 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 19 Sep 2024 15:54:49 +0100 Subject: [PATCH 06/11] Fix message log generator --- artic/utils.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/artic/utils.py b/artic/utils.py index a5c3fa54..a2fd8607 100644 --- a/artic/utils.py +++ b/artic/utils.py @@ -318,7 +318,7 @@ def get_scheme( scheme_version: str, scheme_directory: str, scheme_length: int = False, - read_file: str = False + read_file: str = False, ): """Get the primer scheme and reference fasta file from the manifest @@ -446,11 +446,18 @@ def get_scheme( print( colored.yellow( - f"Reference selection is available for scheme {scheme_name}, deciding which reference to use based on your reads. If you would prefer to specify the reference to use, provide the same scheme name with the appropriate suffix, choices are: {', '.join(str(x) for x in scheme[scheme_length].keys() if "-" in x)}" + f"Reference selection is available for scheme {scheme_name}, deciding which reference to use based on your reads. 
If you would prefer to specify the reference to use, provide the same scheme name with the appropriate suffix, choices are: {', '.join(str(x) for x in scheme[scheme_length].keys() if '-' in x)}" ) ) - suffix = pick_best_ref(multi_ref_url=scheme["ref_selection"], multi_ref_md5=scheme["ref_selection_md5"], read_file=read_file, n_reads=10000, scheme_path=scheme_directory, mm2_threads=4) + suffix = pick_best_ref( + multi_ref_url=scheme["ref_selection"], + multi_ref_md5=scheme["ref_selection_md5"], + read_file=read_file, + n_reads=10000, + scheme_path=scheme_directory, + mm2_threads=4, + ) scheme_version = f"{scheme_version}-{suffix}" From a08bd087a4f5f3efacb837516ef8a859e20bc994 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Thu, 19 Sep 2024 15:56:50 +0100 Subject: [PATCH 07/11] Don't use medaka cpu --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index ebe70482..2d27c46a 100644 --- a/environment.yml +++ b/environment.yml @@ -26,4 +26,4 @@ dependencies: - tqdm - seqtk - pip: - - medaka-cpu + - medaka==1.12.0 From 8f9de84b62d603601b67b0f964fdb3bba547a39a Mon Sep 17 00:00:00 2001 From: Biowilko Date: Fri, 20 Sep 2024 10:48:14 +0100 Subject: [PATCH 08/11] Version bump --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 93e57943..d565b181 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = artic -version = 1.4.2 +version = 1.4.3 author = Nick Loman author_email = n.j.loman@bham.ac.uk maintainer = Sam Wilkinson From 1d0ac33e2f9e5b5ee235ca97b7ee709d9e363b0d Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 21 Oct 2024 13:47:21 +0100 Subject: [PATCH 09/11] Fix fetch scheme, add fetch_scheme entrypoint --- artic/fetch_scheme.py | 51 +++++++++++++++++++++++++++++++++++++++++ artic/utils.py | 53 +++++++++++++++++++++++++++++++------------ setup.cfg | 1 + 3 files changed, 91 insertions(+), 14 deletions(-) create mode 100644 artic/fetch_scheme.py diff --git a/artic/fetch_scheme.py b/artic/fetch_scheme.py new file mode 100644 index 00000000..5e15128d --- /dev/null +++ b/artic/fetch_scheme.py @@ -0,0 +1,51 @@ +from artic.utils import get_scheme +from pathlib import Path + + +def main(): + import argparse + import os + + parser = argparse.ArgumentParser() + parser.add_argument( + "--scheme-name", + type=str, + required=True, + help="Name of the scheme you wish to fetch", + ) + parser.add_argument( + "--scheme-version", + type=str, + required=True, + help="Version of the scheme you wish to fetch", + ) + parser.add_argument( + "--scheme-directory", + type=Path, + required=True, + help="Directory where schemes are stored", + default=f"{os.getcwd()}/primer-schemes", + ) + parser.add_argument( + "--scheme-length", + type=str, + help="Length of the scheme to fetch, only required if the scheme has multiple possible lengths.", + ) + parser.add_argument( + "--read-file", + type=Path, + help="FASTQ file containing reads sequenced with the scheme in question, this is only required when fetching a scheme with reference selection functionality.", + ) + args = parser.parse_args() + + get_scheme( + scheme_name=args.scheme_name, + scheme_version=args.scheme_version, + scheme_directory=args.scheme_directory, + scheme_length=args.scheme_length, + read_file=args.read_file, + ) + + +if __name__ == "__main__": + main() diff --git a/artic/utils.py b/artic/utils.py index a2fd8607..89d08d68 100644 --- a/artic/utils.py +++ b/artic/utils.py @@ -317,7 +317,7 @@ def get_scheme( scheme_name: 
str, scheme_version: str, scheme_directory: str, - scheme_length: int = False, + scheme_length: str = False, read_file: str = False, ): """Get the primer scheme and reference fasta file from the manifest @@ -326,7 +326,7 @@ def get_scheme( scheme_name (str): Name of the scheme scheme_version (str): Version of the scheme scheme_directory (str): Directory where schemes are stored - scheme_length (int, optional): Length of the scheme. Defaults to False. + scheme_length (str, optional): Length of the scheme. Defaults to False. Returns: str: Path to the primer bed file @@ -335,8 +335,11 @@ def get_scheme( """ try: + # response = requests.get( + # "https://raw.githubusercontent.com/quick-lab/primerschemes/main/index.json" + # ) response = requests.get( - "https://raw.githubusercontent.com/quick-lab/primerschemes/main/index.json" + "https://raw.githubusercontent.com/ChrisgKent/primerschemes/main/index.json" ) except requests.exceptions.RequestException as error: print(colored.red(f"Failed to fetch manifest with Exception: {error}")) @@ -353,8 +356,11 @@ def get_scheme( manifest = response.json() try: + # response = requests.get( + # "https://raw.githubusercontent.com/quick-lab/primerschemes/main/aliases.json" + # ) response = requests.get( - "https://raw.githubusercontent.com/quick-lab/primerschemes/main/aliases.json" + "https://raw.githubusercontent.com/ChrisgKent/primerschemes/main/aliases.json" ) except requests.exceptions.RequestException as error: print(colored.red(f"Failed to fetch scheme aliases with Exception: {error}")) @@ -433,13 +439,11 @@ def get_scheme( ) raise SystemExit(1) - scheme = scheme[scheme_length][scheme_version] - - if scheme.get("ref_selection"): + if scheme[scheme_length][scheme_version].get("refselect"): if not read_file: print( colored.red( - "Reference selection is available for this scheme but reads were not provided to 'get_scheme'. This should never happen, please submit an issue to github.com/artic-network/fieldbioinformatics/issues" + f"Reference selection is available for this scheme but reads were not provided. 
Either provide a read file using --read-file or specify the reference to use by providing the same scheme name with the appropriate suffix, choices are: {', '.join(str(x) for x in scheme[scheme_length].keys() if '-' in x)}" ) ) raise SystemExit(1) @@ -450,12 +454,28 @@ def get_scheme( ) ) + if len(scheme[scheme_length][scheme_version]["refselect"].keys()) > 1: + print( + colored.red( + f"Multiple reference selection options found for scheme {scheme_name}, fieldbioinformatics does not currently support multi pathogen schemes" + ) + ) + raise SystemExit(1) + + msa_data = next( + iter(scheme[scheme_length][scheme_version]["refselect"].values()) + ) + + scheme_select_dir = os.path.join( + scheme_directory, scheme_name, scheme_length, scheme_version + ) + suffix = pick_best_ref( - multi_ref_url=scheme["ref_selection"], - multi_ref_md5=scheme["ref_selection_md5"], + multi_ref_url=msa_data["url"], + multi_ref_md5=msa_data["md5"], read_file=read_file, n_reads=10000, - scheme_path=scheme_directory, + scheme_path=scheme_select_dir, mm2_threads=4, ) @@ -467,6 +487,8 @@ def get_scheme( scheme_directory, scheme_name, scheme_length, scheme_version ) + scheme = scheme[scheme_length][scheme_version] + if not os.path.exists(scheme_path): os.makedirs(scheme_path, exist_ok=True) @@ -683,9 +705,12 @@ def pick_best_ref( str: Primer scheme suffix of the most appropriate reference for the reads """ - ref_selection_path = os.path.join(scheme_path, "ref_select.fasta") + ref_selection_path = os.path.join(scheme_path, "refselect.fasta") if not os.path.exists(scheme_path): + os.makedirs(scheme_path, exist_ok=True) + + if not os.path.exists(ref_selection_path): try: response = requests.get(multi_ref_url) except requests.exceptions.RequestException as error: @@ -712,14 +737,14 @@ def pick_best_ref( with open(ref_selection_path, "r") as ref_selection_fh: for reference in SeqIO.parse(ref_selection_fh, "fasta"): - suffix = reference.Description.split()[1] + suffix = reference.description.split()[1] possible_references[reference.id] = suffix # Flatten out the alignment into flat fasta reference flattened = str(reference.seq).replace("-", "") with open(flat_ref_path, "a") as flat_ref_fh: - flat_ref_fh.write(f">{reference.Description}\n{flattened}\n") + flat_ref_fh.write(f">{reference.description}\n{flattened}\n") # fq_it = mappy.fastx_read(fastq_path) seqtk = subprocess.run( diff --git a/setup.cfg b/setup.cfg index d565b181..893bfd05 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,4 +36,5 @@ console_scripts = artic_fasta_header=artic.fasta_header:main artic_mask=artic.mask:main artic_get_stats=artic.artic_mqc:main + fetch_scheme=artic.fetch_scheme:main From 021d8492a3ed0fcbca0827e7c65ee10ca37e04da Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 21 Oct 2024 13:58:11 +0100 Subject: [PATCH 10/11] Improve arguments --- artic/fetch_scheme.py | 1 - 1 file changed, 1 deletion(-) diff --git a/artic/fetch_scheme.py b/artic/fetch_scheme.py index 5e15128d..28ec5a56 100644 --- a/artic/fetch_scheme.py +++ b/artic/fetch_scheme.py @@ -22,7 +22,6 @@ def main(): parser.add_argument( "--scheme-directory", type=Path, - required=True, help="Directory where schemes are stored", default=f"{os.getcwd()}/primer-schemes", ) From 42b58d91aec19cc1774bce165a036e822a800a15 Mon Sep 17 00:00:00 2001 From: Biowilko Date: Mon, 21 Oct 2024 14:01:22 +0100 Subject: [PATCH 11/11] Revert to production primerschemes repo in anticipation of multiref update --- artic/pipeline.py | 1 - artic/utils.py | 14 ++++---------- artic/version.py | 2 +- 3 files changed, 5 
insertions(+), 12 deletions(-) diff --git a/artic/pipeline.py b/artic/pipeline.py index 76f56656..efd4b0a1 100755 --- a/artic/pipeline.py +++ b/artic/pipeline.py @@ -4,7 +4,6 @@ # Thanks to Aaron Quinlan for the argparse implementation from poretools. import argparse -import sys import os from . import version diff --git a/artic/utils.py b/artic/utils.py index 89d08d68..3a167d88 100644 --- a/artic/utils.py +++ b/artic/utils.py @@ -124,7 +124,7 @@ def identify_bed_file(bed_file): if version != 3: print( colored.red( - f"Scheme BED does not appear to be a consistent scheme version, for primer scheme formats please see https://github.com/ChrisgKent/primal-page" + "Scheme BED does not appear to be a consistent scheme version, for primer scheme formats please see https://github.com/ChrisgKent/primal-page" ) ) raise SystemExit(1) @@ -147,7 +147,7 @@ def identify_bed_file(bed_file): if version == 3: print( colored.red( - f"Scheme BED mixed primer ID formats, please ensure your BED file is consistent" + "Scheme BED mixed primer ID formats, please ensure your BED file is consistent" ) ) raise SystemExit(1) @@ -335,11 +335,8 @@ def get_scheme( """ try: - # response = requests.get( - # "https://raw.githubusercontent.com/quick-lab/primerschemes/main/index.json" - # ) response = requests.get( - "https://raw.githubusercontent.com/ChrisgKent/primerschemes/main/index.json" + "https://raw.githubusercontent.com/quick-lab/primerschemes/main/index.json" ) except requests.exceptions.RequestException as error: print(colored.red(f"Failed to fetch manifest with Exception: {error}")) @@ -356,11 +353,8 @@ def get_scheme( manifest = response.json() try: - # response = requests.get( - # "https://raw.githubusercontent.com/quick-lab/primerschemes/main/aliases.json" - # ) response = requests.get( - "https://raw.githubusercontent.com/ChrisgKent/primerschemes/main/aliases.json" + "https://raw.githubusercontent.com/quick-lab/primerschemes/main/aliases.json" ) except requests.exceptions.RequestException as error: print(colored.red(f"Failed to fetch scheme aliases with Exception: {error}")) diff --git a/artic/version.py b/artic/version.py index 3e8d9f94..aa56ed40 100644 --- a/artic/version.py +++ b/artic/version.py @@ -1 +1 @@ -__version__ = "1.4.0" +__version__ = "1.4.3"