Skip to content

Add support for segmented genomes / amplicon panels #146

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 69 additions & 61 deletions artic/align_trim.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
consumesQuery = [True, True, False, False, True, False, False, True]


def find_primer(bed, pos, direction, threshold=20):
def find_primer(bed, pos, direction, chrom, threshold=35):
"""Given a reference position and a direction of travel, walk out and find the nearest primer site.

Parameters
Expand All @@ -39,14 +39,14 @@ def find_primer(bed, pos, direction, threshold=20):
primer_distances = [
(abs(p["start"] - pos), p["start"] - pos, p)
for p in bed
if (p["direction"] == direction) and (pos >= (p["start"] - threshold))
if (p["direction"] == direction) and (pos >= (p["start"] - threshold)) and chrom == p["chrom"]
]

else:
primer_distances = [
(abs(p["end"] - pos), p["end"] - pos, p)
for p in bed
if (p["direction"] == direction) and (pos <= (p["end"] + threshold))
if (p["direction"] == direction) and (pos <= (p["end"] + threshold)) and chrom == p["chrom"]
]

if not primer_distances:
Expand Down Expand Up @@ -205,8 +205,10 @@ def handle_segment(
return False

# locate the nearest primers to this alignment segment
p1 = find_primer(bed, segment.reference_start, "+", args.primer_match_threshold)
p2 = find_primer(bed, segment.reference_end, "-", args.primer_match_threshold)
# p1 = find_primer(bed, segment.reference_start, "+", segment.reference_name, args.primer_match_threshold)
p1 = find_primer(bed=bed, pos=segment.reference_start, direction="+", chrom=segment.reference_name, threshold=args.primer_match_threshold)
# p2 = find_primer(bed, segment.reference_end, "-", segment.reference_name, args.primer_match_threshold)
p2 = find_primer(bed=bed, pos=segment.reference_end, direction="-", chrom=segment.reference_name, threshold=args.primer_match_threshold)

if not p1 or not p2:
if args.verbose:
Expand Down Expand Up @@ -235,6 +237,7 @@ def handle_segment(
if args.report:
# update the report with this alignment segment + primer details
report = {
"chrom": segment.reference_name,
"QueryName": segment.query_name,
"ReferenceStart": segment.reference_start,
"ReferenceEnd": segment.reference_end,
Expand Down Expand Up @@ -342,32 +345,33 @@ def generate_amplicons(bed: list):

amplicon = primer["Primer_ID"].split("_")[1]

amplicons.setdefault(amplicon, {})
amplicons.setdefault(primer["chrom"], {})
amplicons[primer["chrom"]].setdefault(amplicon, {})

if primer["direction"] == "+":
amplicons[amplicon]["p_start"] = primer["start"]
amplicons[amplicon]["start"] = primer["end"] + 1
amplicons[primer["chrom"]][amplicon]["p_start"] = primer["start"]
amplicons[primer["chrom"]][amplicon]["start"] = primer["end"] + 1

elif primer["direction"] == "-":
amplicons[amplicon]["p_end"] = primer["end"]
amplicons[amplicon]["end"] = primer["start"] - 1
amplicons[primer["chrom"]][amplicon]["p_end"] = primer["end"]
amplicons[primer["chrom"]][amplicon]["end"] = primer["start"] - 1

else:
raise ValueError("Primer direction not recognised")
for chrom, amplicons_dict in amplicons.items():
for amplicon in amplicons_dict:
if not all([x in amplicons_dict[amplicon] for x in ["p_start", "p_end"]]):
raise ValueError(f"Primer scheme for amplicon {amplicon} for reference {chrom} is incomplete")

# Check if primer runs accross reference start / end -> circular virus
amplicons_dict[amplicon]["circular"] = (
amplicons_dict[amplicon]["p_start"] > amplicons_dict[amplicon]["p_end"]
)

for amplicon in amplicons:
if not all([x in amplicons[amplicon] for x in ["p_start", "p_end"]]):
raise ValueError(f"Primer scheme for amplicon {amplicon} is incomplete")

# Check if primer runs accross reference start / end -> circular virus
amplicons[amplicon]["circular"] = (
amplicons[amplicon]["p_start"] > amplicons[amplicon]["p_end"]
)

# Calculate amplicon length considering that the "length" may be negative if the genome is circular
amplicons[amplicon]["length"] = abs(
amplicons[amplicon]["p_end"] - amplicons[amplicon]["p_start"]
)
# Calculate amplicon length considering that the "length" may be negative if the genome is circular
amplicons_dict[amplicon]["length"] = abs(
amplicons_dict[amplicon]["p_end"] - amplicons_dict[amplicon]["p_start"]
)

return amplicons

Expand All @@ -392,51 +396,53 @@ def normalise(trimmed_segments: dict, normalise: int, bed: list, verbose: bool =

output_segments = []

mean_depths = {x: 0 for x in amplicons}
# mean_depths = {x: {} for x in amplicons}
mean_depths = {}

for amplicon, segments in trimmed_segments.items():
if amplicon not in amplicons:
raise ValueError(f"Segment {amplicon} not found in primer scheme file")

desired_depth = np.full_like(
(amplicons[amplicon]["length"],), normalise, dtype=int
)
for chrom, amplicon_dict in trimmed_segments.items():
for amplicon, segments in amplicon_dict.items():
if amplicon not in amplicons[chrom]:
raise ValueError(f"Segment {amplicon} not found in primer scheme file")

amplicon_depth = np.zeros((amplicons[amplicon]["length"],), dtype=int)
desired_depth = np.full_like(
(amplicons[chrom][amplicon]["length"],), normalise, dtype=int
)

if not segments:
if verbose:
print(
f"No segments assigned to amplicon {amplicon}, skipping",
file=sys.stderr,
)
continue
amplicon_depth = np.zeros((amplicons[chrom][amplicon]["length"],), dtype=int)

random.shuffle(segments)
if not segments:
if verbose:
print(
f"No segments assigned to amplicon {amplicon}, skipping",
file=sys.stderr,
)
continue

distance = np.mean(np.abs(amplicon_depth - desired_depth))
random.shuffle(segments)

for segment in segments:
test_depths = np.copy(amplicon_depth)
distance = np.mean(np.abs(amplicon_depth - desired_depth))

relative_start = segment.reference_start - amplicons[amplicon]["p_start"]
for segment in segments:
test_depths = np.copy(amplicon_depth)

if relative_start < 0:
relative_start = 0
relative_start = segment.reference_start - amplicons[chrom][amplicon]["p_start"]

relative_end = segment.reference_end - amplicons[amplicon]["p_start"]
if relative_start < 0:
relative_start = 0

test_depths[relative_start:relative_end] += 1
relative_end = segment.reference_end - amplicons[chrom][amplicon]["p_start"]

test_distance = np.mean(np.abs(test_depths - desired_depth))
test_depths[relative_start:relative_end] += 1

if test_distance < distance:
amplicon_depth = test_depths
distance = test_distance
output_segments.append(segment)
test_distance = np.mean(np.abs(test_depths - desired_depth))

mean_depths[amplicon] = np.mean(amplicon_depth)
if test_distance < distance:
amplicon_depth = test_depths
distance = test_distance
output_segments.append(segment)

mean_depths[(chrom, amplicon)] = np.mean(amplicon_depth)

return output_segments, mean_depths


Expand All @@ -449,6 +455,7 @@ def go(args):
if args.report:
reportfh = open(args.report, "w")
report_headers = [
"chrom",
"QueryName",
"ReferenceStart",
"ReferenceEnd",
Expand All @@ -469,6 +476,7 @@ def go(args):
# open the primer scheme and get the pools
bed = read_bed_file(args.bedfile)
pools = set([row["PoolName"] for row in bed])
chroms = set([row["chrom"] for row in bed])
pools.add("unmatched")

# open the input SAM file and process read groups
Expand All @@ -484,7 +492,7 @@ def go(args):
# prepare the alignment outfile
outfile = pysam.AlignmentFile("-", "wh", header=bam_header)

trimmed_segments = {}
trimmed_segments = {x: {} for x in chroms}

# iterate over the alignment segments in the input SAM file
for segment in infile:
Expand All @@ -508,10 +516,10 @@ def go(args):

# unpack the trimming tuple since segment passed trimming
amplicon, trimmed_segment = trimming_tuple
trimmed_segments.setdefault(amplicon, [])
trimmed_segments[trimmed_segment.reference_name].setdefault(amplicon, [])

if trimmed_segment:
trimmed_segments[amplicon].append(trimmed_segment)
trimmed_segments[trimmed_segment.reference_name][amplicon].append(trimmed_segment)

# normalise if requested
if args.normalise:
Expand All @@ -522,9 +530,9 @@ def go(args):
# write mean amplicon depths to file
if args.amp_depth_report:
with open(args.amp_depth_report, "w") as amp_depth_report_fh:
amp_depth_report_fh.write("amplicon\tmean_depth\n")
for amplicon, depth in mean_amp_depths.items():
amp_depth_report_fh.write(f"{amplicon}\t{depth}\n")
amp_depth_report_fh.write("chrom\tamplicon\tmean_depth\n")
for (chrom, amplicon), depth in mean_amp_depths.items():
amp_depth_report_fh.write(f"{chrom}\t{amplicon}\t{depth}\n")

for output_segment in output_segments:
outfile.write(output_segment)
Expand Down Expand Up @@ -554,7 +562,7 @@ def main():
parser.add_argument(
"--primer-match-threshold",
type=int,
default=5,
default=35,
help="Fuzzy match primer positions within this threshold",
)
parser.add_argument("--report", type=str, help="Output report to file")
Expand Down
25 changes: 12 additions & 13 deletions artic/fasta_header.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
from Bio import SeqIO


def fasta_header(fn, header):
fh = open(fn)
rec = list(SeqIO.parse(fh, "fasta"))
if len(rec) > 1:
print("Sorry, this script doesn't support multi-FASTA files!")
raise SystemExit
fh.close()
def fasta_header(args):
with open(args.filename) as fh:
rec = list(SeqIO.parse(fh, "fasta"))

fh = open(fn, "w")
rec[0].id = header
SeqIO.write([rec[0]], fh, "fasta")
fh.close()
with open(args.filename, "w") as fh:
for record in rec:
chrom = record.id
record.id = f"{args.samplename}/{chrom}/ARTIC/{args.caller}"

SeqIO.write(record, fh, "fasta")


def main():
Expand All @@ -22,10 +20,11 @@ def main():
description="Trim alignments from an amplicon scheme."
)
parser.add_argument("filename")
parser.add_argument("header")
parser.add_argument("samplename")
parser.add_argument("caller")

args = parser.parse_args()
fasta_header(args.filename, args.header)
fasta_header(args)


if __name__ == "__main__":
Expand Down
13 changes: 7 additions & 6 deletions artic/minion.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,20 +134,22 @@ def run(parser, args):
normalise_string = ""

cmds.append(
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --remove-incorrect-pairs --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam > {args.sample}.trimmed.rg.bam"
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --remove-incorrect-pairs --min-mapq {args.min_mapq} --report {args.sample}.alignreport.csv < {args.sample}.sorted.bam > {args.sample}.trimmed.rg.sam"
)

cmds.append(
f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.bam -o {args.sample}.trimmed.rg.sorted.bam"
f"samtools sort -T {args.sample} {args.sample}.trimmed.rg.sam -o {args.sample}.trimmed.rg.sorted.bam"
)
cmds.append(f"rm {args.sample}.trimmed.rg.sam")

cmds.append(
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} --remove-incorrect-pairs --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam > {args.sample}.primertrimmed.rg.bam"
f"align_trim {normalise_string} {bed} --primer-match-threshold {args.primer_match_threshold} --min-mapq {args.min_mapq} --remove-incorrect-pairs --trim-primers --report {args.sample}.alignreport.csv --amp-depth-report {args.sample}.amplicon_depths.tsv < {args.sample}.sorted.bam > {args.sample}.primertrimmed.rg.sam"
)

cmds.append(
f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.bam -o {args.sample}.primertrimmed.rg.sorted.bam"
f"samtools sort -T {args.sample} {args.sample}.primertrimmed.rg.sam -o {args.sample}.primertrimmed.rg.sorted.bam"
)
cmds.append(f"rm {args.sample}.primertrimmed.rg.sam")

cmds.append(f"samtools index {args.sample}.trimmed.rg.sorted.bam")
cmds.append(f"samtools index {args.sample}.primertrimmed.rg.sorted.bam")
Expand Down Expand Up @@ -222,9 +224,8 @@ def run(parser, args):

# 11) apply the header to the consensus sequence and run alignment against the reference sequence
caller = "clair3"
fasta_header = f"{args.sample}/ARTIC/{caller}"
cmds.append(
'artic_fasta_header %s.consensus.fasta "%s"' % (args.sample, fasta_header)
f"artic_fasta_header {args.sample}.consensus.fasta {args.sample} {caller}"
)

if args.align_consensus:
Expand Down
8 changes: 4 additions & 4 deletions artic/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,12 +359,12 @@ def read_bed_file(fn):
for _, row in primers.iterrows():
scheme_name, primer_id, direction, primer_n = row["Primer_ID"].split("_")

if (primer_id, direction) not in canonical_primers:
canonical_primers[(primer_id, direction)] = row.to_dict()
if (row["chrom"], primer_id, direction) not in canonical_primers:
canonical_primers[(row["chrom"], primer_id, direction)] = row.to_dict()
continue

canonical_primers[(primer_id, direction)] = merge_sites(
canonical_primers[(primer_id, direction)], row
canonical_primers[(row["chrom"], primer_id, direction)] = merge_sites(
canonical_primers[(row["chrom"], primer_id, direction)], row
)

# return the bedFile as a list
Expand Down
Loading