Skip to content

Commit 4fd6a34

Browse files
authored
Merge pull request #18 from andersen-lab/ambiguous-bases
Ambiguous bases
2 parents cb2a053 + e9d2f77 commit 4fd6a34

File tree

4 files changed

+116
-36
lines changed

4 files changed

+116
-36
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,4 +68,4 @@ To learn more about how to adjust other parameters use `bygul simulate-proportio
6868
#### Simulated reads output
6969
Simulated reads from all samples are located in `provided_output_path/reads.fastq`
7070
#### Information about amplicon dropouts
71-
In order to find more about amplicon dropouts, please refer to `provided_output_path/sample_name/amplicon_stats.csv` file. Please note that primer_seq_x and primer_seq_y define the left and right primer sequence whereas left_mismatch_map and right_mismatch_map shows the actual sequence found in the sample for a better comparison of mismatching bases in the primer sequence.
71+
In order to find more about amplicon dropouts, please refer to `provided_output_path/sample_name/amplicon_stats.csv` file. Please note that primer_seq_x and primer_seq_y define the left and right primer sequence whereas left_mismatch_map and right_mismatch_map shows the actual sequence found in the sample for a better comparison of mismatching bases in the primer sequence. Additionally, if there are any ambiguous bases present in the matching sequence, the ambiguous_bases value returns true.

bygul/_cli.py

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import numpy as np
66
import sys
77
import shutil
8+
import warnings
89

910

1011
@click.group(context_settings={"show_default": True})
@@ -149,7 +150,8 @@ def simulate_proportions(
149150
merge_fastq_files,
150151
find_closest_primer_match,
151152
generate_random_values,
152-
validate_simulator_options
153+
validate_simulator_options,
154+
assess_genome_quality_from_fasta
153155
)
154156
ctx = click.get_current_context()
155157
params_source = {
@@ -175,9 +177,32 @@ def simulate_proportions(
175177
for fp in str(genomes).split(",")]
176178
sample_paths = str(genomes).split(",")
177179

180+
for genome in sample_paths:
181+
report = assess_genome_quality_from_fasta(genome)
182+
num_contigs = len(report['contig_lengths'])
183+
num_ambiguous = report['total_ambiguous_bases']
184+
185+
# Warnings handled here
186+
if num_ambiguous > 0:
187+
warnings.warn(f"{genome}: Contains {num_ambiguous}"
188+
"ambiguous base(s)."
189+
"Please choose a better quality genome..")
190+
191+
if num_contigs > 1:
192+
warnings.warn(f"{genome}: Contains {num_contigs} contigs."
193+
"Does your organism have more than one chromosome?"
194+
"Are you providing high quality assemblies?")
195+
196+
# Print results
197+
print(f"\nGenome: {genome}")
198+
print(f" Total ambiguous bases: {num_ambiguous}")
199+
print(" Contig lengths:")
200+
for contig, length in report['contig_lengths'].items():
201+
print(f" {contig}: {length}")
202+
178203
if proportions == "NA":
179204
if len(sample_names) == 1:
180-
print("Only one sample provided."
205+
print("Only one sample provided. "
181206
"Using 1.0 as the sample proportion.")
182207
proportions = [1]
183208
else:

bygul/utils.py

Lines changed: 87 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,37 @@
99
import numpy as np
1010
import regex as re
1111
import click
12+
import warnings
13+
14+
15+
def assess_genome_quality_from_fasta(fasta_path):
16+
"""
17+
Parses a FASTA genome file and assesses quality by:
18+
- Counting ambiguous (non-ACGT) bases.
19+
- Reporting the length of each contig.
20+
21+
Parameters:
22+
fasta_path (str): Path to the FASTA file.
23+
24+
Returns:
25+
dict: {
26+
'total_ambiguous_bases': int,
27+
'contig_lengths': dict of {contig_id: length}
28+
}
29+
"""
30+
ambiguous_bases = {'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'N'}
31+
total_ambiguous = 0
32+
contig_lengths = {}
33+
34+
for record in SeqIO.parse(fasta_path, "fasta"):
35+
seq = str(record.seq).upper()
36+
contig_lengths[record.id] = len(seq)
37+
total_ambiguous += sum(1 for base in seq if base in ambiguous_bases)
38+
39+
return {
40+
'total_ambiguous_bases': total_ambiguous,
41+
'contig_lengths': contig_lengths
42+
}
1243

1344

1445
def validate_simulator_options(simulator, params_source):
@@ -173,10 +204,11 @@ def create_valid_primer_combinations(df):
173204

174205
# Convert collected data to DataFrame efficiently
175206
valid_primers_df = pd.DataFrame.from_records(valid_primers)
176-
177207
# Merge with original DataFrame to include additional columns
178-
all_amplicons = valid_primers_df.merge(
179-
df[["amplicon_number", "primer_seq_x", "primer_seq_y", "strand"]],
208+
df = df[["amplicon_number", "primer_seq_x",
209+
"primer_seq_y", "ambiguous_bases"]]
210+
all_amplicons = df.merge(
211+
valid_primers_df,
180212
how="left",
181213
on="amplicon_number",
182214
)
@@ -363,23 +395,27 @@ def find_closest_primer_match(df, reference_seq, maxmismatch):
363395
For each row in df, find all left/right primer match positions (as lists),
364396
allowing up to `maxmismatch` mismatches. Ensures both primers are found
365397
on the same strand. Returns original df columns + matches, mismatch maps,
366-
and strand.
398+
strand, and whether primers contain ambiguous bases (IUPAC codes).
367399
"""
368400

401+
# Extended ambiguity-aware mismatch display
369402
def mismatch_alignment(primer, matched_seq):
370403
"""
371404
Returns matched sequence with mismatches shown in parentheses.
372-
Example:
373-
Primer : AGCT
374-
Match : AGTT
375-
Output : AG(T)T
405+
Also returns a flag if primer contains any ambiguous base.
376406
"""
407+
ambiguous_bases = {'R', 'Y', 'S', 'W',
408+
'K', 'M', 'B', 'D',
409+
'H', 'V', 'N'}
410+
has_ambiguity = any(base in ambiguous_bases
411+
for base in matched_seq.upper())
377412
aligned = []
378413
for p, m in zip(primer.upper(), matched_seq.upper()):
379414
aligned.append(m if p == m else f"({m})")
380-
return "".join(aligned)
415+
return "".join(aligned), has_ambiguity
381416

382417
results = []
418+
warned = False
383419

384420
for _, row in df.iterrows():
385421
primer_left = row["primer_seq_x"]
@@ -388,7 +424,7 @@ def mismatch_alignment(primer, matched_seq):
388424
pattern_left = f"({primer_left}){{s<={maxmismatch}}}"
389425
pattern_right = f"({primer_right}){{s<={maxmismatch}}}"
390426

391-
# Forward strand
427+
# Forward strand search
392428
left_fwd = [m.start() for m in re.finditer(pattern_left,
393429
reference_seq,
394430
flags=re.IGNORECASE,
@@ -403,12 +439,15 @@ def mismatch_alignment(primer, matched_seq):
403439
right_fwd_actual = [reference_seq[pos:pos+len(primer_right)]
404440
for pos in right_fwd]
405441

406-
left_fwd_mismatch_map = [mismatch_alignment(primer_left, seq)
407-
for seq in left_fwd_actual]
408-
right_fwd_mismatch_map = [mismatch_alignment(primer_right, seq)
409-
for seq in right_fwd_actual]
442+
left_fwd_mismatch_map, left_fwd_has_ambig = zip(*[
443+
mismatch_alignment(primer_left, seq) for seq in left_fwd_actual
444+
]) if left_fwd_actual else ([], [])
445+
446+
right_fwd_mismatch_map, right_fwd_has_ambig = zip(*[
447+
mismatch_alignment(primer_right, seq) for seq in right_fwd_actual
448+
]) if right_fwd_actual else ([], [])
410449

411-
# Reverse strand
450+
# Reverse strand search
412451
ref_rev = str(Seq(reference_seq).reverse_complement())
413452
left_rev = [m.start() for m in re.finditer(pattern_left,
414453
ref_rev,
@@ -424,42 +463,58 @@ def mismatch_alignment(primer, matched_seq):
424463
right_rev_actual = [ref_rev[pos:pos+len(primer_right)]
425464
for pos in right_rev]
426465

427-
left_rev_mismatch_map = [mismatch_alignment(primer_left, seq)
428-
for seq in left_rev_actual]
429-
right_rev_mismatch_map = [mismatch_alignment(primer_right, seq)
430-
for seq in right_rev_actual]
431-
432-
result_row = row.to_dict() # Preserve original row data
466+
left_rev_mismatch_map, left_rev_has_ambig = zip(*[
467+
mismatch_alignment(primer_left, seq) for seq in left_rev_actual
468+
]) if left_rev_actual else ([], [])
469+
470+
right_rev_mismatch_map, right_rev_has_ambig = zip(*[
471+
mismatch_alignment(primer_right, seq) for seq in right_rev_actual
472+
]) if right_rev_actual else ([], [])
473+
474+
# Check if any ambiguous bases are in the primers or alignments
475+
has_ambiguous_base = any([
476+
any(b in primer_left.upper() for b in "RYSWKMBDHVN"),
477+
any(b in primer_right.upper() for b in "RYSWKMBDHVN"),
478+
any(left_fwd_has_ambig) if left_fwd_has_ambig else False,
479+
any(right_fwd_has_ambig) if right_fwd_has_ambig else False,
480+
any(left_rev_has_ambig) if left_rev_has_ambig else False,
481+
any(right_rev_has_ambig) if right_rev_has_ambig else False,
482+
])
483+
if has_ambiguous_base and not warned:
484+
warnings.warn("One or more primers contain ambiguous"
485+
"bases (e.g., N, R, Y, etc)."
486+
"Matches may be unreliable.")
487+
warned = True
488+
489+
result_row = row.to_dict()
490+
result_row["ambiguous_bases"] = has_ambiguous_base
433491

434492
if left_fwd and right_fwd:
435493
result_row.update({
436494
"left_primer_loc": left_fwd,
437495
"right_primer_loc": right_fwd,
438496
"left_seq_actual": left_fwd_actual,
439497
"right_seq_actual": right_fwd_actual,
440-
"left_mismatch_map": left_fwd_mismatch_map,
441-
"right_mismatch_map": right_fwd_mismatch_map,
442-
"strand": "forward"
498+
"left_mismatch_map": list(left_fwd_mismatch_map),
499+
"right_mismatch_map": list(right_fwd_mismatch_map),
443500
})
444501
elif left_rev and right_rev:
445502
result_row.update({
446503
"left_primer_loc": left_rev,
447504
"right_primer_loc": right_rev,
448505
"left_seq_actual": left_rev_actual,
449506
"right_seq_actual": right_rev_actual,
450-
"left_mismatch_map": left_rev_mismatch_map,
451-
"right_mismatch_map": right_rev_mismatch_map,
452-
"strand": "reverse"
507+
"left_mismatch_map": list(left_rev_mismatch_map),
508+
"right_mismatch_map": list(right_rev_mismatch_map),
453509
})
454510
else:
455511
result_row.update({
456512
"left_primer_loc": [],
457513
"right_primer_loc": [],
458-
"left_seq_actual": "none",
459-
"right_seq_actual": "none",
460-
"left_mismatch_map": "none",
461-
"right_mismatch_map": "none",
462-
"strand": "none"
514+
"left_seq_actual": [],
515+
"right_seq_actual": [],
516+
"left_mismatch_map": [],
517+
"right_mismatch_map": []
463518
})
464519

465520
results.append(result_row)

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
setup(
1818
name="bygul",
19-
version="V1.0.5",
19+
version="V1.0.6",
2020
packages=find_packages(include=['bygul']),
2121
author="Maryam Ahmadi Jeshvaghane",
2222
license='BSD 2-Clause',

0 commit comments

Comments
 (0)