Merge pull request #18 from andersen-lab/ambiguous-bases

mariaelf97 · web-flow · commit 4fd6a347a442 · 2025-07-17T13:49:36.000-07:00
Ambiguous bases
diff --git a/README.md b/README.md
@@ -68,4 +68,4 @@ To learn more about how to adjust other parameters use `bygul simulate-proportio
 #### Simulated reads output
 Simulated reads from all samples are located in `provided_output_path/reads.fastq`
 #### Information about amplicon dropouts
-In order to find more about amplicon dropouts, please refer to `provided_output_path/sample_name/amplicon_stats.csv` file. Please note that primer_seq_x and primer_seq_y define the left and right primer sequence whereas left_mismatch_map and right_mismatch_map shows the actual sequence found in the sample for a better comparison of mismatching bases in the primer sequence.
+To learn more about amplicon dropouts, refer to the `provided_output_path/sample_name/amplicon_stats.csv` file. Note that primer_seq_x and primer_seq_y are the left and right primer sequences, whereas left_mismatch_map and right_mismatch_map show the actual sequences found in the sample, making it easier to compare mismatching bases against the primer sequences. Additionally, the ambiguous_bases value is True if any ambiguous bases are present in the primers or in the matching sequences.
diff --git a/bygul/_cli.py b/bygul/_cli.py
@@ -5,6 +5,7 @@
 import numpy as np
 import sys
 import shutil
+import warnings
 
 
 @click.group(context_settings={"show_default": True})
@@ -149,7 +150,8 @@ def simulate_proportions(
         merge_fastq_files,
         find_closest_primer_match,
         generate_random_values,
-        validate_simulator_options
+        validate_simulator_options,
+        assess_genome_quality_from_fasta
     )
     ctx = click.get_current_context()
     params_source = {
@@ -175,9 +177,32 @@ def simulate_proportions(
                     for fp in str(genomes).split(",")]
     sample_paths = str(genomes).split(",")
 
+    for genome in sample_paths:
+        report = assess_genome_quality_from_fasta(genome)
+        num_contigs = len(report['contig_lengths'])
+        num_ambiguous = report['total_ambiguous_bases']
+
+        # Warnings handled here
+        if num_ambiguous > 0:
+            warnings.warn(f"{genome}: Contains {num_ambiguous}"
+                          "ambiguous base(s)."
+                          "Please choose a better quality genome..")
+
+        if num_contigs > 1:
+            warnings.warn(f"{genome}: Contains {num_contigs} contigs."
+                          "Does your organism have more than one chromosome?"
+                          "Are you providing high quality assemblies?")
+
+        # Print results
+        print(f"\nGenome: {genome}")
+        print(f"  Total ambiguous bases: {num_ambiguous}")
+        print("  Contig lengths:")
+        for contig, length in report['contig_lengths'].items():
+            print(f"    {contig}: {length}")
+
     if proportions == "NA":
         if len(sample_names) == 1:
-            print("Only one sample provided."
+            print("Only one sample provided. "
                   "Using 1.0 as the sample proportion.")
             proportions = [1]
         else:
diff --git a/bygul/utils.py b/bygul/utils.py
@@ -9,6 +9,37 @@
 import numpy as np
 import regex as re
 import click
+import warnings
+
+
+def assess_genome_quality_from_fasta(fasta_path):
+    """
+    Parse a FASTA genome file and summarise its quality by:
+    - Counting IUPAC ambiguity codes (R/Y/S/W/K/M/B/D/H/V/N), any case.
+    - Recording the length of each record (contig), keyed by record id.
+
+    Parameters:
+        fasta_path (str): Path to the FASTA file.
+
+    Returns:
+        dict: {
+            'total_ambiguous_bases': int, summed over all records,
+            'contig_lengths': dict of {contig_id: length}
+        }
+    """
+    ambiguous_bases = {'R', 'Y', 'S', 'W', 'K', 'M', 'B', 'D', 'H', 'V', 'N'}
+    total_ambiguous = 0
+    contig_lengths = {}
+
+    for record in SeqIO.parse(fasta_path, "fasta"):
+        seq = str(record.seq).upper()  # upper-case so matching ignores case
+        contig_lengths[record.id] = len(seq)
+        total_ambiguous += sum(1 for base in seq if base in ambiguous_bases)
+
+    return {
+        'total_ambiguous_bases': total_ambiguous,
+        'contig_lengths': contig_lengths
+    }
 
 
 def validate_simulator_options(simulator, params_source):
@@ -173,10 +204,11 @@ def create_valid_primer_combinations(df):
 
     # Convert collected data to DataFrame efficiently
     valid_primers_df = pd.DataFrame.from_records(valid_primers)
-
     # Merge with original DataFrame to include additional columns
-    all_amplicons = valid_primers_df.merge(
-        df[["amplicon_number", "primer_seq_x", "primer_seq_y", "strand"]],
+    df = df[["amplicon_number", "primer_seq_x",
+             "primer_seq_y", "ambiguous_bases"]]
+    all_amplicons = df.merge(
+        valid_primers_df,
         how="left",
         on="amplicon_number",
     )
@@ -363,23 +395,27 @@ def find_closest_primer_match(df, reference_seq, maxmismatch):
     For each row in df, find all left/right primer match positions (as lists),
     allowing up to `maxmismatch` mismatches. Ensures both primers are found
     on the same strand. Returns original df columns + matches, mismatch maps,
-    and strand.
+    strand, and whether primers contain ambiguous bases (IUPAC codes).
     """
 
+    # Extended ambiguity-aware mismatch display
     def mismatch_alignment(primer, matched_seq):
         """
         Returns matched sequence with mismatches shown in parentheses.
-        Example:
-            Primer : AGCT
-            Match  : AGTT
-            Output : AG(T)T
+        Also returns a flag if primer contains any ambiguous base.
         """
+        ambiguous_bases = {'R', 'Y', 'S', 'W',
+                           'K', 'M', 'B', 'D',
+                           'H', 'V', 'N'}
+        has_ambiguity = any(base in ambiguous_bases
+                            for base in matched_seq.upper())
         aligned = []
         for p, m in zip(primer.upper(), matched_seq.upper()):
             aligned.append(m if p == m else f"({m})")
-        return "".join(aligned)
+        return "".join(aligned), has_ambiguity
 
     results = []
+    warned = False
 
     for _, row in df.iterrows():
         primer_left = row["primer_seq_x"]
@@ -388,7 +424,7 @@ def mismatch_alignment(primer, matched_seq):
         pattern_left = f"({primer_left}){{s<={maxmismatch}}}"
         pattern_right = f"({primer_right}){{s<={maxmismatch}}}"
 
-        # Forward strand
+        # Forward strand search
         left_fwd = [m.start() for m in re.finditer(pattern_left,
                                                    reference_seq,
                                                    flags=re.IGNORECASE,
@@ -403,12 +439,15 @@ def mismatch_alignment(primer, matched_seq):
         right_fwd_actual = [reference_seq[pos:pos+len(primer_right)]
                             for pos in right_fwd]
 
-        left_fwd_mismatch_map = [mismatch_alignment(primer_left, seq)
-                                 for seq in left_fwd_actual]
-        right_fwd_mismatch_map = [mismatch_alignment(primer_right, seq)
-                                  for seq in right_fwd_actual]
+        left_fwd_mismatch_map, left_fwd_has_ambig = zip(*[
+            mismatch_alignment(primer_left, seq) for seq in left_fwd_actual
+        ]) if left_fwd_actual else ([], [])
+
+        right_fwd_mismatch_map, right_fwd_has_ambig = zip(*[
+            mismatch_alignment(primer_right, seq) for seq in right_fwd_actual
+        ]) if right_fwd_actual else ([], [])
 
-        # Reverse strand
+        # Reverse strand search
         ref_rev = str(Seq(reference_seq).reverse_complement())
         left_rev = [m.start() for m in re.finditer(pattern_left,
                                                    ref_rev,
@@ -424,42 +463,58 @@ def mismatch_alignment(primer, matched_seq):
         right_rev_actual = [ref_rev[pos:pos+len(primer_right)]
                             for pos in right_rev]
 
-        left_rev_mismatch_map = [mismatch_alignment(primer_left, seq)
-                                 for seq in left_rev_actual]
-        right_rev_mismatch_map = [mismatch_alignment(primer_right, seq)
-                                  for seq in right_rev_actual]
-
-        result_row = row.to_dict()  # Preserve original row data
+        left_rev_mismatch_map, left_rev_has_ambig = zip(*[
+            mismatch_alignment(primer_left, seq) for seq in left_rev_actual
+        ]) if left_rev_actual else ([], [])
+
+        right_rev_mismatch_map, right_rev_has_ambig = zip(*[
+            mismatch_alignment(primer_right, seq) for seq in right_rev_actual
+        ]) if right_rev_actual else ([], [])
+
+        # Check if any ambiguous bases are in the primers or alignments
+        has_ambiguous_base = any([
+            any(b in primer_left.upper() for b in "RYSWKMBDHVN"),
+            any(b in primer_right.upper() for b in "RYSWKMBDHVN"),
+            any(left_fwd_has_ambig) if left_fwd_has_ambig else False,
+            any(right_fwd_has_ambig) if right_fwd_has_ambig else False,
+            any(left_rev_has_ambig) if left_rev_has_ambig else False,
+            any(right_rev_has_ambig) if right_rev_has_ambig else False,
+        ])
+        if has_ambiguous_base and not warned:
+            warnings.warn("One or more primers contain ambiguous"
+                          "bases (e.g., N, R, Y, etc)."
+                          "Matches may be unreliable.")
+            warned = True
+
+        result_row = row.to_dict()
+        result_row["ambiguous_bases"] = has_ambiguous_base
 
         if left_fwd and right_fwd:
             result_row.update({
                 "left_primer_loc": left_fwd,
                 "right_primer_loc": right_fwd,
                 "left_seq_actual": left_fwd_actual,
                 "right_seq_actual": right_fwd_actual,
-                "left_mismatch_map": left_fwd_mismatch_map,
-                "right_mismatch_map": right_fwd_mismatch_map,
-                "strand": "forward"
+                "left_mismatch_map": list(left_fwd_mismatch_map),
+                "right_mismatch_map": list(right_fwd_mismatch_map),
             })
         elif left_rev and right_rev:
             result_row.update({
                 "left_primer_loc": left_rev,
                 "right_primer_loc": right_rev,
                 "left_seq_actual": left_rev_actual,
                 "right_seq_actual": right_rev_actual,
-                "left_mismatch_map": left_rev_mismatch_map,
-                "right_mismatch_map": right_rev_mismatch_map,
-                "strand": "reverse"
+                "left_mismatch_map": list(left_rev_mismatch_map),
+                "right_mismatch_map": list(right_rev_mismatch_map),
             })
         else:
             result_row.update({
                 "left_primer_loc": [],
                 "right_primer_loc": [],
-                "left_seq_actual": "none",
-                "right_seq_actual": "none",
-                "left_mismatch_map": "none",
-                "right_mismatch_map": "none",
-                "strand": "none"
+                "left_seq_actual": [],
+                "right_seq_actual": [],
+                "left_mismatch_map": [],
+                "right_mismatch_map": []
             })
 
         results.append(result_row)
diff --git a/setup.py b/setup.py
@@ -16,7 +16,7 @@
 
 setup(
     name="bygul",
-    version="V1.0.5",
+    version="V1.0.6",
     packages=find_packages(include=['bygul']),
     author="Maryam Ahmadi Jeshvaghane",
     license='BSD 2-Clause',