99import numpy as np
1010import regex as re
1111import click
12+ import warnings
13+
14+
15+ def assess_genome_quality_from_fasta (fasta_path ):
16+ """
17+ Parses a FASTA genome file and assesses quality by:
18+ - Counting ambiguous (non-ACGT) bases.
19+ - Reporting the length of each contig.
20+
21+ Parameters:
22+ fasta_path (str): Path to the FASTA file.
23+
24+ Returns:
25+ dict: {
26+ 'total_ambiguous_bases': int,
27+ 'contig_lengths': dict of {contig_id: length}
28+ }
29+ """
30+ ambiguous_bases = {'R' , 'Y' , 'S' , 'W' , 'K' , 'M' , 'B' , 'D' , 'H' , 'V' , 'N' }
31+ total_ambiguous = 0
32+ contig_lengths = {}
33+
34+ for record in SeqIO .parse (fasta_path , "fasta" ):
35+ seq = str (record .seq ).upper ()
36+ contig_lengths [record .id ] = len (seq )
37+ total_ambiguous += sum (1 for base in seq if base in ambiguous_bases )
38+
39+ return {
40+ 'total_ambiguous_bases' : total_ambiguous ,
41+ 'contig_lengths' : contig_lengths
42+ }
1243
1344
1445def validate_simulator_options (simulator , params_source ):
@@ -173,10 +204,11 @@ def create_valid_primer_combinations(df):
173204
174205 # Convert collected data to DataFrame efficiently
175206 valid_primers_df = pd .DataFrame .from_records (valid_primers )
176-
177207 # Merge with original DataFrame to include additional columns
178- all_amplicons = valid_primers_df .merge (
179- df [["amplicon_number" , "primer_seq_x" , "primer_seq_y" , "strand" ]],
208+ df = df [["amplicon_number" , "primer_seq_x" ,
209+ "primer_seq_y" , "ambiguous_bases" ]]
210+ all_amplicons = df .merge (
211+ valid_primers_df ,
180212 how = "left" ,
181213 on = "amplicon_number" ,
182214 )
@@ -363,23 +395,27 @@ def find_closest_primer_match(df, reference_seq, maxmismatch):
363395 For each row in df, find all left/right primer match positions (as lists),
364396 allowing up to `maxmismatch` mismatches. Ensures both primers are found
365397 on the same strand. Returns original df columns + matches, mismatch maps,
366- and strand .
398+ strand, and whether primers contain ambiguous bases (IUPAC codes) .
367399 """
368400
401+ # Extended ambiguity-aware mismatch display
369402 def mismatch_alignment (primer , matched_seq ):
370403 """
371404 Returns matched sequence with mismatches shown in parentheses.
372- Example:
373- Primer : AGCT
374- Match : AGTT
375- Output : AG(T)T
405+ Also returns a flag if primer contains any ambiguous base.
376406 """
407+ ambiguous_bases = {'R' , 'Y' , 'S' , 'W' ,
408+ 'K' , 'M' , 'B' , 'D' ,
409+ 'H' , 'V' , 'N' }
410+ has_ambiguity = any (base in ambiguous_bases
411+ for base in matched_seq .upper ())
377412 aligned = []
378413 for p , m in zip (primer .upper (), matched_seq .upper ()):
379414 aligned .append (m if p == m else f"({ m } )" )
380- return "" .join (aligned )
415+ return "" .join (aligned ), has_ambiguity
381416
382417 results = []
418+ warned = False
383419
384420 for _ , row in df .iterrows ():
385421 primer_left = row ["primer_seq_x" ]
@@ -388,7 +424,7 @@ def mismatch_alignment(primer, matched_seq):
388424 pattern_left = f"({ primer_left } ){{s<={ maxmismatch } }}"
389425 pattern_right = f"({ primer_right } ){{s<={ maxmismatch } }}"
390426
391- # Forward strand
427+ # Forward strand search
392428 left_fwd = [m .start () for m in re .finditer (pattern_left ,
393429 reference_seq ,
394430 flags = re .IGNORECASE ,
@@ -403,12 +439,15 @@ def mismatch_alignment(primer, matched_seq):
403439 right_fwd_actual = [reference_seq [pos :pos + len (primer_right )]
404440 for pos in right_fwd ]
405441
406- left_fwd_mismatch_map = [mismatch_alignment (primer_left , seq )
407- for seq in left_fwd_actual ]
408- right_fwd_mismatch_map = [mismatch_alignment (primer_right , seq )
409- for seq in right_fwd_actual ]
442+ left_fwd_mismatch_map , left_fwd_has_ambig = zip (* [
443+ mismatch_alignment (primer_left , seq ) for seq in left_fwd_actual
444+ ]) if left_fwd_actual else ([], [])
445+
446+ right_fwd_mismatch_map , right_fwd_has_ambig = zip (* [
447+ mismatch_alignment (primer_right , seq ) for seq in right_fwd_actual
448+ ]) if right_fwd_actual else ([], [])
410449
411- # Reverse strand
450+ # Reverse strand search
412451 ref_rev = str (Seq (reference_seq ).reverse_complement ())
413452 left_rev = [m .start () for m in re .finditer (pattern_left ,
414453 ref_rev ,
@@ -424,42 +463,58 @@ def mismatch_alignment(primer, matched_seq):
424463 right_rev_actual = [ref_rev [pos :pos + len (primer_right )]
425464 for pos in right_rev ]
426465
427- left_rev_mismatch_map = [mismatch_alignment (primer_left , seq )
428- for seq in left_rev_actual ]
429- right_rev_mismatch_map = [mismatch_alignment (primer_right , seq )
430- for seq in right_rev_actual ]
431-
432- result_row = row .to_dict () # Preserve original row data
466+ left_rev_mismatch_map , left_rev_has_ambig = zip (* [
467+ mismatch_alignment (primer_left , seq ) for seq in left_rev_actual
468+ ]) if left_rev_actual else ([], [])
469+
470+ right_rev_mismatch_map , right_rev_has_ambig = zip (* [
471+ mismatch_alignment (primer_right , seq ) for seq in right_rev_actual
472+ ]) if right_rev_actual else ([], [])
473+
474+ # Check if any ambiguous bases are in the primers or alignments
475+ has_ambiguous_base = any ([
476+ any (b in primer_left .upper () for b in "RYSWKMBDHVN" ),
477+ any (b in primer_right .upper () for b in "RYSWKMBDHVN" ),
478+ any (left_fwd_has_ambig ) if left_fwd_has_ambig else False ,
479+ any (right_fwd_has_ambig ) if right_fwd_has_ambig else False ,
480+ any (left_rev_has_ambig ) if left_rev_has_ambig else False ,
481+ any (right_rev_has_ambig ) if right_rev_has_ambig else False ,
482+ ])
483+ if has_ambiguous_base and not warned :
484+ warnings .warn ("One or more primers contain ambiguous"
485+ "bases (e.g., N, R, Y, etc)."
486+ "Matches may be unreliable." )
487+ warned = True
488+
489+ result_row = row .to_dict ()
490+ result_row ["ambiguous_bases" ] = has_ambiguous_base
433491
434492 if left_fwd and right_fwd :
435493 result_row .update ({
436494 "left_primer_loc" : left_fwd ,
437495 "right_primer_loc" : right_fwd ,
438496 "left_seq_actual" : left_fwd_actual ,
439497 "right_seq_actual" : right_fwd_actual ,
440- "left_mismatch_map" : left_fwd_mismatch_map ,
441- "right_mismatch_map" : right_fwd_mismatch_map ,
442- "strand" : "forward"
498+ "left_mismatch_map" : list (left_fwd_mismatch_map ),
499+ "right_mismatch_map" : list (right_fwd_mismatch_map ),
443500 })
444501 elif left_rev and right_rev :
445502 result_row .update ({
446503 "left_primer_loc" : left_rev ,
447504 "right_primer_loc" : right_rev ,
448505 "left_seq_actual" : left_rev_actual ,
449506 "right_seq_actual" : right_rev_actual ,
450- "left_mismatch_map" : left_rev_mismatch_map ,
451- "right_mismatch_map" : right_rev_mismatch_map ,
452- "strand" : "reverse"
507+ "left_mismatch_map" : list (left_rev_mismatch_map ),
508+ "right_mismatch_map" : list (right_rev_mismatch_map ),
453509 })
454510 else :
455511 result_row .update ({
456512 "left_primer_loc" : [],
457513 "right_primer_loc" : [],
458- "left_seq_actual" : "none" ,
459- "right_seq_actual" : "none" ,
460- "left_mismatch_map" : "none" ,
461- "right_mismatch_map" : "none" ,
462- "strand" : "none"
514+ "left_seq_actual" : [],
515+ "right_seq_actual" : [],
516+ "left_mismatch_map" : [],
517+ "right_mismatch_map" : []
463518 })
464519
465520 results .append (result_row )
0 commit comments