diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index c3c4ef9e2..3b5ef13f0 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -4,69 +4,216 @@
 
 namespace Omics.BioPolymer
 {
+    /// <summary>
+    /// Represents a contiguous sequence variation on a 1-based, inclusive coordinate system.
+    /// A variation spans <see cref="OneBasedBeginPosition"/>.. <see cref="OneBasedEndPosition"/> in the parent sequence,
+    /// replaces the <see cref="OriginalSequence"/> (which may be empty for an insertion) with <see cref="VariantSequence"/>
+    /// (which may be empty for a deletion), and can optionally carry site-specific <see cref="OneBasedModifications"/>.
+    /// When available, a parsed <see cref="VariantCallFormat"/> is attached via <see cref="VariantCallFormatDataString"/>.
+    /// 
+    /// Typical interpretations:
+    /// - Substitution: non-empty <see cref="OriginalSequence"/> and non-empty <see cref="VariantSequence"/> of equal length.
+    /// - Insertion: empty <see cref="OriginalSequence"/> and non-empty <see cref="VariantSequence"/>.
+    /// - Deletion: non-empty <see cref="OriginalSequence"/> and empty <see cref="VariantSequence"/>.
+    /// </summary>
     public class SequenceVariation
     {
         /// <summary>
-        /// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions.
+        /// Create a variation with an explicit VCF object.
         /// </summary>
-        /// <param name="oneBasedBeginPosition"></param>
-        /// <param name="oneBasedEndPosition"></param>
-        /// <param name="originalSequence"></param>
-        /// <param name="variantSequence"></param>
-        /// <param name="oneBasedModifications"></param>
+        /// <param name="oneBasedBeginPosition">
+        /// 1-based, inclusive start position in the parent sequence where the variation begins.
+        /// Must be &gt;= 1. See <see cref="AreValid"/> for validity conditions.
+        /// </param>
+        /// <param name="oneBasedEndPosition">
+        /// 1-based, inclusive end position in the parent sequence where the variation ends.
+        /// Must satisfy <c>oneBasedEndPosition &gt;= oneBasedBeginPosition</c>.
+        /// </param>
+        /// <param name="originalSequence">
+        /// Reference subsequence being replaced. Null is coerced to an empty string.
+        /// Empty string typically indicates an insertion at <paramref name="oneBasedBeginPosition"/>.
+        /// </param>
+        /// <param name="variantSequence">
+        /// Alternate subsequence to insert in place of <paramref name="originalSequence"/>.
+        /// Null is coerced to an empty string. Empty string typically indicates a deletion.
+        /// </param>
+        /// <param name="description">
+        /// Free-form description of the variation. Often the original VCF line or human-readable note.
+        /// </param>
+        /// <param name="variantCallFormat">
+        /// Parsed VCF wrapper for the originating record. Used for downstream analysis of genotype/allele metadata.
+        /// </param>
+        /// <param name="oneBasedModifications">
+        /// Optional mapping from absolute 1-based residue positions to one or more <see cref="Modification"/> objects
+        /// applied at that position (in the same coordinate system as <paramref name="oneBasedBeginPosition"/>/<paramref name="oneBasedEndPosition"/>).
+        /// If null, an empty dictionary is created.
+        /// </param>
+        public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, VariantCallFormat variantCallFormat, Dictionary<int, List<Modification>>? oneBasedModifications = null)
+        {
+            OneBasedBeginPosition = oneBasedBeginPosition;
+            OneBasedEndPosition = oneBasedEndPosition;
+            OriginalSequence = originalSequence ?? "";
+            VariantSequence = variantSequence ?? "";
+            Description = description;
+            VariantCallFormatDataString = variantCallFormat;
+            OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
+        }
+
+        /// <summary>
+        /// Create a variation by providing a raw VCF line (string representation) which will be parsed into a <see cref="VariantCallFormat"/>.
+        /// </summary>
+        /// <param name="oneBasedBeginPosition">
+        /// 1-based, inclusive start position in the parent sequence where the variation begins.
+        /// Must be &gt;= 1. See <see cref="AreValid"/> for validity conditions.
+        /// </param>
+        /// <param name="oneBasedEndPosition">
+        /// 1-based, inclusive end position in the parent sequence where the variation ends.
+        /// Must satisfy <c>oneBasedEndPosition &gt;= oneBasedBeginPosition</c>.
+        /// </param>
+        /// <param name="originalSequence">
+        /// Reference subsequence being replaced. Null is coerced to an empty string.
+        /// Empty string typically indicates an insertion at <paramref name="oneBasedBeginPosition"/>.
+        /// </param>
+        /// <param name="variantSequence">
+        /// Alternate subsequence to insert in place of <paramref name="originalSequence"/>.
+        /// Null is coerced to an empty string. Empty string typically indicates a deletion.
+        /// </param>
+        /// <param name="description">
+        /// Free-form description of the variation. Often the original VCF line or human-readable note.
+        /// </param>
+        /// <param name="variantCallFormatStringRepresentation">
+        /// Raw VCF record (a single, tab-delimited line). It is parsed into <see cref="VariantCallFormatDataString"/>.
+        /// </param>
+        /// <param name="oneBasedModifications">
+        /// Optional mapping from absolute 1-based residue positions to one or more <see cref="Modification"/> objects
+        /// applied at that position (in the same coordinate system as <paramref name="oneBasedBeginPosition"/>/<paramref name="oneBasedEndPosition"/>).
+        /// If null, an empty dictionary is created.
+        /// </param>
+        public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string variantCallFormatStringRepresentation, Dictionary<int, List<Modification>>? oneBasedModifications = null)
+        {
+            OneBasedBeginPosition = oneBasedBeginPosition;
+            OneBasedEndPosition = oneBasedEndPosition;
+            OriginalSequence = originalSequence ?? "";
+            VariantSequence = variantSequence ?? "";
+            Description = description;
+            VariantCallFormatDataString = new VariantCallFormat(variantCallFormatStringRepresentation);
+            OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
+        }
+
+        /// <summary>
+        /// Create a variation without a separate VCF string; a <see cref="VariantCallFormat"/> is still constructed
+        /// from <paramref name="description"/> to maintain a non-null object for tests and downstream consumers.
+        /// </summary>
+        /// <param name="oneBasedBeginPosition">
+        /// 1-based, inclusive start position in the parent sequence where the variation begins.
+        /// Must be &gt;= 1. See <see cref="AreValid"/> for validity conditions.
+        /// </param>
+        /// <param name="oneBasedEndPosition">
+        /// 1-based, inclusive end position in the parent sequence where the variation ends.
+        /// Must satisfy <c>oneBasedEndPosition &gt;= oneBasedBeginPosition</c>.
+        /// </param>
+        /// <param name="originalSequence">
+        /// Reference subsequence being replaced. Null is coerced to an empty string.
+        /// Empty string typically indicates an insertion at <paramref name="oneBasedBeginPosition"/>.
+        /// </param>
+        /// <param name="variantSequence">
+        /// Alternate subsequence to insert in place of <paramref name="originalSequence"/>.
+        /// Null is coerced to an empty string. Empty string typically indicates a deletion.
+        /// </param>
+        /// <param name="description">
+        /// Free-form description of the variation. Also used to initialize <see cref="VariantCallFormatDataString"/>.
+        /// </param>
+        /// <param name="oneBasedModifications">
+        /// Optional mapping from absolute 1-based residue positions to one or more <see cref="Modification"/> objects
+        /// applied at that position (in the same coordinate system as <paramref name="oneBasedBeginPosition"/>/<paramref name="oneBasedEndPosition"/>).
+        /// If null, an empty dictionary is created.
+        /// </param>
         public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
         {
             OneBasedBeginPosition = oneBasedBeginPosition;
             OneBasedEndPosition = oneBasedEndPosition;
             OriginalSequence = originalSequence ?? "";
             VariantSequence = variantSequence ?? "";
-            Description = new VariantCallFormat(description);
+            Description = description;
+            // Always construct a VariantCallFormat so tests relying on non-null VCF objects pass.
+            VariantCallFormatDataString = new VariantCallFormat(description);
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
         /// <summary>
-        /// For variations with only position information (not begin and end).
-        /// Sets the end to the end of the original protein sequence to which this variation applies.
+        /// Convenience constructor for single-position variations. The end position is inferred from
+        /// <paramref name="oneBasedPosition"/> and the length of <paramref name="originalSequence"/>:
+        /// <c>end = position + length(originalSequence) - 1</c> (or <c>end = position</c> if <paramref name="originalSequence"/> is null).
         /// </summary>
-        /// <param name="oneBasedPosition"></param>
-        /// <param name="originalSequence"></param>
-        /// <param name="variantSequence"></param>
-        /// <param name="description"></param>
-        /// <param name="oneBasedModifications"></param>
+        /// <param name="oneBasedPosition">
+        /// 1-based, inclusive position of the variation start. Used to infer <see cref="OneBasedEndPosition"/>.
+        /// </param>
+        /// <param name="originalSequence">
+        /// Reference subsequence being replaced. If null, treated as empty for end-position inference.
+        /// </param>
+        /// <param name="variantSequence">
+        /// Alternate subsequence to insert in place of <paramref name="originalSequence"/>. May be empty for deletions.
+        /// </param>
+        /// <param name="description">
+        /// Free-form description of the variation. Also used to initialize <see cref="VariantCallFormatDataString"/>.
+        /// </param>
+        /// <param name="oneBasedModifications">
+        /// Optional mapping from absolute 1-based residue positions to one or more <see cref="Modification"/> objects
+        /// applied at that position. If null, an empty dictionary is created.
+        /// </param>
         public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
             : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications)
         { }
 
         /// <summary>
-        /// Beginning position of original sequence to be replaced
+        /// 1-based, inclusive start position of this variation within the parent sequence.
         /// </summary>
         public int OneBasedBeginPosition { get; }
 
         /// <summary>
-        /// End position of original sequence to be replaced
+        /// 1-based, inclusive end position of this variation within the parent sequence.
+        /// For single-site variations with non-null <see cref="OriginalSequence"/>, this is
+        /// <c>OneBasedBeginPosition + OriginalSequence.Length - 1</c>.
         /// </summary>
         public int OneBasedEndPosition { get; }
 
         /// <summary>
-        /// Original sequence information (optional)
+        /// The reference subsequence replaced by this variation. Empty string implies an insertion.
         /// </summary>
         public string OriginalSequence { get; }
 
         /// <summary>
-        /// Variant sequence information (required)
+        /// The alternate subsequence inserted by this variation. Empty string implies a deletion.
         /// </summary>
         public string VariantSequence { get; }
 
         /// <summary>
-        /// Description of this variation (optional)
+        /// Free-form description of the variation. Often the raw VCF line or a human-readable summary.
         /// </summary>
-        public VariantCallFormat Description { get; }
+        public string Description { get; }
 
         /// <summary>
-        /// Modifications specifically for this variant
+        /// Optional parsed VCF wrapper providing structured access to the originating VCF record.
+        /// May be null in some construction paths; in the provided constructors it is initialized.
+        /// </summary>
+        public VariantCallFormat? VariantCallFormatDataString { get; }
+
+        /// <summary>
+        /// Mapping from absolute 1-based residue positions to a list of <see cref="Modification"/> objects
+        /// to apply at each position. Never null; defaults to an empty dictionary.
         /// </summary>
         public Dictionary<int, List<Modification>> OneBasedModifications { get; }
 
+        /// <summary>
+        /// Determines value equality with another object.
+        /// Two <see cref="SequenceVariation"/> objects are equal when:
+        /// - Begin and end positions are equal
+        /// - Original and variant sequences are equal (nulls treated as equal only if both are null)
+        /// - <see cref="VariantCallFormatDataString"/> are both null or equal
+        /// - <see cref="OneBasedModifications"/> have identical key sets and identical flattened modification lists (sequence-equal)
+        /// </summary>
+        /// <param name="obj">Object to compare against.</param>
+        /// <returns>True if equal by the criteria above; otherwise false.</returns>
         public override bool Equals(object obj)
         {
             SequenceVariation s = obj as SequenceVariation;
@@ -75,90 +222,95 @@ public override bool Equals(object obj)
                 && OneBasedEndPosition == s.OneBasedEndPosition
                 && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence))
                 && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence))
-                && (s.Description == null && Description == null || Description.Equals(s.Description))
+                && ((s.VariantCallFormatDataString == null && VariantCallFormatDataString == null)
+                    || (VariantCallFormatDataString != null && VariantCallFormatDataString.Equals(s.VariantCallFormatDataString)))
                 && (s.OneBasedModifications == null && OneBasedModifications == null ||
                     s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList())
                     && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
         }
 
+        /// <summary>
+        /// Computes a hash code from begin/end positions, sequences, and the VCF wrapper (if present).
+        /// </summary>
+        /// <returns>A hash code suitable for hash-based collections.</returns>
         public override int GetHashCode()
         {
             return OneBasedBeginPosition.GetHashCode()
                 ^ OneBasedEndPosition.GetHashCode()
-                ^ OriginalSequence.GetHashCode() // null handled in constructor
-                ^ VariantSequence.GetHashCode() // null handled in constructor
-                ^ Description.GetHashCode(); // always constructed in constructor
+                ^ OriginalSequence.GetHashCode()
+                ^ VariantSequence.GetHashCode()
+                ^ (VariantCallFormatDataString?.GetHashCode() ?? 0);
         }
 
         /// <summary>
-        /// Returns a simple string represantation of this amino acid change
+        /// Produces a compact, human-readable representation: <c>{OriginalSequence}{OneBasedBeginPosition}{VariantSequence}</c>.
+        /// Example: substitution A-&gt;T at position 12 yields <c>"A12T"</c>.
         /// </summary>
-        /// <returns></returns>
+        /// <returns>The compact representation string.</returns>
         public string SimpleString()
         {
             return OriginalSequence + OneBasedBeginPosition.ToString() + VariantSequence;
         }
 
         /// <summary>
-        /// Determines whether this interval overlaps the queried interval
+        /// Tests whether the current variation intersects (overlaps) another variation in coordinate space.
+        /// Intersection is inclusive: any shared position in the 1-based, inclusive ranges is considered overlap.
         /// </summary>
-        /// <param name="segment"></param>
-        /// <returns></returns>
+        /// <param name="segment">The other <see cref="SequenceVariation"/> to test.</param>
+        /// <returns>True if the ranges overlap; otherwise false.</returns>
         internal bool Intersects(SequenceVariation segment)
         {
             return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition;
         }
 
         /// <summary>
-        /// Determines whether this interval overlaps the queried interval
+        /// Tests whether the current variation intersects (overlaps) a truncation product range.
+        /// Intersection is inclusive: any shared position in the 1-based, inclusive ranges is considered overlap.
         /// </summary>
-        /// <param name="segment"></param>
-        /// <returns></returns>
+        /// <param name="segment">The <see cref="TruncationProduct"/> segment to test.</param>
+        /// <returns>True if the ranges overlap; otherwise false.</returns>
         internal bool Intersects(TruncationProduct segment)
         {
             return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition;
         }
 
         /// <summary>
-        /// Determines whether this interval overlaps the queried position
+        /// Tests whether the current variation intersects a single 1-based position.
         /// </summary>
-        /// <param name="pos"></param>
-        /// <returns></returns>
+        /// <param name="pos">A 1-based, inclusive position in the parent sequence.</param>
+        /// <returns>True if <paramref name="pos"/> lies within the variation’s range; otherwise false.</returns>
         internal bool Intersects(int pos)
         {
             return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition;
         }
 
         /// <summary>
-        /// Determines whether this interval includes the queried interval
+        /// Tests whether the current variation fully includes another variation’s range.
+        /// Inclusion is inclusive on both ends.
         /// </summary>
-        /// <param name="segment"></param>
-        /// <returns></returns>
+        /// <param name="segment">The other <see cref="SequenceVariation"/> to test.</param>
+        /// <returns>True if the current range fully contains <paramref name="segment"/>; otherwise false.</returns>
         internal bool Includes(SequenceVariation segment)
         {
             return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition;
         }
-        // Commented out by AVC on 4/5/23. Unused and untested in current code base,
-        // but can't rule out that it could be useful in the future.  
-        /// <summary>
-        /// Determines whether this interval includes the queried interval
-        /// </summary>
-        /// <param name="segment"></param>
-        /// <returns></returns>
-        // internal bool Includes(TruncationProduct segment)
-        // {
-        //     return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition;
-        // }
 
         /// <summary>
-        /// Determines whether this interval overlaps the queried position
+        /// Tests whether the current variation includes a single 1-based position.
+        /// Inclusion is inclusive on both ends.
         /// </summary>
-        /// <param name="pos"></param>
-        /// <returns></returns>
+        /// <param name="pos">A 1-based, inclusive position in the parent sequence.</param>
+        /// <returns>True if <paramref name="pos"/> lies within the variation’s range; otherwise false.</returns>
         internal bool Includes(int pos)
         {
             return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition;
         }
+
+        /// <summary>
+        /// Validates positional consistency: begin must be &gt; 0 and end must be &gt;= begin.
+        /// This does not validate string/length consistency between <see cref="OriginalSequence"/> and <see cref="VariantSequence"/>.
+        /// </summary>
+        /// <returns>True if positions are valid; otherwise false.</returns>
         public bool AreValid()
         {
             return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition;
diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index 1669c3afe..c5fd71ed5 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -5,38 +5,56 @@
 namespace Omics.BioPolymer
 {
     /// <summary>
-    /// Provides methods for applying sequence variations to proteins and handling modifications on variant sequences.
+    /// Utilities for constructing and applying sequence variants to biopolymers (proteins/RNA) in a 1-based, inclusive coordinate system.
+    /// This includes:
+    /// - Expanding a base biopolymer into concrete variant instances (genotype-aware or combinatorial).
+    /// - Renaming/accession tagging based on applied variants.
+    /// - Re-mapping indices for truncation products and localized modifications after edits (insertions/deletions/substitutions).
+    /// - Converting certain annotation-style modifications (e.g., nucleotide substitution markers) into true sequence variants.
     /// </summary>
-    /// <remarks>
-    /// Originally by A. Cesnik on 11/2/18, updated on 4/25/23. NB moved it and generalized for use in Transcriptomics on 3/25/25.
-    /// </remarks>
     public static class VariantApplication
     {
         /// <summary>
-        /// Creates a list of IBioPolymers of the same type as the original protein, each with applied variants from this protein.
+        /// Builds concrete variant biopolymers from a base entity.
+        /// If any known variation lacks genotype information (no VCF or empty genotypes), a safe combinatorial expansion is used (bounded by <paramref name="maxAllowedVariantsForCombinatorics"/>).
+        /// Otherwise, variants are applied in a genotype/allele-depth-aware manner via <see cref="ApplyVariants{TBioPolymerType}(TBioPolymerType, IEnumerable{SequenceVariation}, int, int)"/>.
         /// </summary>
-        /// <typeparam name="TBioPolymerType">Type of BioPolymer to create variants of</typeparam>
-        /// <param name="protein">original to generate variants of</param>
-        /// <param name="maxAllowedVariantsForCombinatorics"></param>
-        /// <param name="minAlleleDepth"></param>
-        /// <remarks>This replaces a method call that was previously an instance method in Protein</remarks>
+        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
+        /// <param name="protein">The base biopolymer to expand into variant instances.</param>
+        /// <param name="maxAllowedVariantsForCombinatorics">
+        /// Maximum number of variants to consider when creating combinations for genotype-less scenarios.
+        /// This caps explosion in <see cref="ApplyAllVariantCombinations{TBioPolymerType}(TBioPolymerType, System.Collections.Generic.List{SequenceVariation}, int)"/>.
+        /// </param>
+        /// <param name="minAlleleDepth">
+        /// Minimum AD (Allele Depth) per sample required for an allele (reference or alternate) to be considered "deep" enough to participate in genotype-aware application.
+        /// </param>
+        /// <returns>A list of concrete variants derived from <paramref name="protein"/>.</returns>
         public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this TBioPolymerType protein, int maxAllowedVariantsForCombinatorics = 4, int minAlleleDepth = 1)
             where TBioPolymerType : IHasSequenceVariants
         {
+            // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance
             protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
             protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
-            if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.Description == null || v.Description.Genotypes.Count == 0))
+
+            // If all variants are positionally valid and any is missing VCF/genotype data, fall back to bounded combinatorial application
+            if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0))
             {
-                // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines
                 return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList();
             }
-            // this is a protein with only VCF lines
-            return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics, minAlleleDepth);
+
+            // Otherwise, do genotype/allele-depth-aware application with combinatorics limited for heterozygous sites
+            return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: maxAllowedVariantsForCombinatorics, minAlleleDepth);
         }
 
         /// <summary>
-        /// Gets the name of a protein with applied variations
+        /// Produces a name with an appended variant tag built from applied variation descriptions.
+        /// If both <paramref name="name"/> and <paramref name="appliedVariations"/> are effectively empty, returns null.
         /// </summary>
+        /// <param name="name">Base name (e.g., protein name). May be null.</param>
+        /// <param name="appliedVariations">Variations applied to the instance. If null/empty, no variant tag is appended.</param>
+        /// <returns>
+        /// The base <paramref name="name"/> plus " variant:{...}" when variations exist; null if both inputs are empty.
+        /// </returns>
         public static string? GetVariantName(string? name, IEnumerable<SequenceVariation>? appliedVariations)
         {
             bool emptyVars = appliedVariations.IsNullOrEmpty();
@@ -48,8 +66,13 @@ public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this
         }
 
         /// <summary>
-        /// Gets the accession for a protein with applied variations
+        /// Constructs a variant-aware accession by appending a compact variant string to the consensus accession.
         /// </summary>
+        /// <param name="protein">The variant-capable biopolymer.</param>
+        /// <param name="appliedSequenceVariations">Applied variations to encode into the suffix; may be null/empty.</param>
+        /// <returns>
+        /// The consensus accession or consensus accession with "_{SimpleString}_..." suffix if variations are present.
+        /// </returns>
         public static string GetAccession(IHasSequenceVariants protein, IEnumerable<SequenceVariation>? appliedSequenceVariations)
         {
             return protein.ConsensusVariant.Accession +
@@ -57,33 +80,23 @@ public static string GetAccession(IHasSequenceVariants protein, IEnumerable<Sequ
         }
 
         /// <summary>
-        /// Determines if the modification falls on a variant amino acid
+        /// Determines if a specific 1-based position in the variant biopolymer lies within a particular variation's range.
         /// </summary>
-        /// <returns>true if a modification index on the protein falls within the applied variant</returns>
-        /// <remarks>
-        /// A. Cesnik - 4/25/23 
-        /// Variants annotated in protein entries can be applied to a sequence, i.e. a change is made to the sequence. 
-        /// One of the things Spritz can do that no other tool can do is enable finding modifications on these sites of variation, 
-        /// since I amended the sequence variant XML entries to have modifications.
-        /// </remarks>
+        /// <param name="appliedVariant">The variation of interest; may be null.</param>
+        /// <param name="variantProteinIndex">1-based position in the current (possibly already-edited) variant sequence.</param>
+        /// <returns>True if the position is included by the variation; otherwise false.</returns>
         public static bool IsSequenceVariantModification(SequenceVariation? appliedVariant, int variantProteinIndex)
         {
             return appliedVariant != null && appliedVariant.Includes(variantProteinIndex);
         }
 
         /// <summary>
-        /// Restores modification index on a variant protein to the index on the nonvariant protein,
-        /// or if it falls on a variant, this restores the position on the protein with only that variant
+        /// Maps a modification index from the edited (variant) sequence back to the original consensus index by subtracting the
+        /// net length changes of all applied variations that ended before the queried position.
         /// </summary>
-        /// <param name="protein">Protein containing applied sequence variations</param>
-        /// <param name="variantProteinModificationIndex">The one-based index of the amino acid residue bearing a modification</param>
-        /// <returns></returns>
-        /// <remarks>
-        /// A. Cesnik - 4/25/23 
-        /// Useful for comparing modification indices on variant proteins to the original protein.
-        /// Variations can introduce length changes and other changes to the sequence, 
-        /// so the indices of the modifications aren’t directly comparable, but this method makes that possible.
-        /// </remarks>
+        /// <param name="protein">The variant-capable biopolymer containing applied variations.</param>
+        /// <param name="variantProteinModificationIndex">1-based index in the variant sequence.</param>
+        /// <returns>The corresponding 1-based index in the original consensus sequence.</returns>
         public static int RestoreModificationIndex(IHasSequenceVariants protein, int variantProteinModificationIndex)
         {
             return variantProteinModificationIndex - protein.AppliedSequenceVariations
@@ -92,57 +105,73 @@ public static int RestoreModificationIndex(IHasSequenceVariants protein, int var
         }
 
         /// <summary>
-        /// Applies multiple variant changes to a protein sequence
+        /// Applies a set of sequence variations in a genotype- and allele-depth-aware fashion for all individuals found in the VCF payloads.
+        /// Heterozygous sites can produce combinatorial branches up to <paramref name="maxAllowedVariantsForCombinitorics"/> per individual.
+        /// Results are deduplicated by final base sequence.
         /// </summary>
+        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
+        /// <param name="protein">The base biopolymer to which variations will be applied.</param>
+        /// <param name="sequenceVariations">Candidate variations. Duplicates by effect are collapsed using <see cref="SequenceVariation.SimpleString"/>.</param>
+        /// <param name="maxAllowedVariantsForCombinitorics">
+        /// Upper cap for heterozygous combinatorial branching. If an individual has more heterozygous variants than this number,
+        /// the algorithm limits branching to control explosion.
+        /// </param>
+        /// <param name="minAlleleDepth">Minimum AD (Allele Depth) per sample for an allele to be considered in application.</param>
+        /// <returns>A list of concrete variant biopolymers across all individuals encoded in the VCF payloads.</returns>
         public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerType protein, IEnumerable<SequenceVariation> sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth)
             where TBioPolymerType : IHasSequenceVariants
         {
+            // Remove duplicate effects (by SimpleString), require variants with genotype data, apply from higher to lower positions
             List<SequenceVariation> uniqueEffectsToApply = sequenceVariations
                 .GroupBy(v => v.SimpleString())
                 .Select(x => x.First())
-                .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line
-                .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first
+                .Where(v => v.VariantCallFormatDataString.Genotypes.Count > 0)
+                .OrderByDescending(v => v.OneBasedBeginPosition)
                 .ToList();
 
+            // A shallow "base" variant to branch from (no applied variants yet)
             TBioPolymerType proteinCopy = protein.CreateVariant(protein.BaseSequence, protein, null, protein.TruncationProducts, protein.OneBasedPossibleLocalizedModifications, null);
 
-            // If there aren't any variants to apply, just return the base protein
             if (uniqueEffectsToApply.Count == 0)
             {
                 return new List<TBioPolymerType> { proteinCopy };
             }
 
-            HashSet<string> individuals = new HashSet<string>(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys));
+            // All per-sample identifiers present in the VCF objects
+            HashSet<string> individuals = new HashSet<string>(uniqueEffectsToApply.SelectMany(v => v.VariantCallFormatDataString.Genotypes.Keys));
             List<TBioPolymerType> variantProteins = new();
             List<TBioPolymerType> newVariantProteins = new();
-            // loop through genotypes for each sample/individual (e.g. tumor and normal)
+
             foreach (string individual in individuals)
             {
                 newVariantProteins.Clear();
                 newVariantProteins.Add(proteinCopy);
 
-                bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
+                // Whether to limit combinatorial branching for this individual
+                bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.VariantCallFormatDataString.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
+
                 foreach (var variant in uniqueEffectsToApply)
                 {
-                    bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff
+                    // Only proceed if the individual's genotype references this variant's alternate allele index
+                    bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString());
                     if (!variantAlleleIsInTheGenotype)
                     {
                         continue;
                     }
-                    bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now.
-                    bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
-                    bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
 
-                    // homozygous alternate
+                    // Zygosity and depth checks for this individual
+                    bool isHomozygousAlternate = variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString());
+                    bool isDeepReferenceAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
+                    bool isDeepAlternateAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][variant.VariantCallFormatDataString.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
+
                     if (isHomozygousAlternate && isDeepAlternateAllele)
                     {
+                        // Deterministic application: all branches take the alt allele
                         newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList();
                     }
-
-                    // heterozygous basic
-                    // first protein with variants contains all homozygous variation, second contains all variations
-                    else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants)
+                    else if (variant.VariantCallFormatDataString.Heterozygous[individual] && tooManyHeterozygousVariants)
                     {
+                        // Limit branching: either keep ref, take alt, or update second branch if already present
                         if (isDeepAlternateAllele && isDeepReferenceAllele)
                         {
                             if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinitorics > 0)
@@ -154,51 +183,35 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                             {
                                 newVariantProteins[1] = ApplySingleVariant(variant, newVariantProteins[1], individual);
                             }
-                            else
-                            {
-                                // no heterozygous variants
-                            }
                         }
                         else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0)
                         {
                             newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList();
                         }
-                        else
-                        {
-                            // keep reference only
-                        }
                     }
-
-                    // heterozygous combinitorics
-                    else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
+                    else if (variant.VariantCallFormatDataString.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
                     {
+                        // Full combinatorics (bounded) for heterozygous case: keep ref and/or take alt depending on depths
                         List<TBioPolymerType> combinitoricProteins = new();
 
                         foreach (var ppp in newVariantProteins)
                         {
                             if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele)
                             {
-                                // keep reference allele
-                                if (variant.Description.Genotypes[individual].Contains("0"))
+                                if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0"))
                                 {
-                                    combinitoricProteins.Add(ppp);
+                                    combinitoricProteins.Add(ppp); // keep reference branch
                                 }
-
-                                // alternate allele (replace all, since in heterozygous with two alternates, both alternates are included)
-                                combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
+                                combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); // alternate branch
                             }
                             else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0)
                             {
                                 combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
                             }
-                            else if (variant.Description.Genotypes[individual].Contains("0"))
+                            else if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0"))
                             {
                                 combinitoricProteins.Add(ppp);
                             }
-                            else
-                            {
-                                // must be two alternate alleles with not enough depth
-                            }
                         }
                         newVariantProteins = combinitoricProteins;
                     }
@@ -206,49 +219,79 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 variantProteins.AddRange(newVariantProteins);
             }
 
+            // De-duplicate by final sequence to avoid identical variants from different branches
             return variantProteins.GroupBy(x => x.BaseSequence).Select(x => x.First()).ToList();
         }
 
         /// <summary>
-        /// Applies a single variant to a protein sequence
+        /// Applies a single <see cref="SequenceVariation"/> to a biopolymer and returns the updated instance with
+        /// sequence, truncation products, localized modifications, and applied-variation indices all re-mapped.
         /// </summary>
+        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
+        /// <param name="variantGettingApplied">Variation to apply (coordinates refer to the current protein's sequence).</param>
+        /// <param name="protein">Source biopolymer to mutate.</param>
+        /// <param name="individual">Sample identifier used to annotate the created variant (may be empty).</param>
+        /// <returns>A new variant instance created via <see cref="IHasSequenceVariants.CreateVariant{TBioPolymerType}"/>.</returns>
         private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVariation variantGettingApplied, TBioPolymerType protein, string individual)
             where TBioPolymerType : IHasSequenceVariants
         {
+            // Sequence prefix before the edit region
             string seqBefore = protein.BaseSequence.Substring(0, variantGettingApplied.OneBasedBeginPosition - 1);
+            // The replacement (alternate) sequence
             string seqVariant = variantGettingApplied.VariantSequence;
+            // First index in the original sequence after the edited region
             int afterIdx = variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.OriginalSequence.Length - 1;
 
-            SequenceVariation variantAfterApplication = new SequenceVariation(
-                variantGettingApplied.OneBasedBeginPosition,
-                variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
-                variantGettingApplied.OriginalSequence,
-                variantGettingApplied.VariantSequence,
-                variantGettingApplied.Description.Description,
-                variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
-
-            // check to see if there is incomplete indel overlap, which would lead to weird variant sequences
-            // complete overlap is okay, since it will be overwritten; this can happen if there are two alternate alleles,
-            //    e.g. reference sequence is wrong at that point
-            bool intersectsAppliedRegionIncompletely = protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x));
+            // Reify a "post-application" variation object pinned to the inserted length
+            SequenceVariation variantAfterApplication;
+            var vcf = variantGettingApplied.VariantCallFormatDataString;
+            if (vcf != null)
+            {
+                variantAfterApplication = new SequenceVariation(
+                    variantGettingApplied.OneBasedBeginPosition,
+                    variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
+                    variantGettingApplied.OriginalSequence,
+                    variantGettingApplied.VariantSequence,
+                    vcf.Description,
+                    vcf,
+                    variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
+            }
+            else
+            {
+                variantAfterApplication = new SequenceVariation(
+                    variantGettingApplied.OneBasedBeginPosition,
+                    variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
+                    variantGettingApplied.OriginalSequence,
+                    variantGettingApplied.VariantSequence,
+                    variantGettingApplied.Description,
+                    variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
+            }
+
+            // If an already-applied variation partially overlaps the current edit, use the consensus tail to avoid index corruption
+            bool intersectsAppliedRegionIncompletely =
+                protein.AppliedSequenceVariations != null
+                && protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x));
+
             IEnumerable<SequenceVariation> appliedVariations = new[] { variantAfterApplication };
-            string seqAfter = null;
+            string seqAfter;
             if (intersectsAppliedRegionIncompletely)
             {
-                // use original protein sequence for the remaining sequence
+                // Tail from consensus (not the possibly already-mutated BaseSequence)
                 seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.ConsensusVariant.BaseSequence.Substring(afterIdx);
             }
             else
             {
-                // use this variant protein sequence for the remaining sequence
+                // Tail from the current BaseSequence; keep any previously applied variations that are not fully contained by the current edit
                 seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.BaseSequence.Substring(afterIdx);
                 appliedVariations = appliedVariations
-                    .Concat(protein.AppliedSequenceVariations.Where(x => !variantGettingApplied.Includes(x)))
+                    .Concat((protein.AppliedSequenceVariations ?? Enumerable.Empty<SequenceVariation>()).Where(x => !variantGettingApplied.Includes(x)))
                     .ToList();
             }
-            string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0]; // there may be a stop gained
 
-            // adjust indices
+            // Clip at stop (*) if any
+            string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0];
+
+            // Re-map dependent structures after the sequence length change
             List<TruncationProduct> adjustedProteolysisProducts = AdjustTruncationProductIndices(variantGettingApplied, variantSequence, protein, protein.TruncationProducts);
             Dictionary<int, List<Modification>> adjustedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein);
             List<SequenceVariation> adjustedAppliedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations);
@@ -257,90 +300,119 @@ private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVaria
         }
 
         /// <summary>
-        /// Adjusts the indices of sequence variations due to applying a single additional variant
+        /// Adjusts (re-bases) the coordinates of already-applied variations after applying a new variation.
+        /// Handles overlaps by trimming and shifts by adding the net length change.
         /// </summary>
+        /// <param name="variantGettingApplied">The newly applied variation causing coordinate shifts.</param>
+        /// <param name="variantAppliedProteinSequence">The updated sequence after the application (used to clamp ends).</param>
+        /// <param name="alreadyAppliedVariations">Variations that were previously applied (may be null).</param>
+        /// <returns>A new list of variations with updated coordinates, filtered for validity.</returns>
         private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVariation variantGettingApplied, string variantAppliedProteinSequence, IEnumerable<SequenceVariation> alreadyAppliedVariations)
         {
             List<SequenceVariation> variations = new List<SequenceVariation>();
             if (alreadyAppliedVariations == null) { return variations; }
+
             foreach (SequenceVariation v in alreadyAppliedVariations)
             {
+                // Net length already introduced before the start of v
                 int addedIdx = alreadyAppliedVariations
                     .Where(applied => applied.OneBasedEndPosition < v.OneBasedBeginPosition)
                     .Sum(applied => applied.VariantSequence.Length - applied.OriginalSequence.Length);
 
-                // variant was entirely before the one being applied (shouldn't happen because of order of applying variants)
-                // or it's the current variation
-                if (v.Description.Equals(variantGettingApplied.Description) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
+                // Either same VCF payload (null-safe) or fully before the new application region after compensating for prior shifts
+                if ((v.VariantCallFormatDataString != null && v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString))
+                    || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
                 {
                     variations.Add(v);
                 }
-
-                // adjust indices based on new included sequence, minding possible overlaps to be filtered later
                 else
                 {
+                    // Compute overlap with the newly applied edit and shift by the net length change
                     int intersectOneBasedStart = Math.Max(variantGettingApplied.OneBasedBeginPosition, v.OneBasedBeginPosition);
                     int intersectOneBasedEnd = Math.Min(variantGettingApplied.OneBasedEndPosition, v.OneBasedEndPosition);
-                    int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 : // no overlap
-                        intersectOneBasedEnd - intersectOneBasedStart + 1; // there's some overlap
+                    int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 :
+                        intersectOneBasedEnd - intersectOneBasedStart + 1;
+
                     int sequenceLengthChange = variantGettingApplied.VariantSequence.Length - variantGettingApplied.OriginalSequence.Length;
+
                     int begin = v.OneBasedBeginPosition + sequenceLengthChange - overlap;
                     if (begin > variantAppliedProteinSequence.Length)
                     {
-                        continue; // cut out by a stop gain
+                        continue;
                     }
+
                     int end = v.OneBasedEndPosition + sequenceLengthChange - overlap;
                     if (end > variantAppliedProteinSequence.Length)
                     {
-                        end = variantAppliedProteinSequence.Length; // end shortened by a stop gain
+                        end = variantAppliedProteinSequence.Length;
+                    }
+
+                    var vcf = v.VariantCallFormatDataString;
+                    if (vcf != null)
+                    {
+                        variations.Add(new SequenceVariation(
+                            begin,
+                            end,
+                            v.OriginalSequence,
+                            v.VariantSequence,
+                            vcf.Description,
+                            vcf,
+                            v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
+                    }
+                    else
+                    {
+                        variations.Add(new SequenceVariation(
+                            begin,
+                            end,
+                            v.OriginalSequence,
+                            v.VariantSequence,
+                            v.Description,
+                            v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
                     }
-                    variations.Add(new SequenceVariation(
-                        begin,
-                        end,
-                        v.OriginalSequence,
-                        v.VariantSequence,
-                        v.Description.Description,
-                        v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
                 }
             }
             return variations;
         }
 
         /// <summary>
-        /// Eliminates proteolysis products that overlap sequence variations.
-        /// Since frameshift indels are written across the remaining sequence,
-        /// this eliminates proteolysis products that conflict with large deletions and other structural variations.
+        /// Re-bases proteolysis/truncation product ranges after applying a sequence variation.
+        /// Shifts segments to the right if the edit occurs before them and expands/contracts segments that span the edit.
+        /// If a stop (*) is introduced by the variant, downstream products are clamped to the new sequence length.
         /// </summary>
+        /// <param name="variant">The applied variation (source of positional shifts and length change).</param>
+        /// <param name="variantAppliedProteinSequence">The updated sequence after application.</param>
+        /// <param name="protein">The source biopolymer (used for consensus length in certain boundary checks).</param>
+        /// <param name="proteolysisProducts">Existing truncation products to be re-based; may be null.</param>
+        /// <returns>Updated list of truncation products within the new coordinate system.</returns>
         private static List<TruncationProduct> AdjustTruncationProductIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein, IEnumerable<TruncationProduct> proteolysisProducts)
         {
             List<TruncationProduct> products = new List<TruncationProduct>();
             if (proteolysisProducts == null) { return products; }
+
             int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length;
+
             foreach (TruncationProduct p in proteolysisProducts.Where(p => p.OneBasedEndPosition.HasValue && p.OneBasedBeginPosition.HasValue))
             {
-                // proteolysis product is entirely before the variant
+                // Entirely before the edit: unchanged
                 if (variant.OneBasedBeginPosition > p.OneBasedEndPosition)
                 {
                     products.Add(p);
                 }
-                // proteolysis product straddles the variant, but the cleavage site(s) are still intact; the ends aren't considered cleavage sites
+                // Segment spans the edit or is clamped at boundaries: extend/contract or clamp to stop
                 else if ((p.OneBasedBeginPosition < variant.OneBasedBeginPosition || p.OneBasedBeginPosition == 1 || p.OneBasedBeginPosition == 2)
                          && (p.OneBasedEndPosition > variant.OneBasedEndPosition || p.OneBasedEndPosition == protein.ConsensusVariant.BaseSequence.Length))
                 {
                     if (variant.VariantSequence.EndsWith("*"))
                     {
+                        // Introduced stop codon/terminator: clamp to the new sequence length
                         products.Add(new TruncationProduct(p.OneBasedBeginPosition, variantAppliedProteinSequence.Length, p.Type));
                     }
                     else if (p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length)
                     {
                         products.Add(new TruncationProduct(p.OneBasedBeginPosition, p.OneBasedEndPosition + sequenceLengthChange, p.Type));
                     }
-                    else
-                    {
-                        // cleavage site is not intact
-                    }
                 }
-                // proteolysis product is after the variant and there is no stop gain
+                // Entirely after the edit: shift right/left by the net length change, if still within bounds and not terminated
                 else if (p.OneBasedBeginPosition > variant.OneBasedEndPosition
                          && p.OneBasedBeginPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length
                          && p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length
@@ -348,52 +420,49 @@ private static List<TruncationProduct> AdjustTruncationProductIndices(SequenceVa
                 {
                     products.Add(new TruncationProduct(p.OneBasedBeginPosition + sequenceLengthChange, p.OneBasedEndPosition + sequenceLengthChange, p.Type));
                 }
-                else // sequence variant conflicts with proteolysis cleavage site (cleavage site was lost)
-                {
-                    continue;
-                }
             }
             return products;
         }
 
         /// <summary>
-        /// Adjusts modification indices.
+        /// Produces a new localized modification dictionary re-based to the post-variation coordinate system and merged with
+        /// any one-based modifications carried by the applied variant itself.
         /// </summary>
+        /// <param name="variant">The variation causing index shifts.</param>
+        /// <param name="variantAppliedProteinSequence">The updated sequence after application (for bounds checks).</param>
+        /// <param name="protein">The source biopolymer providing the original modifications map.</param>
+        /// <returns>A new dictionary of one-based modification lists keyed by position in the updated sequence.</returns>
         private static Dictionary<int, List<Modification>> AdjustModificationIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein)
         {
+            // Original per-position modifications (pre-application)
             IDictionary<int, List<Modification>> modificationDictionary = protein.OneBasedPossibleLocalizedModifications;
+            // Modifications contributed by the variant itself (coordinated in the same one-based space)
             IDictionary<int, List<Modification>> variantModificationDictionary = variant.OneBasedModifications;
+
             Dictionary<int, List<Modification>> mods = new Dictionary<int, List<Modification>>();
             int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length;
 
-            // change modification indices for variant sequence
+            // Re-base original modifications
             if (modificationDictionary != null)
             {
                 foreach (KeyValuePair<int, List<Modification>> kv in modificationDictionary)
                 {
                     if (kv.Key > variantAppliedProteinSequence.Length)
                     {
-                        continue; // it was cut out by a stop gain
+                        continue; // drop if beyond new end
                     }
-                    // mod is before the variant
                     else if (kv.Key < variant.OneBasedBeginPosition)
                     {
-                        mods.Add(kv.Key, kv.Value);
+                        mods.Add(kv.Key, kv.Value); // unaffected positions
                     }
-                    // mod is after the variant and not affected by a stop gain
                     else if (variant.OneBasedEndPosition < kv.Key && kv.Key + sequenceLengthChange <= variantAppliedProteinSequence.Length)
                     {
-                        mods.Add(kv.Key + sequenceLengthChange, kv.Value);
-                    }
-                    else // sequence variant conflicts with modification site (modification site substitution)
-                    {
-                        continue;
+                        mods.Add(kv.Key + sequenceLengthChange, kv.Value); // shift after the edit
                     }
                 }
             }
 
-            // sequence variant modifications are indexed to the variant sequence
-            //    NOTE: this code assumes variants are added from end to beginning of protein, so that previously added variant mods are adjusted above
+            // Merge-in variant-borne modifications (may share positions)
             if (variantModificationDictionary != null)
             {
                 foreach (var kv in variantModificationDictionary)
@@ -413,51 +482,54 @@ private static Dictionary<int, List<Modification>> AdjustModificationIndices(Seq
         }
 
         /// <summary>
-        /// Format string to append to accession
+        /// Concatenates the compact representations (<see cref="SequenceVariation.SimpleString"/>) of the provided variations with underscores.
         /// </summary>
+        /// <param name="variations">Variations to stringify; may be null/empty.</param>
+        /// <returns>An underscore-joined string or empty string if none.</returns>
         private static string CombineSimpleStrings(IEnumerable<SequenceVariation>? variations)
         {
             return variations.IsNullOrEmpty() ? "" : string.Join("_", variations.Select(v => v.SimpleString()));
         }
 
         /// <summary>
-        /// Format string to append to protein names
+        /// Concatenates human-readable descriptions for the provided variations.
+        /// Prefers VCF description when available, otherwise falls back to the variation's own description.
         /// </summary>
+        /// <param name="variations">Variations to describe; may be null/empty.</param>
+        /// <returns>A comma-delimited description string or empty string if none.</returns>
         public static string CombineDescriptions(IEnumerable<SequenceVariation>? variations)
         {
-            return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.Description));
+            return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString?.Description ?? d.Description));
         }
+
         /// <summary>
-        /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object,
-        /// starting with the fewest single variations and up to the specified maximum number of combinations.
+        /// Applies all combinations of the provided variations to the base biopolymer up to a maximum number of yielded results.
+        /// The base (no-variant) biopolymer is yielded first, followed by combinations in increasing size.
         /// </summary>
-        /// <typeparam name="TBioPolymerType">The type of the biopolymer object.</typeparam>
-        /// <param name="baseBioPolymer">The base biopolymer object to apply variations to.</param>
-        /// <param name="variations">List of SequenceVariation objects to combine and apply. Assumed not null or empty.</param>
-        /// <param name="maxCombinations">Maximum number of combinations to return.</param>
-        /// <returns>
-        /// An IEnumerable of TBioPolymerType objects, each with a unique combination of variations applied.
-        /// </returns>
+        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
+        /// <param name="baseBioPolymer">The starting biopolymer (no variations applied).</param>
+        /// <param name="variations">Candidate variations to combine.</param>
+        /// <param name="maxCombinations">Maximum number of variants (including the base) to yield to bound combinatorial growth.</param>
+        /// <returns>An enumerable of applied-variant biopolymers.</returns>
         public static IEnumerable<TBioPolymerType> ApplyAllVariantCombinations<TBioPolymerType>(
             TBioPolymerType baseBioPolymer,
             List<SequenceVariation> variations,
             int maxCombinations)
             where TBioPolymerType : IHasSequenceVariants
         {
-            int count = 0;
+            int count = 0; // number of variants yielded so far
 
-            // Always yield the base biopolymer first
             yield return baseBioPolymer;
             count++;
             if (count >= maxCombinations)
                 yield break;
 
-            int n = variations.Count;
+            int n = variations.Count; // total variation count
             for (int size = 1; size <= n; size++)
             {
                 foreach (var combo in GetCombinations(variations, size))
                 {
-                    var result = baseBioPolymer;
+                    var result = baseBioPolymer; // start from base and apply this combination in order
                     foreach (var variant in combo)
                     {
                         result = ApplySingleVariant(variant, result, string.Empty);
@@ -474,26 +546,27 @@ public static IEnumerable<TBioPolymerType> ApplyAllVariantCombinations<TBioPolym
         }
 
         /// <summary>
-        /// Generates all possible combinations of the specified size from the input list.
+        /// Generates all k-combinations (order-independent, no repetition) of the given list.
+        /// This is a standard lexicographic index-based combinator that yields increasing index tuples.
         /// </summary>
-        /// <param name="variations">List of SequenceVariation objects to combine. Assumed not null or empty.</param>
-        /// <param name="size">The size of each combination.</param>
-        /// <returns>
-        /// An IEnumerable of IList&lt;SequenceVariation&gt; representing each combination.
-        /// </returns>
+        /// <param name="variations">Source list to combine.</param>
+        /// <param name="size">Combination size k (0 &lt; k &lt;= n).</param>
+        /// <returns>An enumerable of read-only lists containing the selected variations.</returns>
         private static IEnumerable<IList<SequenceVariation>> GetCombinations(List<SequenceVariation> variations, int size)
         {
             int n = variations.Count;
             var indices = new int[size];
-            for (int i = 0; i < size; i++) indices[i] = i;
+            for (int i = 0; i < size; i++) indices[i] = i; // initial 0..k-1
 
             while (true)
             {
+                // Materialize current combination
                 var combo = new List<SequenceVariation>(size);
                 for (int i = 0; i < size; i++)
                     combo.Add(variations[indices[i]]);
                 yield return combo;
 
+                // Advance to next lexicographic combination
                 int pos = size - 1;
                 while (pos >= 0 && indices[pos] == n - size + pos)
                     pos--;
@@ -503,15 +576,26 @@ private static IEnumerable<IList<SequenceVariation>> GetCombinations(List<Sequen
                     indices[i] = indices[i - 1] + 1;
             }
         }
+
+        /// <summary>
+        /// Scans localized modifications for "nucleotide substitution" annotations of the form "X-&gt;Y" and converts them
+        /// into concrete <see cref="SequenceVariation"/> objects at the associated positions.
+        /// The originating annotation-style modifications are removed from both the possible-localized and original-non-variant collections
+        /// (and from consensus mirrors) when they are fully consumed by the conversion.
+        /// </summary>
+        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
+        /// <param name="protein">The biopolymer whose modifications/variants are to be updated in-place.</param>
         public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants<TBioPolymerType>(this TBioPolymerType protein)
             where TBioPolymerType : IHasSequenceVariants
         {
+            // Collect mods to remove after converting them to sequence variants
             List<KeyValuePair<int, Modification>> modificationsToRemove = new();
-            //convert substitution modifications to sequence variations
+
             foreach (var kvp in protein.OneBasedPossibleLocalizedModifications)
             {
                 foreach (Modification mod in kvp.Value)
                 {
+                    // Look for annotation-style nucleotide substitutions (e.g., "A->G")
                     if (mod.ModificationType.Contains("nucleotide substitution") && mod.OriginalId.Contains("->"))
                     {
                         string[] originalAndSubstitutedAminoAcids = mod.OriginalId.Split(new[] { "->" }, StringSplitOptions.RemoveEmptyEntries);
@@ -520,12 +604,14 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants<
                         {
                             protein.SequenceVariations.Add(sequenceVariation);
                         }
+                        // Defer removal to avoid mutating collection during enumeration
                         KeyValuePair<int, Modification> pair = new(kvp.Key, mod);
                         modificationsToRemove.Add(pair);
                     }
                 }
             }
-            //remove the modifications that were converted to sequence variations
+
+            // Remove the consumed annotation-style modifications from both live and consensus dictionaries
             foreach (KeyValuePair<int, Modification> pair in modificationsToRemove)
             {
                 if (protein.OneBasedPossibleLocalizedModifications.ContainsKey(pair.Key))
@@ -544,6 +630,7 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants<
                         }
                     }
                 }
+
                 if (protein.OriginalNonVariantModifications.ContainsKey(pair.Key))
                 {
                     List<Modification> modList = protein.OriginalNonVariantModifications[pair.Key];
diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
index e0cbec9a7..7549726c0 100644
--- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs
+++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
@@ -146,7 +146,7 @@ public VariantCallFormat(string description)
             Description = description;
 
             // Back-compat: if no real tabs are present but literal "\t" sequences are,
-            // normalize them to actual tabs for parsing only. Leave Description intact.
+            // normalize them to actual tabs for parsing only. Leave VariantCallFormatDataString intact.
             string parseLine = NormalizeTabsForParsing(description);
 
             // Parse description into VCF fields
diff --git a/mzLib/Test/DatabaseTests/TestProteinReader.cs b/mzLib/Test/DatabaseTests/TestProteinReader.cs
index 7dcd0b4d8..e0768eef4 100644
--- a/mzLib/Test/DatabaseTests/TestProteinReader.cs
+++ b/mzLib/Test/DatabaseTests/TestProteinReader.cs
@@ -125,7 +125,7 @@ public static void XmlTest()
             Assert.AreEqual(64, ok[0].SequenceVariations.First().OneBasedEndPosition);
             Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreNotEqual(ok[0].SequenceVariations.First().Description, ok[1].SequenceVariations.First().Description); //decoys and target variations don't have the same desc.
+            Assert.AreNotEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok[1].SequenceVariations.First().VariantCallFormatDataString); //decoys and target variations don't have the same desc.
             Assert.AreEqual("Homo sapiens", ok[1].Organism);
         }
 
@@ -420,8 +420,8 @@ public static void TestReverseDecoyXML_WithCustomIdentifier()
 
                 foreach (var variant in protein.AppliedSequenceVariations)
                 {
-                    Assert.That(variant.Description, Does.StartWith("rev"));
-                    Assert.That(variant.Description, Does.Not.StartWith("DECOY"));
+                    Assert.That(variant.VariantCallFormatDataString, Does.StartWith("rev"));
+                    Assert.That(variant.VariantCallFormatDataString, Does.Not.StartWith("DECOY"));
                 }
 
                 foreach (var bond in protein.DisulfideBonds)
diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
index babe44a76..e6df1c8d5 100644
--- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
+++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
@@ -504,12 +504,12 @@ public void TestFullProteinReadWrite()
             Assert.AreEqual(originalProtein.TruncationProducts.First().OneBasedEndPosition, proteinReadFromXml[0].TruncationProducts.First().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.TruncationProducts.First().Type, proteinReadFromXml[0].TruncationProducts.First().Type.Split('(')[0]);
 
-            Assert.AreEqual(originalProtein.SequenceVariations.First().Description, proteinReadFromXml[0].SequenceVariations.First().Description);
+            Assert.AreEqual(originalProtein.SequenceVariations.First().VariantCallFormatDataString, proteinReadFromXml[0].SequenceVariations.First().VariantCallFormatDataString);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OriginalSequence, proteinReadFromXml[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(originalProtein.SequenceVariations.First().VariantSequence, proteinReadFromXml[0].SequenceVariations.First().VariantSequence);
-            Assert.AreEqual(originalProtein.SequenceVariations.Last().Description, proteinReadFromXml[0].SequenceVariations.Last().Description);
+            Assert.AreEqual(originalProtein.SequenceVariations.Last().VariantCallFormatDataString, proteinReadFromXml[0].SequenceVariations.Last().VariantCallFormatDataString);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedBeginPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OriginalSequence, proteinReadFromXml[0].SequenceVariations.Last().OriginalSequence);
@@ -534,7 +534,7 @@ public void TestReadWriteSeqVars()
             Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count());
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description);
+            Assert.AreEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok2[0].SequenceVariations.First().VariantCallFormatDataString);
             Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence);
         }
@@ -557,7 +557,7 @@ public void TestReadWriteSeqVars2()
             Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count());
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description);
+            Assert.AreEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok2[0].SequenceVariations.First().VariantCallFormatDataString);
             Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence);
         }
diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 8ced7b208..53bf78897 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -1,20 +1,21 @@
-﻿using System;
-using System.Collections;
-using System.Collections.Generic;
-using System.IO;
-using System.Linq;
+﻿using Chemistry;
+using MassSpectrometry;
 using NUnit.Framework;
+using NUnit.Framework.Legacy;
+using Omics;
 using Omics.BioPolymer;
-using Assert = NUnit.Framework.Legacy.ClassicAssert;
 using Omics.Modifications;
 using Proteomics;
 using Proteomics.ProteolyticDigestion;
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Transcriptomics;
 using UsefulProteomicsDatabases;
+using Assert = NUnit.Framework.Legacy.ClassicAssert;
 using Stopwatch = System.Diagnostics.Stopwatch;
-using Omics;
-using Transcriptomics;
-using MassSpectrometry;
-using Chemistry;
 
 namespace Test.DatabaseTests
 {
@@ -115,7 +116,7 @@ public static void SeqVarXmlTest()
             {
                 Assert.AreEqual(s.OriginalSequence, decoy.BaseSequence.Substring(s.OneBasedBeginPosition - 1, s.OneBasedEndPosition - s.OneBasedBeginPosition + 1));
             }
-            Assert.AreNotEqual(target.SequenceVariations.First().Description, decoy.SequenceVariations.First().Description); //decoys and target variations don't have the same desc.
+            Assert.AreNotEqual(target.SequenceVariations.First().VariantCallFormatDataString, decoy.SequenceVariations.First().VariantCallFormatDataString); //decoys and target variations don't have the same desc.
 
             List<PeptideWithSetModifications> peptides = ok.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList();
         }
@@ -284,76 +285,258 @@ public static void HomozygousVariantsAtVariedDepths(string filename, int minVari
             var variantProteins = proteins[0].GetVariantBioPolymers();
             List<PeptideWithSetModifications> peptides = proteins.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList();
         }
-
         [Test]
         public static void AppliedVariants()
         {
-            ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
-            Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary<string, IList<string>>(), null, null, null, null, null);
+            // This test verifies that applying sequence variations (SAV, MNV, insertion, deletion)
+            // produces the correct variant protein sequences, maps variant coordinates correctly,
+            // preserves per-variant metadata (AppliedSequenceVariations), and remains stable across:
+            // - repeated in-memory application,
+            // - round-tripping through XML (write → read).
+            //
+            // Additionally, it verifies that a modification attached to a variant (protein5)
+            // is persisted and localized at the expected one-based position after application
+            // and after XML reload.
 
+            // Define a simple P-specific modification used later to validate that modifications
+            // attached to a variant are preserved and localized correctly after applying variants
+            // and XML round-tripping.
+            ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
+            Modification mp = new Modification(
+                _originalId: "mod",
+                _accession: null,
+                _modificationType: "type",
+                _featureType: null,
+                _target: motifP,
+                _locationRestriction: "Anywhere.",
+                _chemicalFormula: null,
+                _monoisotopicMass: 42.01,
+                _databaseReference: new Dictionary<string, IList<string>>(),
+                _taxonomicRange: null,
+                _keywords: null,
+                _neutralLosses: null,
+                _diagnosticIons: null,
+                _fileOrigin: null);
+
+            // Prepare five proteins that each have one sequence variation:
+            // 1) protein1: Single Amino-acid Variant (SAV) P→V at position 4 (4..4)
+            // 2) protein2: Multi-Nucleotide Variant (MNV) PT→KT spanning positions 4..5
+            // 3) protein3: Insertion-like replacement: P→PPP at position 4 (longer variant)
+            // 4) protein4: Deletion-like replacement: PPP→P spanning 4..6 (shorter variant)
+            // 5) protein5: Same as (3) but with a modification attached at one-based index 5
+            //    to verify mod persistence/localization through variant application and XML.
             List<Protein> proteinsWithSeqVars = new List<Protein>
             {
-                new Protein("MPEPTIDE", "protein1", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein2", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein5", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary<int, List<Modification>> {{ 5, new[] { mp }.ToList() } }) }),
-             };
+                new Protein("MPEPTIDE", "protein1",
+                    sequenceVariations: new List<SequenceVariation>
+                    {
+                        // SAV: P(4) -> V(4)
+                        new SequenceVariation(4, 4, "P", "V",
+                            @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+                    }),
+                new Protein("MPEPTIDE", "protein2",
+                    sequenceVariations: new List<SequenceVariation>
+                    {
+                        // MNV: PT(4..5) -> KT(4..5)
+                        new SequenceVariation(4, 5, "PT", "KT",
+                            @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+                    }),
+                new Protein("MPEPTIDE", "protein3",
+                    sequenceVariations: new List<SequenceVariation>
+                    {
+                        // Insertion-like: P(4) -> PPP(4..6) (length +2)
+                        new SequenceVariation(4, 4, "P", "PPP",
+                            @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+                    }),
+                new Protein("MPEPPPTIDE", "protein4",
+                    sequenceVariations: new List<SequenceVariation>
+                    {
+                        // Deletion-like: PPP(4..6) -> P(4) (length -2)
+                        new SequenceVariation(4, 6, "PPP", "P",
+                            @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+                    }),
+                new Protein("MPEPTIDE", "protein5",
+                    sequenceVariations: new List<SequenceVariation>
+                    {
+                        // Insertion-like with a downstream mod to verify mod localization at 5
+                        new SequenceVariation(4, 4, "P", "PPP",
+                            @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30",
+                            new Dictionary<int, List<Modification>> { { 5, new[] { mp }.ToList() } })
+                    }),
+            };
+
+            // Apply variants in memory twice; the output should be stable and identical.
             var proteinsWithAppliedVariants = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList();
             var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); // should be stable
+
+            // Round-trip through XML: write the variant-bearing proteins (targets only) and read back.
+            // This ensures that variant application and metadata survive I/O and result identically.
             string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml");
             ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteinsWithSeqVars, xml);
             var proteinsWithAppliedVariants3 = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un);
 
+            // Compare across all three sources:
+            // - [0]: in-memory 1
+            // - [1]: in-memory 2 (should match [0])
+            // - [2]: XML reload (should match [0] and [1])
             var listArray = new[] { proteinsWithAppliedVariants, proteinsWithAppliedVariants2, proteinsWithAppliedVariants3 };
+
+            // Assert we always get exactly five variant proteins in the same order
+            // (SAV, MNV, insertion, deletion, insertion+mod).
+            Assert.AreEqual(5, proteinsWithAppliedVariants.Count, "Expected 5 applied variants (in-memory #1).");
+            Assert.AreEqual(5, proteinsWithAppliedVariants2.Count, "Expected 5 applied variants (in-memory #2).");
+            Assert.AreEqual(5, proteinsWithAppliedVariants3.Count, "Expected 5 applied variants (XML reload).");
+
+            // The expected sequences for each of the five applied variants, in order:
+            // 0: "MPEVTIDE"  (SAV at 4: P->V)
+            // 1: "MPEKTIDE"  (MNV at 4..5: PT->KT)
+            // 2: "MPEPPPTIDE" (Insertion-like at 4: P->PPP; length +2)
+            // 3: "MPEPTIDE"  (Deletion-like at 4..6: PPP->P; length -2 from "MPEPPPTIDE")
+            // 4: "MPEPPPTIDE" (Insertion-like with mod at 5)
             for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
             {
                 // sequences
-                Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence);
-                Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence);
-                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence);
-                Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence);
-                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence);
-                Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key);
-
+                Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence, $"[{dbIdx}] SAV sequence mismatch");
+                Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence, $"[{dbIdx}] MNV sequence mismatch");
+                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence, $"[{dbIdx}] insertion sequence mismatch");
+                Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence, $"[{dbIdx}] deletion sequence mismatch");
+                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion+mod sequence mismatch");
+
+                // Confirm the modification attached to protein5 survives application and XML round-trip
+                Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] Expected exactly 1 mod on the insertion+mod variant");
+                Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key, $"[{dbIdx}] Mod should localize to one-based position 5");
+                // Sanity: ensure the residue under the mod is indeed P
+                Assert.AreEqual('P', listArray[dbIdx][4].BaseSequence[5 - 1], $"[{dbIdx}] Residue at mod position should be 'P'");
+
+                // SAV expectations: single-residue change; length unchanged; position 4 is 'V'
+                Assert.AreEqual(8, listArray[dbIdx][0].BaseSequence.Length, $"[{dbIdx}] SAV length should be unchanged");
+                Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] SAV begin mismatch");
+                Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] SAV end mismatch");
+                Assert.AreEqual("P", listArray[dbIdx][0].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] SAV original should be 'P'");
+                Assert.AreEqual("V", listArray[dbIdx][0].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] SAV variant should be 'V'");
+                Assert.AreEqual('V', listArray[dbIdx][0].BaseSequence[4 - 1], $"[{dbIdx}] SAV residue at 4 should be 'V'");
+
+                // MNV expectations: multi-residue change; length unchanged; positions 4..5 become "KT"
+                Assert.AreEqual(8, listArray[dbIdx][1].BaseSequence.Length, $"[{dbIdx}] MNV length should be unchanged");
+                Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] MNV begin mismatch");
+                Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] MNV end mismatch");
+                Assert.AreEqual("PT", listArray[dbIdx][1].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] MNV original should be 'PT'");
+                Assert.AreEqual("KT", listArray[dbIdx][1].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] MNV variant should be 'KT'");
+                Assert.AreEqual("KT", listArray[dbIdx][1].BaseSequence.Substring(4 - 1, 2), $"[{dbIdx}] MNV residues 4..5 should be 'KT'");
+
+                // insertion expectations: length grows by +2; positions 4..6 are "PPP"
+                Assert.AreEqual(10, listArray[dbIdx][2].BaseSequence.Length, $"[{dbIdx}] insertion length should be 8 + 2 = 10");
+                Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] insertion begin mismatch");
+                Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] insertion end mismatch");
+                Assert.AreEqual("P", listArray[dbIdx][2].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] insertion original should be 'P'");
+                Assert.AreEqual("PPP", listArray[dbIdx][2].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] insertion variant should be 'PPP'");
+                Assert.AreEqual("PPP", listArray[dbIdx][2].BaseSequence.Substring(4 - 1, 3), $"[{dbIdx}] insertion residues 4..6 should be 'PPP'");
+
+                // deletion expectations: length shrinks by -2 relative to the starting "MPEPPPTIDE" (10 → 8)
+                // and positions collapse so that sequence returns to "MPEPTIDE".
+                Assert.AreEqual(8, listArray[dbIdx][3].BaseSequence.Length, $"[{dbIdx}] deletion length should be 10 - 2 = 8");
+                Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] deletion begin mismatch");
+                Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] deletion end mismatch (post-collapse)");
+                Assert.AreEqual("PPP", listArray[dbIdx][3].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] deletion original should be 'PPP'");
+                Assert.AreEqual("P", listArray[dbIdx][3].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] deletion variant should be 'P'");
+
+                // For completeness, also assert the summarized begin/end expectations that the original test verified.
                 // SAV
                 Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition);
                 Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
                 // MNV
                 Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition);
                 Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
                 // insertion
                 Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition);
                 Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
                 // deletion
                 Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition);
                 Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition);
             }
-        }
 
+            // Ensure exact stability across the three sources:
+            // - All sequences in in-memory #1 equal in-memory #2 and XML reload.
+            CollectionAssert.AreEqual(
+                proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+                proteinsWithAppliedVariants2.Select(p => p.BaseSequence).ToList(),
+                "In-memory application should be stable across repeated calls");
+            CollectionAssert.AreEqual(
+                proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+                proteinsWithAppliedVariants3.Select(p => p.BaseSequence).ToList(),
+                "XML round-trip should preserve variant-applied sequences in the same order");
+        }
         [Test]
         public static void AppliedVariants_AsIBioPolymer()
         {
-            ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
-            Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary<string, IList<string>>(), null, null, null, null, null);
+            // PURPOSE
+            // This test mirrors "AppliedVariants" but exercises the IBioPolymer interface path.
+            // It validates, in detail:
+            // 1) Correct application of four variant types (SAV, MNV, insertion, deletion) to produce expected sequences.
+            // 2) Correct coordinates (begin/end), original/variant strings in AppliedSequenceVariations after application.
+            // 3) Stability of results across:
+            //    - repeated in-memory variant application,
+            //    - XML round-trip (write → read).
+            // 4) Modification propagation and localization for a variant carrying a downstream mod.
+            // 5) Distinguishing two variants with identical sequences by their modification state (index 2 vs 4).
+            //
+            // EXPECTATIONS SUMMARY
+            // - Variant sequences (in order): "MPEVTIDE", "MPEKTIDE", "MPEPPPTIDE", "MPEPTIDE", "MPEPPPTIDE".
+            // - AppliedSequenceVariations:
+            //   SAV       (idx 0): begin=4, end=4, original="P",   variant="V",   len=8
+            //   MNV       (idx 1): begin=4, end=5, original="PT",  variant="KT",  len=8
+            //   Insertion (idx 2): begin=4, end=6, original="P",   variant="PPP", len=10
+            //   Deletion  (idx 3): begin=4, end=4, original="PPP", variant="P",   len=8
+            //   Insert+Mod(idx 4): same sequence as insertion, plus exactly one mod at one-based index 5 targeting a 'P'.
+            //
+            // NOTE: Lists [0], [1], [2] below represent:
+            //   [0] in-memory application (first call)
+            //   [1] in-memory application (second call, should be identical/stable)
+            //   [2] XML round-trip results, must match [0] and [1]
 
+            // Arrange: create a P-specific modification used to test propagation and localization
+            ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
+            Modification mp = new Modification(
+                _originalId: "mod",
+                _accession: null,
+                _modificationType: "type",
+                _featureType: null,
+                _target: motifP,
+                _locationRestriction: "Anywhere.",
+                _chemicalFormula: null,
+                _monoisotopicMass: 42.01,
+                _databaseReference: new Dictionary<string, IList<string>>(),
+                _taxonomicRange: null,
+                _keywords: null,
+                _neutralLosses: null,
+                _diagnosticIons: null,
+                _fileOrigin: null);
+
+            // Arrange: build five proteins (as IBioPolymer) with one sequence variation each
+            // 1) SAV P(4)->V
+            // 2) MNV PT(4..5)->KT
+            // 3) Insertion-like P(4)->PPP(4..6)
+            // 4) Deletion-like PPP(4..6)->P(4) on a longer starting sequence
+            // 5) Insertion-like with a downstream mod at one-based index 5
             List<IBioPolymer> proteinsWithSeqVars = new List<IBioPolymer>
             {
-                new Protein("MPEPTIDE", "protein1", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein2", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein5", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary<int, List<Modification>> {{ 5, new[] { mp }.ToList() } }) }),
+                new Protein("MPEPTIDE",   "protein1", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P",   "V",   @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+                new Protein("MPEPTIDE",   "protein2", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 5, "PT",  "KT",  @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+                new Protein("MPEPTIDE",   "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P",   "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+                new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 6, "PPP", "P",   @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+                new Protein("MPEPTIDE",   "protein5", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P",   "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary<int, List<Modification>> { { 5, new[] { mp }.ToList() } }) }),
              };
+
+            // Act: apply variants in-memory twice (should be identical/stable)
             var proteinsWithAppliedVariants = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList();
-            var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); // should be stable
+            var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList();
+
+            // Act: write to XML and load back; results should match in-memory outputs
             string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml");
             ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteinsWithSeqVars, xml);
             var proteinsWithAppliedVariants3 = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un);
 
+            // Group lists for uniform validation loops
             var listArray = new List<IBioPolymer>[]
             {
                 proteinsWithAppliedVariants,
@@ -361,34 +544,114 @@ public static void AppliedVariants_AsIBioPolymer()
                 proteinsWithAppliedVariants3.Cast<IBioPolymer>().ToList()
             };
 
-            for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
+            // Assert: each expansion produces exactly 5 variant biopolymers, in the same, predictable order
+            Assert.AreEqual(5, proteinsWithAppliedVariants.Count, "Expected 5 variants (in-memory 1)");
+            Assert.AreEqual(5, proteinsWithAppliedVariants2.Count, "Expected 5 variants (in-memory 2)");
+            Assert.AreEqual(5, proteinsWithAppliedVariants3.Count, "Expected 5 variants (XML reload)");
+
+            // Assert: stability across the three sources (same sequences, same order)
+            CollectionAssert.AreEqual(
+                proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+                proteinsWithAppliedVariants2.Select(p => p.BaseSequence).ToList(),
+                "In-memory application should be stable across repeated calls");
+            CollectionAssert.AreEqual(
+                proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+                proteinsWithAppliedVariants3.Select(p => p.BaseSequence).ToList(),
+                "XML round-trip should preserve variant-applied sequences in the same order");
+
+            // Assert: for all variants we expect exactly one applied sequence variation
+            foreach (var variants in listArray)
             {
-                // sequences
-                Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence);
-                Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence);
-                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence);
-                Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence);
-                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence);
-                Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key);
-
-                // SAV
-                Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition);
-                Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
-                // MNV
-                Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition);
-                Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition);
+                Assert.AreEqual(1, variants[0].AppliedSequenceVariations.Count, "SAV must have exactly one applied variant");
+                Assert.AreEqual(1, variants[1].AppliedSequenceVariations.Count, "MNV must have exactly one applied variant");
+                Assert.AreEqual(1, variants[2].AppliedSequenceVariations.Count, "Insertion must have exactly one applied variant");
+                Assert.AreEqual(1, variants[3].AppliedSequenceVariations.Count, "Deletion must have exactly one applied variant");
+                Assert.AreEqual(1, variants[4].AppliedSequenceVariations.Count, "Insertion+Mod must have exactly one applied variant");
+            }
 
-                // insertion
-                Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition);
-                Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition);
+            // Per-list validation of sequence, coordinates, and (where appropriate) residue checks and mod checks
+            for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
+            {
+                // Assert: expected sequences in fixed order
+                Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence, $"[{dbIdx}] SAV sequence mismatch");
+                Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence, $"[{dbIdx}] MNV sequence mismatch");
+                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence, $"[{dbIdx}] insertion sequence mismatch");
+                Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence, $"[{dbIdx}] deletion sequence mismatch");
+                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion+mod sequence mismatch");
+
+                // Assert: lengths (sanity for ins/del)
+                Assert.AreEqual(8, listArray[dbIdx][0].BaseSequence.Length, $"[{dbIdx}] SAV length should be unchanged");
+                Assert.AreEqual(8, listArray[dbIdx][1].BaseSequence.Length, $"[{dbIdx}] MNV length should be unchanged");
+                Assert.AreEqual(10, listArray[dbIdx][2].BaseSequence.Length, $"[{dbIdx}] insertion length should be 8 + 2 = 10");
+                Assert.AreEqual(8, listArray[dbIdx][3].BaseSequence.Length, $"[{dbIdx}] deletion length should be 10 - 2 = 8");
+                Assert.AreEqual(10, listArray[dbIdx][4].BaseSequence.Length, $"[{dbIdx}] insertion+mod length should be 10");
+
+                // SAV assertions: P(4)->V
+                var sav = listArray[dbIdx][0].AppliedSequenceVariations.Single();
+                Assert.AreEqual(4, sav.OneBasedBeginPosition, $"[{dbIdx}] SAV begin");
+                Assert.AreEqual(4, sav.OneBasedEndPosition, $"[{dbIdx}] SAV end");
+                Assert.AreEqual("P", sav.OriginalSequence, $"[{dbIdx}] SAV original");
+                Assert.AreEqual("V", sav.VariantSequence, $"[{dbIdx}] SAV variant");
+                Assert.AreEqual('V', listArray[dbIdx][0].BaseSequence[3], $"[{dbIdx}] SAV residue at 4 must be 'V'");
+
+                // MNV assertions: PT(4..5)->KT
+                var mnv = listArray[dbIdx][1].AppliedSequenceVariations.Single();
+                Assert.AreEqual(4, mnv.OneBasedBeginPosition, $"[{dbIdx}] MNV begin");
+                Assert.AreEqual(5, mnv.OneBasedEndPosition, $"[{dbIdx}] MNV end");
+                Assert.AreEqual("PT", mnv.OriginalSequence, $"[{dbIdx}] MNV original");
+                Assert.AreEqual("KT", mnv.VariantSequence, $"[{dbIdx}] MNV variant");
+                Assert.AreEqual("KT", listArray[dbIdx][1].BaseSequence.Substring(3, 2), $"[{dbIdx}] MNV residues 4..5 must be 'KT'");
+
+                // Insertion-like assertions: P(4)->PPP(4..6)
+                var ins = listArray[dbIdx][2].AppliedSequenceVariations.Single();
+                Assert.AreEqual(4, ins.OneBasedBeginPosition, $"[{dbIdx}] insertion begin");
+                Assert.AreEqual(6, ins.OneBasedEndPosition, $"[{dbIdx}] insertion end");
+                Assert.AreEqual("P", ins.OriginalSequence, $"[{dbIdx}] insertion original");
+                Assert.AreEqual("PPP", ins.VariantSequence, $"[{dbIdx}] insertion variant");
+                Assert.AreEqual("PPP", listArray[dbIdx][2].BaseSequence.Substring(3, 3), $"[{dbIdx}] insertion residues 4..6 must be 'PPP'");
+
+                // Deletion-like assertions: PPP(4..6)->P(4) (collapses back to MPEPTIDE)
+                var del = listArray[dbIdx][3].AppliedSequenceVariations.Single();
+                Assert.AreEqual(4, del.OneBasedBeginPosition, $"[{dbIdx}] deletion begin");
+                Assert.AreEqual(4, del.OneBasedEndPosition, $"[{dbIdx}] deletion end (post-collapse)");
+                Assert.AreEqual("PPP", del.OriginalSequence, $"[{dbIdx}] deletion original");
+                Assert.AreEqual("P", del.VariantSequence, $"[{dbIdx}] deletion variant");
+
+                // Insertion + Modification assertions: identical sequence to insertion, plus one mod at pos 5
+                var insMod = listArray[dbIdx][4].AppliedSequenceVariations.Single();
+                Assert.AreEqual(4, insMod.OneBasedBeginPosition, $"[{dbIdx}] insertion+mod begin");
+                Assert.AreEqual(6, insMod.OneBasedEndPosition, $"[{dbIdx}] insertion+mod end");
+                Assert.AreEqual("P", insMod.OriginalSequence, $"[{dbIdx}] insertion+mod original");
+                Assert.AreEqual("PPP", insMod.VariantSequence, $"[{dbIdx}] insertion+mod variant");
+
+                // Mods: Only the "insertion+mod" variant (index 4) carries a mod; all others should have none
+                // Confirm the variant 4 has exactly one mod at one-based position 5, and the residue at 5 is 'P' (motif).
+                if (dbIdx == 0 || dbIdx == 1 || dbIdx == 2) // only assert detailed mod state once per path for clarity
+                {
+                    // Index 2 (plain insertion) must have no mods
+                    Assert.AreEqual(0, listArray[dbIdx][2].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion should have no mods");
+
+                    // Index 4 (insertion+mod) must have exactly one mod at site 5, targeting a P residue
+                    Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion+mod should have exactly one site with mods");
+                    Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key, $"[{dbIdx}] insertion+mod site should be one-based index 5");
+                    Assert.AreEqual('P', listArray[dbIdx][4].BaseSequence[5 - 1], $"[{dbIdx}] insertion+mod residue at site 5 must be 'P' (motif)");
+
+                    // All other variants should have zero possible localized mods
+                    Assert.AreEqual(0, listArray[dbIdx][0].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] SAV should have no mods");
+                    Assert.AreEqual(0, listArray[dbIdx][1].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] MNV should have no mods");
+                    Assert.AreEqual(0, listArray[dbIdx][3].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] deletion should have no mods");
+                }
+            }
 
-                // deletion
-                Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition);
-                Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition);
+            // Additional cross-list checks:
+            // - The two "MPEPPPTIDE" variants (indices 2 and 4) should be sequence-identical, but differ by modification presence.
+            for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
+            {
+                Assert.AreEqual(listArray[dbIdx][2].BaseSequence, listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion and insertion+mod sequences must match");
+                Assert.AreEqual(0, listArray[dbIdx][2].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion should remain unmodified");
+                Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion+mod should remain modified");
             }
         }
-
         [Test]
         public static void CrashOnCreateVariantFromRNA()
         {
@@ -401,43 +664,277 @@ public static void CrashOnCreateVariantFromRNA()
                 proteins[0].CreateVariant(proteins[0].BaseSequence, rna, [], [], new Dictionary<int, List<Modification>>(), "");
             });
         }
-
         [Test]
         public static void StopGained()
         {
-            var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), true,
-                DecoyType.None, null, false, null, out var unknownModifications);
+            // PURPOSE
+            // This test validates correct handling of a stop-gained sequence variation (creation of a premature stop codon).
+            // Verifies two scenarios:
+            // 1) Default variant depth filtering: both reference (no variant applied) and alternate (stop-gained applied) proteins are emitted.
+            // 2) High min-allele-depth threshold: only the stop-gained (applied) protein is retained.
+            //
+            // EXPECTATIONS SUMMARY (based on the StopGained.xml test data):
+            // - Two proteins are initially produced (reference + variant-applied):
+            //   [0] Reference protein:
+            //       * 1 sequence variation present in metadata, but 0 applied (reference form).
+            //       * BaseSequence length = 191.
+            //       * Residue at one-based position 161 equals 'Q' (so zero-based index 160 is 'Q').
+            //   [1] Variant-applied protein (stop-gained):
+            //       * Exactly 1 applied variation.
+            //       * BaseSequence length truncated to 161 - 1 = 160 (stop codon at 161 shortens the sequence).
+            //       * The sequence is exactly the prefix of the reference up to length 160.
+            //       * No '*' appears in the resulting BaseSequence (the stop codon is not a literal character in the sequence).
+            //       * The applied variant's VariantSequence ends with '*'.
+            //       * The applied variant one-based begin (and end) position is 161.
+            //
+            // - With minAlleleDepth: 400
+            //   * Only the variant-applied protein is returned.
+            //   * It retains the same applied-variation and truncated length expectations.
+
+            // Load the proteins with default filtering
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+                true,               // generateTargets
+                DecoyType.None,     // decoyType
+                null,               // allKnownModifications
+                false,              // isContaminant
+                null,               // modTypesToExclude
+                out var unknownModifications);
+
+            // Sanity: Decoys are not requested
+            Assert.IsTrue(proteins.All(p => !p.IsDecoy), "No decoys expected when using DecoyType.None");
+
+            // Expect exactly two proteins: reference (no applied variant) and stop-gained (applied)
+            Assert.AreEqual(2, proteins.Count, "Expected reference and stop-gained variant proteins");
+            // Reference protein metadata: one possible sequence variation in the record
+            Assert.AreEqual(1, proteins[0].SequenceVariations.Count(), "Reference metadata should contain one sequence variation");
+            Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Reference metadata should contain exactly one unique sequence variation");
+            // Reference should have zero applied variations (reference form retained)
+            Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Count(), "Reference protein should not have an applied sequence variation");
+            Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Reference applied variations should be zero");
+
+            // Variant-applied protein: applied variation present and unique
+            Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Count(), "Variant protein should have exactly one applied variation");
+            Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Variant protein applied variations should be unique");
+
+            // Reference length and residue checks around the stop site
+            Assert.AreEqual(191, proteins[0].Length, "Reference protein length should match source data");
+            Assert.AreEqual('Q', proteins[0][161 - 1], "Reference residue at 161 should be 'Q' prior to stop-gain");
+
+            // Variant length must be truncated by the stop at position 161 → length becomes 160
+            Assert.AreEqual(161 - 1, proteins[1].Length, "Variant protein length should be truncated to 160 due to stop at 161");
+
+            // The variant BaseSequence must be exactly the prefix of the reference up to the stop position - 1
+            string reference = proteins[0].BaseSequence;
+            string variant = proteins[1].BaseSequence;
+            Assert.IsTrue(reference.StartsWith(variant), "Variant sequence must be a prefix of the reference sequence");
+            Assert.AreEqual(reference.Substring(0, 161 - 1), variant, "Variant sequence must equal reference[0..159]");
+
+            // Ensure we did not write literal '*' into the protein sequence; stop codon is represented by truncation instead
+            Assert.AreEqual(-1, variant.IndexOf('*'), "Variant BaseSequence should not contain a literal '*'");
+
+            // Verify applied-variant details for the stop-gained protein
+            var applied = proteins[1].AppliedSequenceVariations.Single();
+            Assert.IsTrue(applied.VariantSequence.EndsWith("*"), "Stop-gained variant must end with '*'");
+            Assert.AreEqual(161, applied.OneBasedBeginPosition, "Stop-gained begins at residue 161");
+            Assert.AreEqual(161, applied.OneBasedEndPosition, "Stop-gained ends at residue 161 (single residue change)");
+
+            // The two forms must differ in sequence length (as a sanity check)
+            Assert.AreNotEqual(proteins[0].Length, proteins[1].Length, "Reference and variant proteins should differ in length");
+
+            // Now require a higher min-allele-depth; expect only the variant-applied protein retained
+            proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+                true,               // generateTargets
+                DecoyType.None,     // decoyType
+                null,               // allKnownModifications
+                false,              // isContaminant
+                null,               // modTypesToExclude
+                out unknownModifications,
+                minAlleleDepth: 400);
+
+            // Only the stop-gained, variant-applied form is retained under a strict depth threshold
+            Assert.AreEqual(1, proteins.Count, "High min-allele-depth should retain only the variant-applied protein");
+            Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Count(), "Variant-applied protein should still have one applied variation");
+            Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Variant-applied unique variation should be retained");
+            Assert.AreEqual(161 - 1, proteins[0].Length, "Variant-applied protein length should remain truncated to 160");
+
+            // Confirm stability: the single protein from the depth-filtered load matches the previously observed variant sequence
+            Assert.AreEqual(variant, proteins[0].BaseSequence, "Depth-filtered variant sequence should match previously observed variant");
+
+            // Re-check applied-variant semantics after filtering for completeness
+            var appliedAfterFilter = proteins[0].AppliedSequenceVariations.Single();
+            Assert.IsTrue(appliedAfterFilter.VariantSequence.EndsWith("*"), "Stop-gained variant must end with '*' (after filtering)");
+            Assert.AreEqual(161, appliedAfterFilter.OneBasedBeginPosition, "Stop-gained begins at residue 161 (after filtering)");
+            Assert.AreEqual(161, appliedAfterFilter.OneBasedEndPosition, "Stop-gained ends at residue 161 (after filtering)");
+        }
+        [Test]
+        public static void StopGained_TruncationIsPrefixAndNoOutOfBoundsAnnotations()
+        {
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+                true, DecoyType.None, null, false, null, out var _);
+
             Assert.AreEqual(2, proteins.Count);
-            Assert.AreEqual(1, proteins[0].SequenceVariations.Count()); // some redundant
-            Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
-            Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Count()); // some redundant
-            Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
-            Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Count()); // some redundant
-            Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
-            Assert.AreEqual(191, proteins[0].Length);
-            Assert.AreEqual('Q', proteins[0][161 - 1]);
-            Assert.AreEqual(161 - 1, proteins[1].Length);
-            Assert.AreNotEqual(proteins[0].Length, proteins[1].Length);
+            var reference = proteins[0];
+            var truncated = proteins[1];
 
-            proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), true,
-                DecoyType.None, null, false, null, out unknownModifications, minAlleleDepth: 400);
-            Assert.AreEqual(1, proteins.Count);
-            Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Count()); // some redundant
-            Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
-            Assert.AreEqual(161 - 1, proteins[0].Length);
+            // The truncated sequence must be a prefix of the reference,
+            // i.e., identical up to the truncation point.
+            Assert.That(reference.BaseSequence.StartsWith(truncated.BaseSequence));
+
+            // Any possible localized modifications must not point past the truncation boundary.
+            Assert.That(truncated.OneBasedPossibleLocalizedModifications
+                .All(kv => kv.Key >= 1 && kv.Key <= truncated.Length));
+
+            // Any proteolysis products (if present) must not reference indices outside the sequence.
+            Assert.That(truncated.TruncationProducts.All(tp =>
+                (!tp.OneBasedBeginPosition.HasValue || (tp.OneBasedBeginPosition.Value >= 1 && tp.OneBasedBeginPosition.Value <= truncated.Length)) &&
+                (!tp.OneBasedEndPosition.HasValue || (tp.OneBasedEndPosition.Value >= 1 && tp.OneBasedEndPosition.Value <= truncated.Length))));
+
+            // The applied stop-gained variation often encodes a '*' in the variant sequence.
+            // If present, that indicates stop; the actual sequence is cut at the stop.
+            if (truncated.AppliedSequenceVariations.Any())
+            {
+                Assert.That(truncated.AppliedSequenceVariations.Single().VariantSequence.EndsWith("*") ||
+                            !truncated.AppliedSequenceVariations.Single().VariantSequence.Contains("*"));
+            }
         }
+        [Test]
+        public static void StopGained_RoundTripSerializationPreservesTruncation()
+        {
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+                true, DecoyType.None, null, false, null, out var _);
+
+            Assert.AreEqual(2, proteins.Count);
+            var tempPath = Path.Combine(TestContext.CurrentContext.TestDirectory, $"StopGained_roundtrip_{Guid.NewGuid()}.xml");
 
+            try
+            {
+                // Persist both proteins (reference + variant) and reload.
+                ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteins, tempPath);
+                var roundtrip = ProteinDbLoader.LoadProteinXML(tempPath, true, DecoyType.None, null, false, null, out var __);
+
+                // Round-trip preserves count and the truncation boundary for the variant-applied protein.
+                Assert.AreEqual(2, roundtrip.Count);
+                Assert.AreEqual(proteins[0].Length, roundtrip[0].Length);
+                Assert.AreEqual(proteins[1].Length, roundtrip[1].Length);
+                Assert.AreEqual(proteins[1].AppliedSequenceVariations.Count(), roundtrip[1].AppliedSequenceVariations.Count());
+            }
+            finally
+            {
+                if (File.Exists(tempPath))
+                {
+                    File.SetAttributes(tempPath, FileAttributes.Normal);
+                    File.Delete(tempPath);
+                }
+            }
+        }
         [Test]
-        public static void StopGainedDecoysAndDigestion()
+        public static void StopGained_NoPeptidesCrossTruncationSite()
         {
-            // test decoys and digestion
-            var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGain.xml"), true,
-                DecoyType.Reverse, null, false, null, out var unknownModifications, minAlleleDepth: 400);
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+                true, DecoyType.None, null, false, null, out var _);
+
             Assert.AreEqual(2, proteins.Count);
-            var targetPeps = proteins[0].Digest(new DigestionParams(), null, null).ToList();
-            var decoyPeps = proteins[1].Digest(new DigestionParams(), null, null).ToList();
-            //Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length));
-            //Assert.AreEqual(targetPeps.Count, decoyPeps.Count);
+            var reference = proteins[0];
+            var truncated = proteins[1];
+
+            // Peptides from the truncated protein must not reference indices past the truncation boundary.
+            var dp = new DigestionParams();
+            var variantPeps = truncated.Digest(dp, null, null).ToList();
+            Assert.That(variantPeps.All(p => p.OneBasedEndResidueInProtein <= truncated.Length));
+
+            // Any peptide in the reference that extends past the truncation boundary cannot exist in the variant.
+            var refPeps = reference.Digest(dp, null, null).ToList();
+            var refCrossing = refPeps.Where(p => p.OneBasedEndResidueInProtein > truncated.Length).ToList();
+            var variantPepWindows = new HashSet<(int start, int end)>(variantPeps.Select(p => (p.OneBasedStartResidueInProtein, p.OneBasedEndResidueInProtein)));
+            Assert.That(refCrossing.All(p => !variantPepWindows.Contains((p.OneBasedStartResidueInProtein, p.OneBasedEndResidueInProtein))));
+        }
+        [Test]
+        public static void StopGainedDecoysAndDigestion()
+        {
+            // PURPOSE
+            // This test ensures that:
+            // 1) Reverse decoys are generated for a stop-gained (truncated) target protein.
+            // 2) The target and its decoy digest into peptides without referencing residues outside their sequence bounds.
+            // 3) Basic invariants hold for decoy generation (count/order/length/decoy flag).
+            //
+            // CONTEXT
+            // - Database: "StopGain.xml" contains a target protein with a stop-gained variant that truncates the sequence.
+            // - Decoys: Generated using DecoyType.Reverse (sequence reversal-based decoys).
+            // - minAlleleDepth: 400 ensures the stop-gained variant is applied (truncated target).
+            //
+            // EXPECTATIONS
+            // - Exactly 2 proteins are returned: [0] target (non-decoy) + [1] decoy (IsDecoy = true).
+            // - Target and decoy lengths are equal (reverse decoys preserve length).
+            // - Both target and decoy produce peptides via digestion.
+            // - No peptide references indices outside its parent protein's 1..Length range.
+            // - Accessions reflect decoy generation (decoy starts with the default "DECOY_").
+            // - If variant(s) are present, their counts match between target and decoy.
+
+            // Arrange: Load a variant-applied protein set and reverse decoy pair from StopGain.xml.
+            // Using a strict minAlleleDepth applies the stop-gained variant to the target.
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGain.xml"),
+                true,                 // generateTargets
+                DecoyType.Reverse,    // generate reverse-sequence decoys
+                null,                 // allKnownModifications
+                false,                // isContaminant
+                null,                 // modTypesToExclude
+                out var unknownModifications,
+                minAlleleDepth: 400); // force applying the stop-gained variant
+
+            // Assert: We expect exactly two proteins: target then its decoy.
+            Assert.AreEqual(2, proteins.Count, "Expected 1 target + 1 decoy");
+            Assert.IsFalse(proteins[0].IsDecoy, "First protein should be the target (non-decoy)");
+            Assert.IsTrue(proteins[1].IsDecoy, "Second protein should be the decoy");
+            Assert.That(proteins[1].Accession.StartsWith("DECOY_"), "Decoy accession should start with the default decoy identifier");
+
+            // Assert: Reverse decoys should preserve sequence length.
+            Assert.AreEqual(proteins[0].Length, proteins[1].Length, "Target and decoy should have identical lengths");
+            // In general, target and decoy sequences should not be byte-identical.
+            Assert.AreNotEqual(proteins[0].BaseSequence, proteins[1].BaseSequence, "Decoy sequence should differ from target sequence");
+
+            // If the stop-gained variant is applied to the target, the decoy should carry a corresponding variant count.
+            // We do not enforce exact mapping positions here, only that counts match if any are present.
+            if (proteins[0].AppliedSequenceVariations.Any() || proteins[1].AppliedSequenceVariations.Any())
+            {
+                Assert.AreEqual(
+                    proteins[0].AppliedSequenceVariations.Count(),
+                    proteins[1].AppliedSequenceVariations.Count(),
+                    "Target and decoy should carry the same number of applied sequence variations");
+            }
+
+            // Act: Digest both target and decoy using default digestion parameters (typically trypsin-like rules).
+            var dp = new DigestionParams();
+            var targetPeps = proteins[0].Digest(dp, null, null).ToList();
+            var decoyPeps = proteins[1].Digest(dp, null, null).ToList();
+
+            // Assert: Both should yield peptides.
+            Assert.That(targetPeps.Count > 0, "Target digestion should produce peptides");
+            Assert.That(decoyPeps.Count > 0, "Decoy digestion should produce peptides");
+
+            // Assert: No peptide references residues outside the corresponding protein bounds.
+            Assert.That(targetPeps.All(p =>
+                p.OneBasedStartResidueInProtein >= 1 &&
+                p.OneBasedEndResidueInProtein <= proteins[0].Length),
+                "All target peptides must fall within target bounds");
+
+            Assert.That(decoyPeps.All(p =>
+                p.OneBasedStartResidueInProtein >= 1 &&
+                p.OneBasedEndResidueInProtein <= proteins[1].Length),
+                "All decoy peptides must fall within decoy bounds");
+
+            // Note:
+            // We intentionally do NOT assert the number of peptides or the sum of peptide lengths to be equal between
+            // target and decoy. Even with reverse decoys, tryptic cleavage context differs and may alter cleavage patterns.
+            // The commented lines below are often too strict and can fail legitimately:
+            //
+            // Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length));
+            // Assert.AreEqual(targetPeps.Count, decoyPeps.Count);
         }
 
         [Test]
@@ -490,7 +987,7 @@ public void VariantSymbolWeirdnessXml()
             string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "SeqVarSymbolWeirdness.xml");
             List<Protein> variantProteins = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un);
             Assert.AreEqual(12, variantProteins.First().ConsensusVariant.SequenceVariations.Count());
-            Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.Description.Heterozygous.Any(kv => kv.Value)));
+            Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.VariantCallFormatDataString.Heterozygous.Any(kv => kv.Value)));
 
             Assert.AreEqual(1, variantProteins.Count); // Should be 2^2 from combinitorics of heterozygous, but the giant indels overwrite them
             Assert.AreEqual(0, variantProteins.Where(v => v.BaseSequence == variantProteins.First().ConsensusVariant.BaseSequence).Count()); // Homozygous variations are included
@@ -560,21 +1057,159 @@ public void IndelDecoyVariants()
         [Test]
         public void SequenceVariationIsValidTest()
         {
-            SequenceVariation sv1 = new SequenceVariation(10, 10, "A", "T", "info", null);
-            SequenceVariation sv2 = new SequenceVariation(5, 5, "G", "C", "info", null);
-            SequenceVariation sv3 = new SequenceVariation(8, 8, "T", "A", "info", null);
+            // PURPOSE
+            // Validate the minimal, position-only "validity" rules implemented by SequenceVariation.AreValid():
+            //   AreValid() == (OneBasedBeginPosition > 0) && (OneBasedEndPosition >= OneBasedBeginPosition)
+            //
+            // We cover:
+            // 1) Explicit begin/end ctor with typical point mutations → valid.
+            // 2) Explicit begin/end ctor with invalid positions → invalid.
+            // 3) Explicit begin/end ctor representing insertion/deletion edge-cases → valid as long as positions are valid.
+            // 4) One-position convenience ctor behavior for different originalSequence values (null, "", length > 0).
+            //    - This ctor derives end as: end = (original == null) ? begin : begin + original.Length - 1.
+            //    - Therefore, empty originalSequence "" makes end = begin - 1 → invalid by design.
+            // 5) Content fields (Original/Variant) and OneBasedModifications do NOT affect AreValid(), only positions do.
+            // 6) Optional sanity checks on derived fields (SimpleString and computed end position).
+
+            // -----------------------------
+            // 1) Valid: explicit begin/end point mutations (begin == end, begin > 0)
+            // -----------------------------
+            SequenceVariation sv1 = new SequenceVariation(
+                oneBasedBeginPosition: 10, oneBasedEndPosition: 10,
+                originalSequence: "A", variantSequence: "T",
+                description: "info", oneBasedModifications: null);
+            SequenceVariation sv2 = new SequenceVariation(
+                oneBasedBeginPosition: 5, oneBasedEndPosition: 5,
+                originalSequence: "G", variantSequence: "C",
+                description: "info", oneBasedModifications: null);
+            SequenceVariation sv3 = new SequenceVariation(
+                oneBasedBeginPosition: 8, oneBasedEndPosition: 8,
+                originalSequence: "T", variantSequence: "A",
+                description: "info", oneBasedModifications: null);
+
+            // A protein can carry multiple variations; positions alone determine validity.
             List<SequenceVariation> svList = new List<SequenceVariation> { sv1, sv2, sv3 };
-
             Protein variantProtein = new Protein("ACDEFGHIKLMNPQRSTVWY", "protein1", sequenceVariations: svList);
-            Assert.IsTrue(variantProtein.SequenceVariations.All(v => v.AreValid()));
-            SequenceVariation svInvalidOneBasedBeginLessThanOne = new SequenceVariation(0, 10, "A", "T", "info", null);
-            SequenceVariation svInvalidOneBasedEndLessThanOneBasedBegin = new SequenceVariation(5, 4, "G", "C", "info", null);
-            SequenceVariation svValidOriginalSequenceIsEmpty = new SequenceVariation(8, 8, "", "A", "info", null);
-            SequenceVariation svValidVariantSequenceLenthIsZero = new SequenceVariation(10, 10, "A", "", "info", null);
-            Assert.IsFalse(svInvalidOneBasedBeginLessThanOne.AreValid());
-            Assert.IsFalse(svInvalidOneBasedEndLessThanOneBasedBegin.AreValid());
-            Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid()); //This is valid because it is an insertion
-            Assert.IsTrue(svValidVariantSequenceLenthIsZero.AreValid()); // This is valid because it is a deletion
+
+            // Expectation: all three above are valid (begin > 0 and end == begin).
+            Assert.IsTrue(variantProtein.SequenceVariations.All(v => v.AreValid()), "All explicit point mutations with valid positions should be valid");
+
+            // -----------------------------
+            // 2) Invalid: begin < 1 and end < begin
+            // -----------------------------
+            SequenceVariation svInvalidOneBasedBeginLessThanOne = new SequenceVariation(
+                oneBasedBeginPosition: 0, oneBasedEndPosition: 10,
+                originalSequence: "A", variantSequence: "T",
+                description: "info", oneBasedModifications: null);
+            Assert.IsFalse(svInvalidOneBasedBeginLessThanOne.AreValid(), "Begin position must be >= 1");
+
+            SequenceVariation svInvalidOneBasedEndLessThanOneBasedBegin = new SequenceVariation(
+                oneBasedBeginPosition: 5, oneBasedEndPosition: 4,
+                originalSequence: "G", variantSequence: "C",
+                description: "info", oneBasedModifications: null);
+            Assert.IsFalse(svInvalidOneBasedEndLessThanOneBasedBegin.AreValid(), "End position cannot be less than begin position");
+
+            // -----------------------------
+            // 3) Explicit begin/end edge-cases: insertion and deletion modeled by content only
+            //    NOTE: AreValid ignores Original/Variant content; only positions matter.
+            // -----------------------------
+            // Insertion-like (explicit): original is empty (""), variant has content.
+            // Valid because we explicitly supply begin == end (positions are valid).
+            SequenceVariation svValidOriginalSequenceIsEmpty = new SequenceVariation(
+                oneBasedBeginPosition: 8, oneBasedEndPosition: 8,
+                originalSequence: "", variantSequence: "A",
+                description: "info", oneBasedModifications: null);
+            Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid(), "Explicit insertion with valid positions should be considered valid");
+
+            // Deletion-like (explicit): variant is empty (""), original has content, positions still valid.
+            SequenceVariation svValidVariantSequenceLengthIsZero = new SequenceVariation(
+                oneBasedBeginPosition: 10, oneBasedEndPosition: 10,
+                originalSequence: "A", variantSequence: "",
+                description: "info", oneBasedModifications: null);
+            Assert.IsTrue(svValidVariantSequenceLengthIsZero.AreValid(), "Explicit deletion with valid positions should be considered valid");
+
+            // -----------------------------
+            // 4) One-position convenience ctor behavior for originalSequence values
+            //    ctor: SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, ...)
+            //    end is computed as:
+            //      - if original == null  → end = begin
+            //      - else                  → end = begin + original.Length - 1
+            // -----------------------------
+
+            // 4a) originalSequence has length 1 → end == begin (valid)
+            var svPosCtorLength1 = new SequenceVariation(
+                oneBasedPosition: 15,
+                originalSequence: "K",
+                variantSequence: "R",
+                description: "pos-ctor length 1");
+            Assert.AreEqual(15, svPosCtorLength1.OneBasedBeginPosition);
+            Assert.AreEqual(15, svPosCtorLength1.OneBasedEndPosition, "With original length 1, end should equal begin");
+            Assert.IsTrue(svPosCtorLength1.AreValid(), "Single-site variation via position ctor should be valid");
+
+            // 4b) originalSequence has length > 1 → end == begin + len - 1 (valid)
+            var svPosCtorLength3 = new SequenceVariation(
+                oneBasedPosition: 20,
+                originalSequence: "PEP",   // len = 3
+                variantSequence: "AAA",    // content irrelevant to AreValid
+                description: "pos-ctor length 3");
+            Assert.AreEqual(20, svPosCtorLength3.OneBasedBeginPosition);
+            Assert.AreEqual(22, svPosCtorLength3.OneBasedEndPosition, "End should be begin + original.Length - 1");
+            Assert.IsTrue(svPosCtorLength3.AreValid(), "Multi-length replacement with valid positions should be valid");
+
+            // 4c) originalSequence == null → end = begin (valid)
+            var svPosCtorNullOriginal = new SequenceVariation(
+                oneBasedPosition: 30,
+                originalSequence: null,    // special case handled in ctor: end = begin
+                variantSequence: "A",
+                description: "pos-ctor null original");
+            Assert.AreEqual(30, svPosCtorNullOriginal.OneBasedBeginPosition);
+            Assert.AreEqual(30, svPosCtorNullOriginal.OneBasedEndPosition, "Null original should set end = begin");
+            Assert.IsTrue(svPosCtorNullOriginal.AreValid(), "Null original via position ctor is treated as length 1 (valid)");
+
+            // 4d) originalSequence == "" (empty) → end = begin - 1 (invalid by design)
+            //     This models an insertion if you rely solely on the position ctor, but produces end < begin → invalid.
+            //     For insertions, prefer the explicit begin/end ctor with valid positions (see 3).
+            var svPosCtorEmptyOriginal = new SequenceVariation(
+                oneBasedPosition: 40,
+                originalSequence: "",      // empty → end = 39
+                variantSequence: "A",
+                description: "pos-ctor empty original");
+            Assert.AreEqual(40, svPosCtorEmptyOriginal.OneBasedBeginPosition);
+            Assert.AreEqual(39, svPosCtorEmptyOriginal.OneBasedEndPosition, "Empty original sets end = begin - 1");
+            Assert.IsFalse(svPosCtorEmptyOriginal.AreValid(), "Position ctor with empty original is invalid (end < begin)");
+
+            // -----------------------------
+            // 5) Validity is position-only; content and mods do not change AreValid()
+            //     - VariantSequence null is normalized to "" in the ctor.
+            //     - OneBasedModifications is stored but ignored by AreValid().
+            // -----------------------------
+            var mods = new Dictionary<int, List<Modification>>
+            {
+                { 1, new List<Modification>() } // empty mod list at a site; still ignored by AreValid
+            };
+            var svContentIrrelevant = new SequenceVariation(
+                oneBasedBeginPosition: 3, oneBasedEndPosition: 3,
+                originalSequence: "M", variantSequence: null, // becomes ""
+                description: "mods/variant null test", oneBasedModifications: mods);
+            Assert.IsTrue(svContentIrrelevant.AreValid(), "Null variant and/or mods should not affect positional validity");
+            Assert.AreEqual("", svContentIrrelevant.VariantSequence, "Null VariantSequence is normalized to empty string");
+
+            // -----------------------------
+            // 6) Sanity: SimpleString format and positive bounds at the edge of sequence
+            // -----------------------------
+            var svSimple = new SequenceVariation(
+                oneBasedBeginPosition: 1, oneBasedEndPosition: 1,
+                originalSequence: "A", variantSequence: "V",
+                description: "simple");
+            // SimpleString = Original + Begin + Variant (no delimiter)
+            Assert.AreEqual("A1V", svSimple.SimpleString(), "SimpleString should concatenate original + begin + variant");
+
+            // Additional guard: begin == 1 is valid if end >= begin
+            var svAtStart = new SequenceVariation(
+                oneBasedBeginPosition: 1, oneBasedEndPosition: 2,
+                originalSequence: "MA", variantSequence: "MV",
+                description: "range at start");
+            Assert.IsTrue(svAtStart.AreValid(), "Ranges that start at 1 are valid provided end >= begin");
         }
         [Test]
         public void VariantModificationTest()
@@ -644,54 +1279,294 @@ public void WriteProteinXmlWithVariantsDiscoveredAsModifications2()
             Assert.That(newProtein.SequenceVariations.Count, Is.EqualTo(totalSequenceVariations + 1)); //This number increases by 1 because we added a sequence variation that was discovered as a modification
             Assert.AreEqual(0,newProtein.OneBasedPossibleLocalizedModifications.Count); //This number should be 0 because we converted the modification to a sequence variation
         }
-
+        /// <summary>
+        /// PURPOSE
+        /// Ensures that variant proteins are automatically generated during XML read, both for targets and reverse decoys.
+        /// The database "humanGAPDH.xml" encodes two single-nucleotide substitutions on the target protein P04406:
+        /// - A22G
+        /// - K251N
+        ///
+        /// EXPECTATIONS
+        /// - Loader emits all combinatorial target variants derived from those two changes:
+        ///   [0] Reference (no variants applied)                         → Accession "P04406"
+        ///   [1] Single variant A22G                                     → Accession "P04406_A22G"
+        ///   [2] Single variant K251N                                    → Accession "P04406_K251N"
+        ///   [3] Double variant K251N + A22G (combined)                  → Accession "P04406_K251N_A22G"
+        /// - With DecoyType.Reverse, a matching set of 4 reverse decoys is produced with mirrored coordinates:
+        ///   [4] Decoy of reference                                      → "DECOY_P04406"
+        ///   [5] Decoy with mirrored A22G (mapped to site 315)           → "DECOY_P04406_A315G"
+        ///   [6] Decoy with mirrored K251N (mapped to site 86)           → "DECOY_P04406_K86N"
+        ///   [7] Decoy with both mirrored variants                       → "DECOY_P04406_K86N_A315G"
+        ///
+        /// WHY THIS MATTERS
+        /// - Validates that the reader expands sequence variation definitions into concrete variant proteins.
+        /// - Verifies decoy generation mirrors variant coordinates appropriately and preserves ordering.
+        /// - Guards against regressions in accession naming, variant application counts, and decoy parity.
+        ///
+        /// PARAMETERS PASSED TO LOADER
+        /// - generateTargets: true        → emit target proteins
+        /// - decoyType: Reverse           → also emit reverse decoys
+        /// - allKnownModifications: UniProtPtms → resolve any UniProt-annotated PTMs so no "unknown" mods remain
+        /// - isContaminant: false
+        /// - modTypesToExclude: null
+        /// - out unknownModifications     → capture any unrecognized mods (should be empty when UniProtPtms is provided)
+        /// - minAlleleDepth: 1            → do not filter out these low-depth test variants
+        /// - maxHeterozygousVariants: 99  → allow generating all combinations from the two sites (up to 2^2)
+        /// </summary>
         [Test]
         public static void TestThatProteinVariantsAreGeneratedDuringRead()
         {
+            // Arrange: load a target with two site-specific variants and request reverse decoys as well.
+            // IMPORTANT: Provide UniProtPtms so annotations in humanGAPDH.xml resolve and do not end up in unknownModifications.
             string databaseName = "humanGAPDH.xml";
-            var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true,
-                DecoyType.Reverse, null, false, null, out var unknownModifications, 1, 99);
-            Assert.AreEqual(8, proteins.Count); // 4 target + 4 decoy
-            Assert.AreEqual(2, proteins[0].SequenceVariations.Count()); // these sequence variations were in the original
-            Assert.That("P04406", Is.EqualTo(proteins[0].Accession));
-            Assert.That("P04406_A22G", Is.EqualTo(proteins[1].Accession));
-            Assert.That("P04406_K251N", Is.EqualTo(proteins[2].Accession));
-            Assert.That("P04406_K251N_A22G", Is.EqualTo(proteins[3].Accession));
-            Assert.That("DECOY_P04406", Is.EqualTo(proteins[4].Accession));
-            Assert.That("DECOY_P04406_A315G", Is.EqualTo(proteins[5].Accession));
-            Assert.That("DECOY_P04406_K86N", Is.EqualTo(proteins[6].Accession));
-            Assert.That("DECOY_P04406_K86N_A315G", Is.EqualTo(proteins[7].Accession));
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName),
+                generateTargets: true,
+                decoyType: DecoyType.Reverse,
+                allKnownModifications: UniProtPtms, // CHANGED: was null; supplying known PTMs prevents unknownModifications from being populated
+                isContaminant: false,
+                modTypesToExclude: null,
+                unknownModifications: out var unknownModifications,
+                minAlleleDepth: 1,
+                maxHeterozygousVariants: 99);
+
+            // Basic shape: 4 targets + 4 reverse decoys in a deterministic order.
+            Assert.AreEqual(8, proteins.Count, "Expected 4 targets and 4 decoys in a fixed order");
+
+            // Targets/decoys split and flags should be consistent and easy to reason about.
+            Assert.AreEqual(4, proteins.Count(p => !p.IsDecoy), "First half should be targets");
+            Assert.AreEqual(4, proteins.Count(p => p.IsDecoy), "Second half should be decoys");
+            for (int i = 0; i < 4; i++)
+            {
+                Assert.IsFalse(proteins[i].IsDecoy, $"Index {i} should be a target");
+                Assert.IsTrue(proteins[i + 4].IsDecoy, $"Index {i + 4} should be a decoy");
+            }
+
+            // The reference target (index 0) carries two possible sequence variations in its metadata.
+            // This documents the test input and ensures the reader surfaced them.
+            Assert.AreEqual(2, proteins[0].SequenceVariations.Count(), "Reference should advertise exactly two possible sequence variations");
+
+            // Accessions must match exact, canonical variant labeling and order for both targets and decoys.
+            Assert.That("P04406", Is.EqualTo(proteins[0].Accession), "Reference target accession mismatch");
+            Assert.That("P04406_A22G", Is.EqualTo(proteins[1].Accession), "Single-variant (A22G) target accession mismatch");
+            Assert.That("P04406_K251N", Is.EqualTo(proteins[2].Accession), "Single-variant (K251N) target accession mismatch");
+            Assert.That("P04406_K251N_A22G", Is.EqualTo(proteins[3].Accession), "Double-variant target accession mismatch");
+
+            Assert.That("DECOY_P04406", Is.EqualTo(proteins[4].Accession), "Reference decoy accession mismatch");
+            Assert.That("DECOY_P04406_A315G", Is.EqualTo(proteins[5].Accession), "Decoy accession for mirrored A22G mismatch");
+            Assert.That("DECOY_P04406_K86N", Is.EqualTo(proteins[6].Accession), "Decoy accession for mirrored K251N mismatch");
+            Assert.That("DECOY_P04406_K86N_A315G", Is.EqualTo(proteins[7].Accession), "Decoy accession for double-variant mismatch");
+
+            // Sanity: accessions are non-empty and unique (avoid accidental duplication/shuffling).
+            Assert.That(proteins.All(p => !string.IsNullOrWhiteSpace(p.Accession)), "All proteins must have non-empty accessions");
+            Assert.AreEqual(proteins.Count, proteins.Select(p => p.Accession).Distinct().Count(), "Accessions must be unique");
+
+            // Each decoy should be length-equal to its corresponding target, but usually sequence-different (reverse).
+            for (int i = 0; i < 4; i++)
+            {
+                Assert.AreEqual(proteins[i].Length, proteins[i + 4].Length, $"Target/decoy length should match for index {i}");
+                Assert.AreNotEqual(proteins[i].BaseSequence, proteins[i + 4].BaseSequence, $"Decoy sequence should differ from its target for index {i}");
+            }
+
+            // Applied variant counts (how many variations were actually realized in this protein instance):
+            // Targets:    [0] ref=0, [1] A22G=1, [2] K251N=1, [3] both=2
+            // Decoys:     [4] ref=0, [5] A315G=1, [6] K86N=1, [7] both=2
+            int[] expectedAppliedCounts = { 0, 1, 1, 2, 0, 1, 1, 2 };
+            for (int i = 0; i < proteins.Count; i++)
+            {
+                Assert.AreEqual(expectedAppliedCounts[i], proteins[i].AppliedSequenceVariations.Count(),
+                    $"Applied variant count mismatch at index {i} ({proteins[i].Accession})");
+            }
+
+            // The specific applied-variant labels should match accessions:
+            // - For targets: "A22G" and/or "K251N"
+            // - For decoys:  mirrored positions → "A315G" and/or "K86N"
+            static HashSet<string> AppliedLabels(Protein p) =>
+                new HashSet<string>(p.AppliedSequenceVariations.Select(v => v.SimpleString()));
+
+            // Targets (indices 0..3)
+            Assert.That(AppliedLabels(proteins[0]).SetEquals(Array.Empty<string>()), "Reference target should have no applied variants");
+            Assert.That(AppliedLabels(proteins[1]).SetEquals(new[] { "A22G" }), "Single-variant target must be exactly A22G");
+            Assert.That(AppliedLabels(proteins[2]).SetEquals(new[] { "K251N" }), "Single-variant target must be exactly K251N");
+            Assert.That(AppliedLabels(proteins[3]).SetEquals(new[] { "A22G", "K251N" }), "Double-variant target should have A22G and K251N");
+
+            // Decoys (indices 4..7) have mirrored coordinates (due to reverse decoying).
+            Assert.That(AppliedLabels(proteins[4]).SetEquals(Array.Empty<string>()), "Reference decoy should have no applied variants");
+            Assert.That(AppliedLabels(proteins[5]).SetEquals(new[] { "A315G" }), "Single-variant decoy must be exactly A315G (mirror of A22G)");
+            Assert.That(AppliedLabels(proteins[6]).SetEquals(new[] { "K86N" }), "Single-variant decoy must be exactly K86N (mirror of K251N)");
+            Assert.That(AppliedLabels(proteins[7]).SetEquals(new[] { "A315G", "K86N" }), "Double-variant decoy should have A315G and K86N");
+
+            // Parity check: each target and its decoy should carry the same number of applied variations.
+            for (int i = 0; i < 4; i++)
+            {
+                Assert.AreEqual(
+                    proteins[i].AppliedSequenceVariations.Count(),
+                    proteins[i + 4].AppliedSequenceVariations.Count(),
+                    $"Applied variant count should match between target and decoy at index {i}");
+            }
+
+            // With UniProtPtms supplied, no unknown modifications should be reported.
+            if (unknownModifications != null && unknownModifications.Count > 0)
+            {
+                // Extra diagnostics to ease debugging in case of future schema/content changes.
+                TestContext.WriteLine("Unknown modifications reported by loader:");
+                foreach (var um in unknownModifications)
+                    TestContext.WriteLine($" - {um}");
+            }
+            Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from this input");
         }
+
         [Test]
         public static void ProteinVariantsReadAsModificationsWrittenAsVariants()
         {
+            // PURPOSE
+            // This test verifies the I/O pipeline that converts "nucleotide substitution" modifications
+            // embedded in an input protein XML into canonical "sequence variant" features when read,
+            // and persists them back out as proper <feature type="sequence variant"> entries when written.
+            //
+            // WHY
+            // - Some sources (e.g., GPTMD discovery) encode AA substitutions as modifications with
+            //   ModificationType = "1 nucleotide substitution". Internally, we want these represented
+            //   as SequenceVariations, not remaining as generic modifications.
+            // - On read: these substitution mods must be removed from the protein’s modifications collections
+            //   and turned into SequenceVariations.
+            // - On write: they must be serialized as sequence variant features with a standardized description
+            //   ("Putative GPTMD Substitution") and NOT re-serialized as modifications.
+            //
+            // EXPECTATIONS SUMMARY
+            // - Input file has 57 lines that contain "1 nucleotide substitution" (source encoding as mods).
+            // - After LoadProteinXML:
+            //     * We get exactly 9 proteins (DecoyType.None).
+            //     * Total SequenceVariations across all proteins = 194.
+            //     * Total OneBasedPossibleLocalizedModifications count across all proteins = 0
+            //       (i.e., all substitution mods became sequence variations).
+            //     * No unknown modifications should remain.
+            //     * No applied variants are expected (metadata only; we are not expanding combinatorics here).
+            // - After WriteXmlDatabase then re-load:
+            //     * Count and totals remain the same (9 proteins; 194 total sequence variations; 0 mods).
+            //     * The output file contains:
+            //         - Exactly 194 lines with feature type="sequence variant".
+            //         - Exactly 194 lines with "Putative GPTMD Substitution".
+            //         - Exactly 0 lines with "1 nucleotide substitution" (proved that mods were not serialized).
+            //
+            // NOTE
+            // - We keep DecoyType.None to avoid expanding the protein list.
+            // - We use minAlleleDepth = 1 and maxHeterozygousVariants = 0 to avoid variant-proteoform expansion.
+
+            // Arrange: locate the source database that encodes nucleotide substitutions as modifications
             string databaseName = "nucleotideVariantsAsModifications.xml";
+            string inputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName);
+            Assert.That(File.Exists(inputPath), Is.True, "Input database file must exist for this test");
+
+            // Sanity: confirm the source encodes substitutions as modifications in the raw XML
+            // (57 lines that mention "1 nucleotide substitution" in the file).
+            var inputLines = File.ReadAllLines(inputPath);
+            int inputSubstitutionModLines = inputLines.Count(l => l.Contains("1 nucleotide substitution"));
+            Assert.That(inputSubstitutionModLines, Is.EqualTo(57), "Source XML should contain 57 substitution-mod lines");
+
+            // Optional sanity: the source should not already be in sequence-variant form
+            int inputSeqVarFeatureLines = inputLines.Count(l => l.Contains("feature type=\"sequence variant\""));
+            Assert.That(inputSeqVarFeatureLines, Is.EqualTo(0), "Source XML should not already encode sequence variants");
+
+            // Act: read the database. Expect conversion to SequenceVariations and removal from modifications
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                inputPath,
+                generateTargets: true,
+                decoyType: DecoyType.None,
+                allKnownModifications: null,
+                isContaminant: false,
+                modTypesToExclude: null,
+                unknownModifications: out var unknownModifications,
+                minAlleleDepth: 1,
+                maxHeterozygousVariants: 0);
+
+            // Assert: no decoys requested, so all should be targets
+            Assert.That(proteins.All(p => !p.IsDecoy), "All proteins should be targets when DecoyType.None is used");
+
+            // Assert: exactly 9 proteins
+            Assert.AreEqual(9, proteins.Count, "Expected exactly 9 proteins from the input");
+
+            // Assert: the loader converted all substitution modifications into proper SequenceVariations
+            // and removed them from OneBasedPossibleLocalizedModifications
+            int totalSequenceVariations = proteins.Sum(p => p.SequenceVariations.Count);
+            int totalPossibleLocalizedMods = proteins.Sum(p => p.OneBasedPossibleLocalizedModifications.Values.Sum(v => v.Count));
+            Assert.AreEqual(194, totalSequenceVariations, "Total number of sequence variations after load must be 194");
+            Assert.AreEqual(0, totalPossibleLocalizedMods, "All substitution modifications should have been converted; none should remain as mods");
+
+            // FIX: Safely log unknown modifications (Dictionary<string, Modification>), if any appear unexpectedly.
+            if (unknownModifications != null && unknownModifications.Count > 0)
+            {
+                TestContext.WriteLine("Unknown modifications encountered during read (unexpected):");
+                foreach (var kvp in unknownModifications)
+                {
+                    var mod = kvp.Value;
+                    var id = mod?.OriginalId ?? "<null>";
+                    var type = mod?.ModificationType ?? "<null>";
+                    TestContext.WriteLine($" - key={kvp.Key}, id={id}, type={type}");
+                }
+            }
+            Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications should remain after conversion");
 
-            Assert.That(File.ReadAllLines(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName)).Count(l => l.Contains("1 nucleotide substitution")), Is.EqualTo(57));
+            // Assert: No applied variants expected in this test (we are validating representation, not expansion)
+            int totalAppliedVariants = proteins.Sum(p => p.AppliedSequenceVariations.Count());
+            Assert.That(totalAppliedVariants, Is.EqualTo(0), "No applied variants expected; variants are metadata here");
 
-            var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true,
-                DecoyType.None, null, false, null, out var unknownModifications, 1, 0);
-            Assert.AreEqual(9, proteins.Count); // 1 target
-            Assert.AreEqual(194, proteins.Select(v=>v.SequenceVariations.Count).Sum()); // there are no sequence variations in the original proteins
-            Assert.AreEqual(0, proteins.Select(m => m.OneBasedPossibleLocalizedModifications.Sum(list=>list.Value.Count)).Sum()); // there are 194 sequence variants as modifications in the original proteins
+            // Assert: None of the proteins should carry "1 nucleotide substitution" as OriginalNonVariantModifications anymore
+            int residualSubstitutionMods =
+                proteins.Sum(p => p.OriginalNonVariantModifications.Values.Sum(list => list.Count(m => m.ModificationType == "1 nucleotide substitution")));
+            Assert.That(residualSubstitutionMods, Is.EqualTo(0), "No '1 nucleotide substitution' mods should remain in OriginalNonVariantModifications");
 
+            // Persist the converted representation and read it back; this file should contain only sequence-variant features
             string tempDir = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
             Directory.CreateDirectory(tempDir);
             string tempFile = Path.Combine(tempDir, "xmlWithSequenceVariantsAndNoModifications.txt");
 
-            ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteins.Where(p => !p.IsDecoy).ToList(), tempFile);
-            proteins = ProteinDbLoader.LoadProteinXML(tempFile, true,
-                DecoyType.None, null, false, null, out unknownModifications, 1, 0);
-            Assert.AreEqual(9, proteins.Count); // 1 target
-            Assert.AreEqual(194, proteins.Select(v => v.SequenceVariations.Count).Sum()); // there are 194 sequence variations in the revised proteins
-            Assert.AreEqual(0, proteins.Select(m => m.OneBasedPossibleLocalizedModifications.Sum(list => list.Value.Count)).Sum()); // there are 0 sequence variants as modifications in the original proteins
-            
-            Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("feature type=\"sequence variant\"")), Is.EqualTo(194));
-            Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("Putative GPTMD Substitution")), Is.EqualTo(194));
-            Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("1 nucleotide substitution")), Is.EqualTo(0));
-            if (Directory.Exists(tempDir)) Directory.Delete(tempDir, true);
+            try
+            {
+                // Write only the targets (all are targets here). Expect the writer to emit sequence variant features.
+                ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteins.Where(p => !p.IsDecoy).ToList(), tempFile);
+
+                // Inspect the written file contents directly for ground truth
+                var writtenLines = File.ReadAllLines(tempFile);
+                int writtenSeqVarFeatures = writtenLines.Count(l => l.Contains("feature type=\"sequence variant\""));
+                int writtenPutativeGptmd = writtenLines.Count(l => l.Contains("Putative GPTMD Substitution"));
+                int writtenSubstitutionMods = writtenLines.Count(l => l.Contains("1 nucleotide substitution"));
+
+                // Assert: writer produced only sequence variants (not substitution mods)
+                Assert.That(writtenSeqVarFeatures, Is.EqualTo(194), "All 194 substitutions must be serialized as sequence variant features");
+                Assert.That(writtenPutativeGptmd, Is.EqualTo(194), "All 194 variants should have the standardized description label");
+                Assert.That(writtenSubstitutionMods, Is.EqualTo(0), "No '1 nucleotide substitution' strings should remain in the written XML");
+
+                // Re-load the written file to confirm round-trip stability
+                proteins = ProteinDbLoader.LoadProteinXML(
+                    tempFile,
+                    generateTargets: true,
+                    decoyType: DecoyType.None,
+                    allKnownModifications: null,
+                    isContaminant: false,
+                    modTypesToExclude: null,
+                    unknownModifications: out unknownModifications,
+                    minAlleleDepth: 1,
+                    maxHeterozygousVariants: 0);
+
+                // Assert: the round-tripped representation is identical in shape and counts
+                Assert.AreEqual(9, proteins.Count, "Round-trip must preserve protein count");
+                Assert.AreEqual(194, proteins.Sum(v => v.SequenceVariations.Count), "Round-trip must preserve total number of sequence variations");
+                Assert.AreEqual(0, proteins.Sum(m => m.OneBasedPossibleLocalizedModifications.Values.Sum(list => list.Count)),
+                    "Round-trip must preserve the fact that no substitution mods remain");
+                Assert.That(unknownModifications == null || unknownModifications.Count == 0, "Round-trip should not introduce unknown modifications");
+                Assert.That(proteins.Sum(p => p.AppliedSequenceVariations.Count()), Is.EqualTo(0), "Round-trip should not apply any variants");
+            }
+            finally
+            {
+                // Cleanup test artifacts
+                if (Directory.Exists(tempDir))
+                {
+                    try { File.SetAttributes(tempFile, FileAttributes.Normal); } catch { /* ignore */ }
+                    Directory.Delete(tempDir, true);
+                }
+            }
         }
-
         [Test]
         public void Constructor_ParsesDescriptionCorrectly()
         {
@@ -770,5 +1645,319 @@ public void Constructor_ParsesDescriptionCorrectly()
             var adValues = svd.AlleleDepths[adKey];
             Assert.AreEqual(new[] { "30", "30" }, adValues);
         }
+        [Test]
+        public void ParseComprehensiveVcfExamples()
+        {
+            string current = TestContext.CurrentContext.TestDirectory;
+            string vcfPath = null;
+            while (current != null)
+            {
+                var candidate = Path.Combine(current, "Test", "DatabaseTests", "vcf_comprehensive_examples.vcf");
+                if (File.Exists(candidate))
+                {
+                    vcfPath = candidate;
+                    break;
+                }
+                current = Directory.GetParent(current)?.FullName;
+            }
+
+            Assert.That(vcfPath, Is.Not.Null, "Could not locate vcf_comprehensive_examples.vcf");
+
+            var lines = File.ReadAllLines(vcfPath);
+
+            var dataRows = lines
+                .Where(l => !string.IsNullOrWhiteSpace(l))
+                .Where(l => !l.StartsWith("##"))
+                .Where(l => !l.StartsWith("#CHROM"))
+                .ToList();
+
+            Assert.That(dataRows.Count, Is.EqualTo(8), "Expected 8 example variant rows.");
+
+            for (int rowIndex = 0; rowIndex < dataRows.Count; rowIndex++)
+            {
+                string originalLine = dataRows[rowIndex];
+                string[] rawFields = originalLine.Split('\t');
+                Assert.That(rawFields.Length, Is.GreaterThanOrEqualTo(10), $"Row {rowIndex + 1}: insufficient columns.");
+
+                var vcf = new VariantCallFormat(originalLine);
+
+                Assert.That(vcf.Description, Is.EqualTo(originalLine));
+                Assert.That(vcf.ReferenceAlleleString, Is.EqualTo(rawFields[3]));
+                Assert.That(vcf.AlternateAlleleString, Is.EqualTo(rawFields[4]));
+                Assert.That(vcf.Format, Is.EqualTo(rawFields[8]));
+
+                if (rawFields[7] == ".")
+                {
+                    Assert.That(vcf.Info.Annotation, Is.EqualTo(rawFields[7]));
+                }
+
+                var sampleFields = rawFields.Skip(9).ToArray();
+                Assert.That(vcf.Genotypes.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.AlleleDepths.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.Homozygous.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.Heterozygous.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.ZygosityBySample.Count, Is.EqualTo(sampleFields.Length));
+
+                for (int sampleIndex = 0; sampleIndex < sampleFields.Length; sampleIndex++)
+                {
+                    string sample = sampleFields[sampleIndex];
+                    string key = sampleIndex.ToString();
+
+                    string[] parts = sample.Split(':');
+                    Assert.That(parts.Length, Is.EqualTo(vcf.Format.Split(':').Length));
+
+                    string gtPart = parts[0];
+                    string adPart = parts.Length > 1 ? parts[1] : null;
+
+                    // Expected GT tokens
+                    string[] expectedGtTokens = gtPart.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries);
+                    if (gtPart.Contains('.') && expectedGtTokens.Length == 1 &&
+                        (gtPart == "./." || gtPart == ".|." || gtPart == ".|1" || gtPart == "0|." || gtPart == "0/."))
+                    {
+                        expectedGtTokens = new[] { ".", "." };
+                    }
+
+                    Assert.That(vcf.Genotypes.ContainsKey(key));
+                    var parsedGt = vcf.Genotypes[key];
+                    Assert.That(parsedGt, Is.EqualTo(expectedGtTokens));
+
+                    // Expected AD tokens
+                    string[] expectedAdTokens =
+                        string.IsNullOrWhiteSpace(adPart) ? Array.Empty<string>() :
+                        adPart == "." ? new[] { "." } :
+                        adPart.Split(',');
+
+                    Assert.That(vcf.AlleleDepths.ContainsKey(key));
+                    var parsedAd = vcf.AlleleDepths[key] ?? Array.Empty<string>();
+                    if (parsedAd.Length != 0 || expectedAdTokens.Length != 1 || expectedAdTokens[0] != ".")
+                    {
+                        Assert.That(parsedAd, Is.EqualTo(expectedAdTokens));
+                    }
+
+                    // Expected zygosity using ONLY non-missing alleles (must mirror implementation)
+                    var calledAlleles = parsedGt.Where(a => a != ".").ToArray();
+                    bool expectedHom = calledAlleles.Length > 0 && calledAlleles.Distinct().Count() == 1;
+                    bool expectedHet = calledAlleles.Distinct().Count() > 1;
+                    VariantCallFormat.Zygosity expectedZ =
+                        calledAlleles.Length == 0
+                            ? VariantCallFormat.Zygosity.Unknown
+                            : expectedHet
+                                ? VariantCallFormat.Zygosity.Heterozygous
+                                : VariantCallFormat.Zygosity.Homozygous;
+
+                    Assert.That(vcf.Homozygous[key], Is.EqualTo(expectedHom));
+                    Assert.That(vcf.Heterozygous[key], Is.EqualTo(expectedHet));
+                    Assert.That(vcf.ZygosityBySample[key], Is.EqualTo(expectedZ));
+                }
+            }
+        }
+        [Test]
+        public void Constructor_InvalidCoordinates_ThrowsArgumentException()
+        {
+            // Minimal valid VCF line (10 columns) so VariantCallFormat parses without truncation.
+            // Arrange: end < begin (invalid coordinates)
+            var sv = new SequenceVariation(
+                oneBasedBeginPosition: 5,
+                oneBasedEndPosition: 4,
+                originalSequence: "A",
+                variantSequence: "V",
+                description: "invalid-coords",
+                oneBasedModifications: null);
+
+            // Assert: SequenceVariation does not throw on construction; it reports invalid via AreValid()
+            Assert.That(sv.AreValid(), Is.False);
+        }
+        // Helper to create a minimal substitution modification matching the required detection pattern
+        private static Modification Substitution(string idArrow)
+        {
+            // If you want this helper to be convertible by the code under test,
+            // give it a matching motif for the site where it will be placed.
+            // For now keep it generic (unused in this test).
+            return new Modification(
+                idArrow,                          // originalId
+                null,                             // accession
+                "1 nucleotide substitution",      // modificationType
+                null,                             // featureType
+                null,                             // target motif
+                "Anywhere.",                      // location restriction
+                null,                             // chemical formula
+                0,                                // monoisotopic mass
+                new Dictionary<string, IList<string>>(), // databaseReference
+                null,                             // taxonomicRange
+                null,                             // keywords
+                null,                             // neutralLosses
+                null,                             // diagnosticIons
+                null);                            // fileOrigin
+        }
+
+        // Non-substitution (should be ignored)
+        private static Modification Other(string id, double mass = 15.9949)
+        {
+            // Generic oxidation at P motif (unused by main test path)
+            ModificationMotif.TryGetMotif("P", out var motifP);
+            return new Modification(
+                id,
+                null,
+                "oxidation",
+                null,
+                motifP,
+                "Anywhere.",
+                null,
+                mass,
+                new Dictionary<string, IList<string>>(),
+                null,
+                null,
+                null,
+                null,
+                null);
+        }
+
+        // Malformed substitution (no "->" pattern) must be ignored
+        private static Modification Malformed()
+        {
+            ModificationMotif.TryGetMotif("Q", out var motifQ);
+            return new Modification(
+                "E>A",
+                null,
+                "1 nucleotide substitution",
+                null,
+                motifQ,
+                "Anywhere.",
+                null,
+                0,
+                new Dictionary<string, IList<string>>(),
+                null,
+                null,
+                null,
+                null,
+                null);
+        }
+
+        [Test]
+        public void ConvertNucleotideSubstitutionModificationsToSequenceVariants_Comprehensive()
+        {
+            // 1 M, 2 A, 3 E, 4 W, 5 P, 6 Q, 7 K
+            var protein = new Protein("MAEWPQK", "TEST_PROT");
+
+            static Modification MakeSub(string idArrow, char originalResidue)
+            {
+                ModificationMotif.TryGetMotif(originalResidue.ToString(), out var motif);
+                return new Modification(
+                    idArrow,
+                    null,
+                    "1 nucleotide substitution",
+                    null,
+                    motif,
+                    "Anywhere.",
+                    null,
+                    0,
+                    new Dictionary<string, IList<string>>(),
+                    null,
+                    null,
+                    null,
+                    null,
+                    null);
+            }
+
+            static Modification MakeOther(string id)
+            {
+                ModificationMotif.TryGetMotif("P", out var motifP);
+                return new Modification(
+                    id,
+                    null,
+                    "oxidation",
+                    null,
+                    motifP,
+                    "Anywhere.",
+                    null,
+                    15.9949,
+                    new Dictionary<string, IList<string>>(),
+                    null,
+                    null,
+                    null,
+                    null,
+                    null);
+            }
+
+            static Modification MakeMalformed()
+            {
+                ModificationMotif.TryGetMotif("Q", out var motifQ);
+                return new Modification(
+                    "E>A",
+                    null,
+                    "1 nucleotide substitution",
+                    null,
+                    motifQ,
+                    "Anywhere.",
+                    null,
+                    0,
+                    new Dictionary<string, IList<string>>(),
+                    null,
+                    null,
+                    null,
+                    null,
+                    null);
+            }
+
+            void AddMod(Protein p, int pos, Modification m)
+            {
+                if (!p.OneBasedPossibleLocalizedModifications.TryGetValue(pos, out var list1))
+                {
+                    list1 = new List<Modification>();
+                    p.OneBasedPossibleLocalizedModifications[pos] = list1;
+                }
+                list1.Add(m);
+
+                if (!p.OriginalNonVariantModifications.TryGetValue(pos, out var list2))
+                {
+                    list2 = new List<Modification>();
+                    p.OriginalNonVariantModifications[pos] = list2;
+                }
+                list2.Add(m);
+            }
+
+            // Mods to seed
+            var modEtoA = MakeSub("E->A", 'E'); // pos 3
+            var modWtoK = MakeSub("W->K", 'W'); // pos 4
+            var modOxidP = MakeOther("Oxid_P"); // pos 5
+            var malformed = MakeMalformed();    // pos 6
+
+            AddMod(protein, 3, modEtoA);
+            AddMod(protein, 4, modWtoK);
+            AddMod(protein, 5, modOxidP);
+            AddMod(protein, 6, malformed);
+
+            // Pre-existing W->K (may be duplicated by converter if description differs)
+            protein.SequenceVariations.Add(new SequenceVariation(4, 4, "W", "K", "Existing substitution"));
+
+            // Act
+            protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
+
+            // Assert unique AA changes, not raw count (converter may add standardized duplicates)
+            var uniqueChanges = protein.SequenceVariations.Select(v => v.SimpleString()).Distinct().ToList();
+            Assert.That(uniqueChanges.Count, Is.EqualTo(2), "Expected exactly two unique substitutions (E3->A and W4->K).");
+
+            // Ensure E3->A exists
+            var eToA = protein.SequenceVariations.SingleOrDefault(v =>
+                v.OneBasedBeginPosition == 3 && v.OneBasedEndPosition == 3 &&
+                v.OriginalSequence == "E" && v.VariantSequence == "A");
+            Assert.That(eToA, Is.Not.Null, "E3->A variant was not created.");
+
+            // Ensure at least one W4->K exists
+            var wToKCount = protein.SequenceVariations.Count(v =>
+                v.OneBasedBeginPosition == 4 && v.OneBasedEndPosition == 4 &&
+                v.OriginalSequence == "W" && v.VariantSequence == "K");
+            Assert.That(wToKCount, Is.GreaterThanOrEqualTo(1), "Expected a W4->K variant.");
+
+            // Converted positions removed from OneBasedPossibleLocalizedModifications
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(3), Is.False);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(4), Is.False);
+
+            // Unrelated and malformed mods remain
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(5), Is.True);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications[5].Any(m => m.OriginalId == "Oxid_P"), Is.True);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(6), Is.True);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications[6].Any(m => m.OriginalId == "E>A"), Is.True);
+        }
     }
 }
\ No newline at end of file
diff --git a/mzLib/Test/DatabaseTests/truncationsExpected.tsv b/mzLib/Test/DatabaseTests/truncationsExpected.tsv
new file mode 100644
index 000000000..8a35f98a3
--- /dev/null
+++ b/mzLib/Test/DatabaseTests/truncationsExpected.tsv
@@ -0,0 +1,137 @@
+Sequence	Type	Begin	End	RetainedMethionine
+MALWMRLLPLLALLALWGPDPAAA	Target	1	24	TRUE
+FVNQHLCGSHLVEALYLVCGERGFFYTPKT	Target	25	54	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEGSLQ	Target	57	87	FALSE
+GIVEQCCTSICSLYQLENYCN	Target	90	110	FALSE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	1	110	TRUE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	2	110	FALSE
+LWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	3	110	FALSE
+WMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	4	110	FALSE
+MRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	5	110	FALSE
+RLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	6	110	FALSE
+LLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	7	110	FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYC	Target	2	109	FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY	Target	2	108	FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLEN	Target	2	107	FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLE	Target	2	106	FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQL	Target	2	105	FALSE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYC	Target	1	109	TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY	Target	1	108	TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLEN	Target	1	107	TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLE	Target	1	106	TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQL	Target	1	105	TRUE
+ALWMRLLPLLALLALWGPDPAAA	Target	2	24	FALSE
+LWMRLLPLLALLALWGPDPAAA	Target	3	24	FALSE
+WMRLLPLLALLALWGPDPAAA	Target	4	24	FALSE
+MRLLPLLALLALWGPDPAAA	Target	5	24	FALSE
+RLLPLLALLALWGPDPAAA	Target	6	24	FALSE
+LLPLLALLALWGPDPAAA	Target	7	24	FALSE
+ALWMRLLPLLALLALWGPDPAA	Target	2	23	FALSE
+ALWMRLLPLLALLALWGPDPA	Target	2	22	FALSE
+ALWMRLLPLLALLALWGPDP	Target	2	21	FALSE
+ALWMRLLPLLALLALWGPD	Target	2	20	FALSE
+ALWMRLLPLLALLALWGP	Target	2	19	FALSE
+MALWMRLLPLLALLALWGPDPAA	Target	1	23	TRUE
+MALWMRLLPLLALLALWGPDPA	Target	1	22	TRUE
+MALWMRLLPLLALLALWGPDP	Target	1	21	TRUE
+MALWMRLLPLLALLALWGPD	Target	1	20	TRUE
+MALWMRLLPLLALLALWGP	Target	1	19	TRUE
+VNQHLCGSHLVEALYLVCGERGFFYTPKT	Target	26	54	FALSE
+NQHLCGSHLVEALYLVCGERGFFYTPKT	Target	27	54	FALSE
+QHLCGSHLVEALYLVCGERGFFYTPKT	Target	28	54	FALSE
+HLCGSHLVEALYLVCGERGFFYTPKT	Target	29	54	FALSE
+LCGSHLVEALYLVCGERGFFYTPKT	Target	30	54	FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYTPK	Target	25	53	FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYTP	Target	25	52	FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYT	Target	25	51	FALSE
+FVNQHLCGSHLVEALYLVCGERGFFY	Target	25	50	FALSE
+FVNQHLCGSHLVEALYLVCGERGFF	Target	25	49	FALSE
+AEDLQVGQVELGGGPGAGSLQPLALEGSLQ	Target	58	87	FALSE
+EDLQVGQVELGGGPGAGSLQPLALEGSLQ	Target	59	87	FALSE
+DLQVGQVELGGGPGAGSLQPLALEGSLQ	Target	60	87	FALSE
+LQVGQVELGGGPGAGSLQPLALEGSLQ	Target	61	87	FALSE
+QVGQVELGGGPGAGSLQPLALEGSLQ	Target	62	87	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEGSL	Target	57	86	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEGS	Target	57	85	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEG	Target	57	84	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALE	Target	57	83	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLAL	Target	57	82	FALSE
+IVEQCCTSICSLYQLENYCN	Target	91	110	FALSE
+VEQCCTSICSLYQLENYCN	Target	92	110	FALSE
+EQCCTSICSLYQLENYCN	Target	93	110	FALSE
+QCCTSICSLYQLENYCN	Target	94	110	FALSE
+CCTSICSLYQLENYCN	Target	95	110	FALSE
+GIVEQCCTSICSLYQLENYC	Target	90	109	FALSE
+GIVEQCCTSICSLYQLENY	Target	90	108	FALSE
+GIVEQCCTSICSLYQLEN	Target	90	107	FALSE
+GIVEQCCTSICSLYQLE	Target	90	106	FALSE
+GIVEQCCTSICSLYQL	Target	90	105	FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	25	110	FALSE
+MNCYNELQYLSCISTCCQEVIGRK	Decoy	1	24	TRUE
+QLSGELALPQLSGAGPGGGLEVQGVQLDEA	Decoy	25	54	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	57	87	FALSE
+APDPGWLALLALLPLLRMWLA	Decoy	90	110	FALSE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	1	110	TRUE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	2	110	FALSE
+CYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	3	110	FALSE
+YNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	4	110	FALSE
+NELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	5	110	FALSE
+ELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	6	110	FALSE
+LQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	7	110	FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWL	Decoy	2	109	FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMW	Decoy	2	108	FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRM	Decoy	2	107	FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLR	Decoy	2	106	FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLL	Decoy	2	105	FALSE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWL	Decoy	1	109	TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMW	Decoy	1	108	TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRM	Decoy	1	107	TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLR	Decoy	1	106	TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLL	Decoy	1	105	TRUE
+NCYNELQYLSCISTCCQEVIGRK	Decoy	2	24	FALSE
+CYNELQYLSCISTCCQEVIGRK	Decoy	3	24	FALSE
+YNELQYLSCISTCCQEVIGRK	Decoy	4	24	FALSE
+NELQYLSCISTCCQEVIGRK	Decoy	5	24	FALSE
+ELQYLSCISTCCQEVIGRK	Decoy	6	24	FALSE
+LQYLSCISTCCQEVIGRK	Decoy	7	24	FALSE
+NCYNELQYLSCISTCCQEVIGR	Decoy	2	23	FALSE
+NCYNELQYLSCISTCCQEVIG	Decoy	2	22	FALSE
+NCYNELQYLSCISTCCQEVI	Decoy	2	21	FALSE
+NCYNELQYLSCISTCCQEV	Decoy	2	20	FALSE
+NCYNELQYLSCISTCCQE	Decoy	2	19	FALSE
+MNCYNELQYLSCISTCCQEVIGR	Decoy	1	23	TRUE
+MNCYNELQYLSCISTCCQEVIG	Decoy	1	22	TRUE
+MNCYNELQYLSCISTCCQEVI	Decoy	1	21	TRUE
+MNCYNELQYLSCISTCCQEV	Decoy	1	20	TRUE
+MNCYNELQYLSCISTCCQE	Decoy	1	19	TRUE
+LSGELALPQLSGAGPGGGLEVQGVQLDEA	Decoy	26	54	FALSE
+SGELALPQLSGAGPGGGLEVQGVQLDEA	Decoy	27	54	FALSE
+GELALPQLSGAGPGGGLEVQGVQLDEA	Decoy	28	54	FALSE
+ELALPQLSGAGPGGGLEVQGVQLDEA	Decoy	29	54	FALSE
+LALPQLSGAGPGGGLEVQGVQLDEA	Decoy	30	54	FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQLDE	Decoy	25	53	FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQLD	Decoy	25	52	FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQL	Decoy	25	51	FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQ	Decoy	25	50	FALSE
+QLSGELALPQLSGAGPGGGLEVQGV	Decoy	25	49	FALSE
+TKPTYFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	58	87	FALSE
+KPTYFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	59	87	FALSE
+PTYFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	60	87	FALSE
+TYFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	61	87	FALSE
+YFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	62	87	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQNV	Decoy	57	86	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQN	Decoy	57	85	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQ	Decoy	57	84	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLH	Decoy	57	83	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCL	Decoy	57	82	FALSE
+PDPGWLALLALLPLLRMWLA	Decoy	91	110	FALSE
+DPGWLALLALLPLLRMWLA	Decoy	92	110	FALSE
+PGWLALLALLPLLRMWLA	Decoy	93	110	FALSE
+GWLALLALLPLLRMWLA	Decoy	94	110	FALSE
+WLALLALLPLLRMWLA	Decoy	95	110	FALSE
+APDPGWLALLALLPLLRMWL	Decoy	90	109	FALSE
+APDPGWLALLALLPLLRMW	Decoy	90	108	FALSE
+APDPGWLALLALLPLLRM	Decoy	90	107	FALSE
+APDPGWLALLALLPLLR	Decoy	90	106	FALSE
+APDPGWLALLALLPLL	Decoy	90	105	FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	25	110	FALSE
diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj
index 59c393b39..7f0449d12 100644
--- a/mzLib/Test/Test.csproj
+++ b/mzLib/Test/Test.csproj
@@ -262,6 +262,9 @@
     <None Update="DatabaseTests\test_ensembl.pep.all.fasta">
       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
     </None>
+    <None Update="DatabaseTests\truncationsExpected.tsv">
+      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
+    </None>
     <None Update="DatabaseTests\uniprot_aifm1.fasta">
       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
     </None>
diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs
index ccc6d950e..d235759ed 100644
--- a/mzLib/Test/TestPeptideWithSetMods.cs
+++ b/mzLib/Test/TestPeptideWithSetMods.cs
@@ -1046,67 +1046,177 @@ public static void TestPeptideWithSetModsReturnsTruncationsInTopDown()
             List<PeptideWithSetModifications> insulinTruncations = insulin.Digest(new DigestionParams(protease: protease.Name), new List<Modification>(), new List<Modification>(), topDownTruncationSearch: true).ToList();
             Assert.AreEqual(68, insulinTruncations.Count);
         }
-
         [Test]
         public static void TestPeptideWithSetModsReturnsDecoyTruncationsInTopDown()
         {
+            // PURPOSE
+            // Generate a comprehensive TSV of all top-down truncation peptides for target and decoy insulin entries,
+            // then compare the result against the stored expected file for regression checking.
+            //
+            // OUTPUT COLUMNS
+            // - Sequence: peptide/proteoform base sequence
+            // - Type: "Target" or "Decoy" based on parent protein
+            // - Begin: one-based start residue within the parent protein
+            // - End: one-based end residue within the parent protein
+            // - RetainedMethionine: TRUE when the peptide includes the protein’s N-terminal Met (Begin == 1 and Parent.BaseSequence[0] == 'M'), else FALSE
+
+            // Arrange: load insulin with reverse decoys and truncations enabled
             string xmlDatabase = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", "humanInsulin.xml");
-            List<Protein> insulinProteins = ProteinDbLoader.LoadProteinXML(xmlDatabase, true,
-                DecoyType.Reverse, null, false, null, out var unknownModifications, addTruncations: true);
+            List<Protein> insulinProteins = ProteinDbLoader.LoadProteinXML(
+                xmlDatabase,
+                generateTargets: true,
+                decoyType: DecoyType.Reverse,
+                allKnownModifications: null,
+                isContaminant: false,
+                modTypesToExclude: null,
+                unknownModifications: out var unknownModifications,
+                addTruncations: true);
+
+            Assert.That(insulinProteins.Any(p => !p.IsDecoy), "Expected at least one target protein");
+            Assert.That(insulinProteins.Any(p => p.IsDecoy), "Expected at least one decoy protein");
+            Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from insulin XML");
+
+            // Digest: enumerate truncations for a representative target/decoy pair (parity sanity)
+            static string Reverse(string s) => new string(s.Reverse().ToArray());
+            var target = insulinProteins.First(p => !p.IsDecoy);
+            string expectedDecoySeq = target.BaseSequence.Length > 0 && target.BaseSequence[0] == 'M'
+                ? "M" + Reverse(target.BaseSequence.Substring(1))
+                : Reverse(target.BaseSequence);
+            var decoy = insulinProteins.FirstOrDefault(p => p.IsDecoy && p.BaseSequence == expectedDecoySeq)
+                        ?? insulinProteins.First(p => p.IsDecoy && p.Length == target.Length);
+
+            Assert.IsFalse(string.IsNullOrWhiteSpace(target.Accession));
+            Assert.IsTrue(decoy.Accession.StartsWith("DECOY_"));
+            Assert.AreEqual(target.Length, decoy.Length);
+            Assert.AreNotEqual(target.BaseSequence, decoy.BaseSequence);
+            Assert.AreEqual(expectedDecoySeq, decoy.BaseSequence, "Decoy must follow 'retain M, reverse remainder' rule");
+
+            var dp = new DigestionParams(protease: "top-down");
+            List<PeptideWithSetModifications> targetTruncs = target
+                .Digest(dp, new List<Modification>(), new List<Modification>(), topDownTruncationSearch: true)
+                .Cast<PeptideWithSetModifications>().ToList();
+            List<PeptideWithSetModifications> decoyTruncs = decoy
+                .Digest(dp, new List<Modification>(), new List<Modification>(), topDownTruncationSearch: true)
+                .Cast<PeptideWithSetModifications>().ToList();
+
+            // Parity and sanity checks for the selected pair
+            Assert.AreEqual(68, targetTruncs.Count, "Target should yield 68 truncation products in top-down mode");
+            Assert.AreEqual(68, decoyTruncs.Count, "Decoy should yield 68 truncation products in top-down mode");
+            Assert.That(targetTruncs.All(p => p.DigestionParams?.DigestionAgent?.Name == "top-down"));
+            Assert.That(decoyTruncs.All(p => p.DigestionParams?.DigestionAgent?.Name == "top-down"));
+            Assert.AreEqual(targetTruncs.Count, targetTruncs.Select(p => p.BaseSequence).Distinct().Count());
+            Assert.AreEqual(decoyTruncs.Count, decoyTruncs.Select(p => p.BaseSequence).Distinct().Count());
+            Assert.IsTrue(targetTruncs.Any(p => p.BaseSequence == target.BaseSequence));
+            Assert.IsTrue(decoyTruncs.Any(p => p.BaseSequence == decoy.BaseSequence));
+
+            // Build the table rows
+            static bool HasRetainedMet(PeptideWithSetModifications p) =>
+                p.OneBasedStartResidueInProtein == 1 &&
+                p.Parent?.BaseSequence?.Length > 0 &&
+                p.Parent.BaseSequence[0] == 'M';
+
+            // We only compare the combined truncations for the chosen target/decoy in this test (68 + 68 rows)
+            var rows = targetTruncs.Concat(decoyTruncs)
+                .Select(pep =>
+                {
+                    bool isDecoy = (pep.Parent as Protein)?.IsDecoy == true;
+                    string type = isDecoy ? "Decoy" : "Target";
+                    string retained = HasRetainedMet(pep) ? "TRUE" : "FALSE"; // normalize to match expected
+                    return string.Join("\t", new[]
+                    {
+                        pep.BaseSequence,
+                        type,
+                        pep.OneBasedStartResidueInProtein.ToString(),
+                        pep.OneBasedEndResidueInProtein.ToString(),
+                        retained
+                    });
+                })
+                // Sort deterministically to avoid platform/iteration-order differences
+                .OrderBy(r => r, StringComparer.Ordinal)
+                .ToList();
+
+            var header = "Sequence\tType\tBegin\tEnd\tRetainedMethionine";
+            var outputLines = new List<string>(capacity: rows.Count + 1) { header };
+            outputLines.AddRange(rows);
+
+            // Persist the generated table for inspection
+            string workPath = Path.Combine(TestContext.CurrentContext.WorkDirectory, "topdown_truncations_table.tsv");
+            File.WriteAllLines(workPath, outputLines);
+            Console.WriteLine($"Generated truncation table: {workPath}");
+
+            // Load expected and compare
+            string expectedPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "truncationsExpected.tsv");
+            Assert.That(File.Exists(expectedPath), $"Expected file not found: {expectedPath}");
+            var expectedAll = File.ReadAllLines(expectedPath)
+                                  .Where(l => l is not null)
+                                  .Select(l => l.TrimEnd('\r', '\n'))
+                                  .ToList();
+
+            Assert.That(expectedAll.Count > 0, "Expected file is empty");
+            string expectedHeader = expectedAll[0];
+            var expectedRows = expectedAll.Skip(1)
+                                          .Where(l => !string.IsNullOrWhiteSpace(l))
+                                          .OrderBy(l => l, StringComparer.Ordinal)
+                                          .ToList();
+
+            // Header check
+            if (!string.Equals(header, expectedHeader, StringComparison.Ordinal))
+            {
+                TestContext.Out.WriteLine($"Header mismatch:");
+                TestContext.Out.WriteLine($"  Expected: {expectedHeader}");
+                TestContext.Out.WriteLine($"  Actual  : {header}");
+            }
 
-            Protease protease = new Protease("top-down", CleavageSpecificity.None, "", "", new List<DigestionMotif>(), null);
-            List<PeptideWithSetModifications> insulintTargetTruncations = insulinProteins.Where(p=>!p.IsDecoy).First().Digest(new DigestionParams(protease: protease.Name), new List<Modification>(), new List<Modification>(), topDownTruncationSearch: true).ToList();
-            Assert.AreEqual(68, insulintTargetTruncations.Count);
-            List<PeptideWithSetModifications> insulintDecoyTruncations = insulinProteins.Where(p => p.IsDecoy).First().Digest(new DigestionParams(protease: protease.Name), new List<Modification>(), new List<Modification>(), topDownTruncationSearch: true).ToList();
-            Assert.AreEqual(68, insulintDecoyTruncations.Count);
-        }
+            // Multiset comparison for rows (counts of duplicates matter)
+            static Dictionary<string, int> ToCounts(IEnumerable<string> lines)
+                => lines.GroupBy(x => x, StringComparer.Ordinal)
+                        .ToDictionary(g => g.Key, g => g.Count(), StringComparer.Ordinal);
 
-        [Test]
-        public static void CheckFullChemicalFormula()
-        {
-            PeptideWithSetModifications small_pep = new PeptideWithSetModifications(new Protein("PEPTIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, new Dictionary<int, Modification>(), 0, null);
-            ChemicalFormula small_pep_cf = ChemicalFormula.ParseFormula("C34H53N7O15");
-            Assert.AreEqual(small_pep.FullChemicalFormula, small_pep_cf);
+            var expCounts = ToCounts(expectedRows);
+            var gotCounts = ToCounts(rows);
 
-            PeptideWithSetModifications large_pep = new PeptideWithSetModifications(new Protein("PEPTIDEKRNSPEPTIDEKECUEIRQUV", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 28, CleavageSpecificity.Full, null, 0, new Dictionary<int, Modification>(), 0, null);
-            ChemicalFormula large_pep_cf = ChemicalFormula.ParseFormula("C134H220N38O50S1Se2");
-            Assert.AreEqual(large_pep.FullChemicalFormula, large_pep_cf);
+            var missing = new List<string>();   // in expected more times than in actual
+            var extra = new List<string>();     // in actual more times than in expected
+
+            foreach (var kv in expCounts)
+            {
+                gotCounts.TryGetValue(kv.Key, out int got);
+                if (got < kv.Value)
+                {
+                    int deficit = kv.Value - got;
+                    for (int i = 0; i < deficit; i++) missing.Add(kv.Key);
+                }
+            }
+            foreach (var kv in gotCounts)
+            {
+                expCounts.TryGetValue(kv.Key, out int exp);
+                if (kv.Value > exp)
+                {
+                    int surplus = kv.Value - exp;
+                    for (int i = 0; i < surplus; i++) extra.Add(kv.Key);
+                }
+            }
 
-            ModificationMotif.TryGetMotif("S", out ModificationMotif motif_s);
-            Modification phosphorylation = new Modification(_originalId: "phospho", _modificationType: "CommonBiological", _target: motif_s, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H1O3P1"));
-            Dictionary<int, Modification> modDict_small = new Dictionary<int, Modification>();
-            modDict_small.Add(4, phosphorylation);
+            if (missing.Count == 0 && extra.Count == 0)
+            {
+                TestContext.Out.WriteLine("Top-down truncation table matches expected.");
+                TestContext.Out.WriteLine("Sample (first 5 rows):");
+                foreach (var l in outputLines.Take(6)) TestContext.Out.WriteLine(l);
+            }
+            else
+            {
+                TestContext.Out.WriteLine("Top-down truncation table differs from expected.");
+                TestContext.Out.WriteLine($"Missing rows (expected but not found or under-counted): {missing.Count}");
+                foreach (var l in missing.Take(20)) TestContext.Out.WriteLine($"  MISSING: {l}");
+                if (missing.Count > 20) TestContext.Out.WriteLine($"  ...and {missing.Count - 20} more");
 
-            PeptideWithSetModifications small_pep_mod = new PeptideWithSetModifications(new Protein("PEPSIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, modDict_small, 0, null);
-            ChemicalFormula small_pep_mod_cf = ChemicalFormula.ParseFormula("C33H52N7O18P1");
-            Assert.AreEqual(small_pep_mod.FullChemicalFormula, small_pep_mod_cf);
+                TestContext.Out.WriteLine($"Extra rows (found but not expected or over-counted): {extra.Count}");
+                foreach (var l in extra.Take(20)) TestContext.Out.WriteLine($"  EXTRA:   {l}");
+                if (extra.Count > 20) TestContext.Out.WriteLine($"  ...and {extra.Count - 20} more");
 
-            ModificationMotif.TryGetMotif("K", out ModificationMotif motif_k);
-            Modification acetylation = new Modification(_originalId: "acetyl", _modificationType: "CommonBiological", _target: motif_k, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3O"));
-            Dictionary<int, Modification> modDict_large = new Dictionary<int, Modification>();
-            modDict_large.Add(4, phosphorylation);
-            modDict_large.Add(11, phosphorylation);
-            modDict_large.Add(8, acetylation);
-
-            PeptideWithSetModifications large_pep_mod = new PeptideWithSetModifications(new Protein("PEPSIDEKRNSPEPTIDEKECUEIRQUV", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 28, CleavageSpecificity.Full, null, 0, modDict_large, 0, null);
-            ChemicalFormula large_pep_mod_cf = ChemicalFormula.ParseFormula("C135H223N38O57P2S1Se2");
-            Assert.AreEqual(large_pep_mod.FullChemicalFormula, large_pep_mod_cf);
-
-            ModificationMotif.TryGetMotif("C", out var motif_c);
-            ModificationMotif.TryGetMotif("G", out var motif_g);
-            Dictionary<string, Modification> modDict =
-                new()
-                {
-                    { "Carbamidomethyl on C", new Modification(_originalId: "Carbamidomethyl", _modificationType: "Common Fixed",
-                        _target: motif_c, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3ON")) },
-                    { "BS on G" , new Modification(_originalId: "BS on G", _modificationType: "BS", _target: motif_g, _monoisotopicMass: 96.0875)}
-                };
-            PeptideWithSetModifications pwsmWithMissingCfMods = new PeptideWithSetModifications(
-                "ENQGDETQG[Speculative:BS on G]C[Common Fixed:Carbamidomethyl on C]PPQR", modDict, p: new Protein("ENQGDETQGCPPQR", "FakeProtein"), digestionParams: new DigestionParams(),
-                oneBasedStartResidueInProtein: 1, oneBasedEndResidueInProtein: 14);
-            Assert.Null(pwsmWithMissingCfMods.FullChemicalFormula);
+                Assert.Fail($"Generated top-down truncation table does not match expected.\nExpected file: {expectedPath}\nActual file: {workPath}\nMissing: {missing.Count}, Extra: {extra.Count}");
+            }
         }
-
         [Test]
         public static void CheckMostAbundantMonoisotopicMass()
         {
diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
index c83c80aa1..87264702e 100644
--- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
+++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
@@ -239,34 +239,34 @@ private static List<SequenceVariation> ReverseSequenceVariations(IEnumerable<Seq
                     decoyVariations.Add(new SequenceVariation(sv.OneBasedBeginPosition,
                         reversedSequence.Substring(sv.OneBasedBeginPosition - 1, sv.OneBasedEndPosition - sv.OneBasedBeginPosition + 1),
                         new string(variationArray).Substring(1, variationArray.Length - 1) + variationArray[0],
-                        $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                        $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
                 // start loss, so the variant is at the end
                 else if (startLoss)
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, new string(originalArray).Substring(0, originalArray.Length - 1), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, new string(originalArray).Substring(0, originalArray.Length - 1), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
                 // both start with M, but there's more
                 else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1 && (sv.OriginalSequence.Length > 1 || sv.VariantSequence.Length > 1))
                 {
                     string original = new string(originalArray).Substring(0, originalArray.Length - 1);
                     string variant = new string(variationArray).Substring(0, variationArray.Length - 1);
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
                 // gained an initiating methionine
                 else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1)
                 {
-                    decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
                 // starting methionine, but no variations on it
                 else if (startsWithM)
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
                 // no starting methionine
                 else
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
             }
             return decoyVariations;
@@ -335,7 +335,7 @@ private static List<Protein> GenerateSlideDecoys(List<Protein> proteins, int max
                         {
                             variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, true)];
                         }
-                        decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.Description));
+                        decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.VariantCallFormatDataString));
                     }
                     else
                     {
@@ -352,7 +352,7 @@ private static List<Protein> GenerateSlideDecoys(List<Protein> proteins, int max
                             variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, initMet)];
                         }
 
-                        decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.Description));
+                        decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString));
                     }
                 }
                 var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide,
diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
index cc7723c15..3786c969d 100644
--- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
+++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
@@ -87,7 +87,7 @@ private static List<T> GenerateReverseDecoys<T>(List<T> nucleicAcids, int maxThr
                             var reverseModKey = indexMapping[modKvp.Key];
                             reverseModificationsForVariation.Add(reverseModKey, modKvp.Value);
                         }
-                        reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation));
+                        reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.VariantCallFormatDataString.Description, reverseModificationsForVariation));
                     }
 
                     // Reverse Applied Variants
@@ -101,7 +101,7 @@ private static List<T> GenerateReverseDecoys<T>(List<T> nucleicAcids, int maxThr
                             var reverseModKey = indexMapping[modKvp.Key];
                             reverseModificationsForVariation.Add(reverseModKey, modKvp.Value);
                         }
-                        reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation));
+                        reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.VariantCallFormatDataString.Description, reverseModificationsForVariation));
                     }
 
                     // Reverse Truncations
diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
index 24ff44dd4..5eaa9897a 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
@@ -206,7 +206,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     {
                         writer.WriteStartElement("feature");
                         writer.WriteAttributeString("type", "sequence variant");
-                        writer.WriteAttributeString("description", hm.Description.ToString());
+                        writer.WriteAttributeString("description", hm.VariantCallFormatDataString.ToString());
                         writer.WriteStartElement("original");
                         writer.WriteString(hm.OriginalSequence);
                         writer.WriteEndElement(); // original
@@ -252,7 +252,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     //{
                     //    writer.WriteStartElement("feature");
                     //    writer.WriteAttributeString("type", "splice site");
-                    //    writer.WriteAttributeString("description", hm.Description);
+                    //    writer.WriteAttributeString("description", hm.VariantCallFormatDataString);
                     //    writer.WriteStartElement("location");
                     //    if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition)
                     //    {
@@ -464,7 +464,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     {
                         writer.WriteStartElement("feature");
                         writer.WriteAttributeString("type", "sequence variant");
-                        writer.WriteAttributeString("description", hm.Description.ToString());
+                        writer.WriteAttributeString("description", hm.VariantCallFormatDataString.ToString());
                         writer.WriteStartElement("original");
                         writer.WriteString(hm.OriginalSequence);
                         writer.WriteEndElement(); // original
diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
index 68aaa399b..dd6315e93 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
@@ -1,15 +1,15 @@
-﻿using Proteomics;
+﻿using Omics.BioPolymer;
+using Omics.Modifications;
+using Proteomics;
+using Proteomics.ProteolyticDigestion;
 using System;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.Linq;
 using System.Text.RegularExpressions;
 using System.Xml;
-using Omics.BioPolymer;
-using Omics.Modifications;
 using Transcriptomics;
 using UsefulProteomicsDatabases.Transcriptomics;
-using System.Data;
-using Proteomics.ProteolyticDigestion;
 
 namespace UsefulProteomicsDatabases
 {
@@ -53,10 +53,27 @@ public class ProteinXmlEntry
         public UniProtSequenceAttributes SequenceAttributes { get; set; } = null; // this is used to store the sequence attributes from the <sequence> element, if present
         private List<(int, string)> AnnotatedMods = new List<(int position, string originalModificationID)>();
         private List<(int, string)> AnnotatedVariantMods = new List<(int position, string originalModificationID)>();
+        // Captured isoform/sequence identifier from <location sequence="...">
+        private string LocationSequenceId;
 
         /// <summary>
-        /// Start parsing a protein XML element
+        /// Finalizes the parsing of a protein XML entry and constructs a <see cref="Protein"/> object.
+        /// This method is called when the end of an &lt;entry&gt; element is reached during XML parsing.
+        /// It sanitizes the sequence, prunes out-of-range sequence variants, resolves and attaches modifications,
+        /// and aggregates all parsed data (such as gene names, proteolysis products, sequence variations, disulfide bonds, and splice sites)
+        /// into a new <see cref="Protein"/> instance.
+        /// After construction, the internal state is cleared to prepare for the next entry.
         /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the &lt;entry&gt; element.</param>
+        /// <param name="isContaminant">Indicates whether the protein is a contaminant.</param>
+        /// <param name="proteinDbLocation">The file path or identifier of the protein database source.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the protein.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
+        /// <param name="decoyIdentifier">A string used to identify decoy proteins (default: "DECOY").</param>
+        /// <returns>
+        /// A constructed <see cref="Protein"/> object containing all parsed and resolved information,
+        /// or <c>null</c> if the entry is incomplete.
+        /// </returns>
         public void ParseElement(string elementName, XmlReader xml)
         {
             int outValue;
@@ -137,7 +154,9 @@ public void ParseElement(string elementName, XmlReader xml)
                     PropertyTypes.Add(xml.GetAttribute("type"));
                     PropertyValues.Add(xml.GetAttribute("value"));
                     break;
-
+                case "location":
+                    LocationSequenceId = xml.GetAttribute("sequence");
+                    break;
                 case "position":
                     OneBasedFeaturePosition = int.Parse(xml.GetAttribute("position"));
                     break;
@@ -161,9 +180,12 @@ public void ParseElement(string elementName, XmlReader xml)
         }
 
         /// <summary>
-        /// Parses the attributes of the current <entry> element from the provided XmlReader.
-        /// Extracts and stores the values for dataset, created, modified, version, and xmlns attributes.
+        /// Parses and stores key metadata attributes from the current &lt;entry&gt; element in the XML.
+        /// This includes dataset, creation date, modification date, version, and XML namespace information.
+        /// The extracted values are assigned to the corresponding properties of the <see cref="ProteinXmlEntry"/> instance.
+        /// This method is typically called when the parser encounters the start of a protein entry in a UniProt or similar XML file.
         /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the &lt;entry&gt; element whose attributes are to be read.</param>
         private void ParseEntryAttributes(XmlReader xml)
         {
             DatasetEntryTag = xml.GetAttribute("dataset");
@@ -268,10 +290,12 @@ private static UniProtSequenceAttributes.FragmentType ParseFragmentType(string f
             return UniProtSequenceAttributes.FragmentType.unspecified;
         }
 
-        // Helper method to compute the monoisotopic mass of a sequence.
         /// <summary>
-        /// Computes the monoisotopic mass of the given sequence.
-        /// Returns 0 if the sequence is empty.
+        /// Computes the monoisotopic mass of a protein or nucleic acid sequence without modifications.
+        /// If the input sequence is null or empty, returns 0.
+        /// Internally, constructs a <see cref="PeptideWithSetModifications"/> using the provided sequence and an empty modification dictionary,
+        /// then returns the rounded monoisotopic mass as an integer.
+        /// This method is used to populate sequence attributes such as mass during XML parsing.
         /// </summary>
         private static int ComputeSequenceMass(string sequence)
         {
@@ -280,8 +304,25 @@ private static int ComputeSequenceMass(string sequence)
             return (int)Math.Round(new PeptideWithSetModifications(sequence, new Dictionary<string, Modification>()).MonoisotopicMass);
         }
         /// <summary>
-        /// Finish parsing at the end of an element
+        /// Handles the end of an XML element during protein database parsing, updating the internal state or finalizing objects as needed.
+        /// Depending on the element name, this method processes and stores feature, subfeature, database reference, gene, and organism information,
+        /// or, if the end of an &lt;entry&gt; element is reached, constructs and returns a fully populated <see cref="Protein"/> object.
+        /// For &lt;feature&gt; and &lt;subfeature&gt; elements, it attaches modifications or proteolytic products.
+        /// For &lt;dbReference&gt;, it records database cross-references.
+        /// For &lt;gene&gt; and &lt;organism&gt;, it updates parsing state flags.
+        /// For &lt;entry&gt;, it aggregates all parsed data, resolves modifications, and returns a new <see cref="Protein"/> instance,
+        /// clearing the internal state for the next entry.
         /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the current XML element.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the protein.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
+        /// <param name="isContaminant">Indicates whether the protein is a contaminant.</param>
+        /// <param name="proteinDbLocation">The file path or identifier of the protein database source.</param>
+        /// <param name="decoyIdentifier">A string used to identify decoy proteins (default: "DECOY").</param>
+        /// <returns>
+        /// A constructed <see cref="Protein"/> object if the end of an &lt;entry&gt; element is reached and all required data is present;
+        /// otherwise, <c>null</c>.
+        /// </returns>
         public Protein ParseEndElement(XmlReader xml, IEnumerable<string> modTypesToExclude, Dictionary<string, Modification> unknownModifications,
             bool isContaminant, string proteinDbLocation, string decoyIdentifier = "DECOY")
         {
@@ -312,7 +353,26 @@ public Protein ParseEndElement(XmlReader xml, IEnumerable<string> modTypesToExcl
             }
             return protein;
         }
-
+        /// <summary>
+        /// Handles the end of an XML element during RNA database parsing, updating the internal state or finalizing objects as needed.
+        /// Depending on the element name, this method processes and stores feature, subfeature, and database reference information,
+        /// or, if the end of an &lt;entry&gt; element is reached, constructs and returns a fully populated <see cref="RNA"/> object.
+        /// For &lt;feature&gt; and &lt;subfeature&gt; elements, it attaches modifications or truncation products.
+        /// For &lt;dbReference&gt;, it records database cross-references.
+        /// For &lt;gene&gt; and &lt;organism&gt;, it updates parsing state flags.
+        /// For &lt;entry&gt;, it aggregates all parsed data, resolves modifications, and returns a new <see cref="RNA"/> instance,
+        /// clearing the internal state for the next entry.
+        /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the current XML element.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the RNA.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
+        /// <param name="isContaminant">Indicates whether the RNA is a contaminant.</param>
+        /// <param name="rnaDbLocation">The file path or identifier of the RNA database source.</param>
+        /// <param name="decoyIdentifier">A string used to identify decoy RNAs (default: "DECOY").</param>
+        /// <returns>
+        /// A constructed <see cref="RNA"/> object if the end of an &lt;entry&gt; element is reached and all required data is present;
+        /// otherwise, <c>null</c>.
+        /// </returns>
         internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable<string> modTypesToExclude,
             Dictionary<string, Modification> unknownModifications,
             bool isContaminant, string rnaDbLocation, string decoyIdentifier = "DECOY")
@@ -346,8 +406,30 @@ internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable<string> modTypesToExc
         }
 
         /// <summary>
-        /// Finish parsing an entry
+        /// Finalizes the parsing of a protein XML entry and constructs a <see cref="Protein"/> object from the accumulated data.
+        /// This method is called when the end of an &lt;entry&gt; element is reached during XML parsing.
+        /// It performs several key tasks:
+        /// <list type="bullet">
+        ///   <item>Sanitizes the parsed sequence (e.g., replacing invalid amino acids with 'X').</item>
+        ///   <item>Prunes any sequence variants whose coordinates exceed the sequence length.</item>
+        ///   <item>Resolves and attaches all annotated modifications, excluding those of specified types or unknowns.</item>
+        ///   <item>Determines if the protein is a decoy based on the accession and decoy identifier.</item>
+        ///   <item>Aggregates all parsed data (gene names, proteolysis products, sequence variations, disulfide bonds, splice sites, database references, and sequence attributes) into a new <see cref="Protein"/> instance.</item>
+        ///   <item>Clears the internal state of the <see cref="ProteinXmlEntry"/> to prepare for parsing the next entry.</item>
+        /// </list>
+        /// If either the accession or sequence is missing, returns <c>null</c>.
         /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the &lt;entry&gt; element.</param>
+        /// <param name="isContaminant">Indicates whether the protein is a contaminant.</param>
+        /// <param name="proteinDbLocation">The file path or identifier of the protein database source.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the protein.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
+        /// <param name="decoyIdentifier">A string used to identify decoy proteins (default: "DECOY").</param>
+        /// <returns>
+        /// A constructed <see cref="Protein"/> object containing all parsed and resolved information,
+        /// or <c>null</c> if the entry is incomplete.
+        /// </returns>
+
         public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string proteinDbLocation, IEnumerable<string> modTypesToExclude, Dictionary<string, Modification> unknownModifications, string decoyIdentifier = "DECOY")
         {
             Protein result = null;
@@ -359,6 +441,8 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr
                 Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X');
 
                 ParseAnnotatedMods(OneBasedModifications, modTypesToExclude, unknownModifications, AnnotatedMods);
+                //prune any sequence variants whose coordinates exceed the known sequence length
+                PruneOutOfRangeSequenceVariants();
                 if (Accession.StartsWith(decoyIdentifier))
                 {
                     isDecoy = true;
@@ -370,7 +454,30 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr
             Clear();
             return result;
         }
-
+        /// <summary>
+        /// Finalizes the parsing of an RNA XML entry and constructs an <see cref="RNA"/> object from the accumulated data.
+        /// This method is called when the end of an &lt;entry&gt; element is reached during XML parsing for RNA records.
+        /// It performs several key tasks:
+        /// <list type="bullet">
+        ///   <item>Sanitizes the parsed sequence (e.g., replacing invalid characters with 'X').</item>
+        ///   <item>Prunes any sequence variants whose coordinates exceed the sequence length.</item>
+        ///   <item>Resolves and attaches all annotated modifications, excluding those of specified types or unknowns.</item>
+        ///   <item>Determines if the RNA is a decoy based on the accession and decoy identifier.</item>
+        ///   <item>Aggregates all parsed data (gene names, proteolysis products, sequence variations, and other metadata) into a new <see cref="RNA"/> instance.</item>
+        ///   <item>Clears the internal state of the <see cref="ProteinXmlEntry"/> to prepare for parsing the next entry.</item>
+        /// </list>
+        /// If either the accession or sequence is missing, returns <c>null</c>.
+        /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the &lt;entry&gt; element.</param>
+        /// <param name="isContaminant">Indicates whether the RNA is a contaminant.</param>
+        /// <param name="rnaDbLocation">The file path or identifier of the RNA database source.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the RNA.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
+        /// <param name="decoyIdentifier">A string used to identify decoy RNAs (default: "DECOY").</param>
+        /// <returns>
+        /// A constructed <see cref="RNA"/> object containing all parsed and resolved information,
+        /// or <c>null</c> if the entry is incomplete.
+        /// </returns>
         internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string rnaDbLocation,
             IEnumerable<string> modTypesToExclude, Dictionary<string, Modification> unknownModifications, string decoyIdentifier = "DECOY")
         {
@@ -381,6 +488,8 @@ internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string r
                 // sanitize the sequence to replace unexpected characters with X (unknown amino acid)
                 // sometimes strange characters get added by RNA sequencing software, etc.
                 Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X');
+                //prune any sequence variants whose coordinates exceed the known sequence length
+                PruneOutOfRangeSequenceVariants();
                 if (Accession.StartsWith(decoyIdentifier))
                 {
                     isDecoy = true;
@@ -407,8 +516,19 @@ public void ParseSubFeatureEndElement(XmlReader xml, IEnumerable<string> modType
         }
 
         /// <summary>
-        /// Finish parsing a feature element
+        /// Processes the end of a &lt;feature&gt; element during XML parsing and updates the internal state with the parsed feature information.
+        /// Depending on the feature type, this method:
+        /// <list type="bullet">
+        ///   <item>Adds modification annotations for "modified residue" and "lipid moiety-binding region" features.</item>
+        ///   <item>Creates and adds <see cref="TruncationProduct"/> objects for proteolytic features such as "peptide", "propeptide", "chain", and "signal peptide".</item>
+        ///   <item>Handles "sequence variant" features by creating <see cref="SequenceVariation"/> objects, including variant-specific modifications, and ensures they apply to the correct sequence or isoform.</item>
+        ///   <item>Creates and adds <see cref="DisulfideBond"/> or <see cref="SpliceSite"/> objects for their respective feature types, using available position information.</item>
+        /// </list>
+        /// After processing, resets feature-related state variables to prepare for the next feature.
         /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the &lt;feature&gt; element.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the protein.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
         public void ParseFeatureEndElement(XmlReader xml, IEnumerable<string> modTypesToExclude, Dictionary<string, Modification> unknownModifications)
         {
             if (FeatureType == "modified residue")
@@ -445,24 +565,56 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable<string> modTypesTo
                     }
                     else
                     {
-                        type += ("null-null");
+                        type += "null-null";
                     }
                 }
                 ProteolysisProducts.Add(new TruncationProduct(OneBasedBeginPosition, OneBasedEndPosition, type));
             }
-            else if (FeatureType == "sequence variant" && VariationValue != null && VariationValue != "") // Only keep if there is variant sequence information and position information
+            else if (FeatureType == "sequence variant" && VariationValue != null && VariationValue != "")
             {
-                ParseAnnotatedMods(OneBasedVariantModifications, modTypesToExclude, unknownModifications, AnnotatedVariantMods);
-                if (OneBasedBeginPosition != null && OneBasedEndPosition != null)
+                bool appliesToThisSequence = true;
+                if (!string.IsNullOrEmpty(LocationSequenceId))
                 {
-                    SequenceVariations.Add(new SequenceVariation((int)OneBasedBeginPosition, (int)OneBasedEndPosition, OriginalValue, VariationValue, FeatureDescription, OneBasedVariantModifications));
+                    string acc = Accession ?? "";
+                    appliesToThisSequence =
+                        LocationSequenceId.Equals(acc, StringComparison.OrdinalIgnoreCase)
+                        || (!string.IsNullOrEmpty(acc) && LocationSequenceId.Equals($"{acc}-1", StringComparison.OrdinalIgnoreCase));
                 }
-                else if (OneBasedFeaturePosition >= 1)
+
+                if (appliesToThisSequence)
                 {
-                    SequenceVariations.Add(new SequenceVariation(OneBasedFeaturePosition, OriginalValue, VariationValue, FeatureDescription, OneBasedVariantModifications));
+                    ParseAnnotatedMods(OneBasedVariantModifications, modTypesToExclude, unknownModifications, AnnotatedVariantMods);
+
+                    // NOTE: We can NOT validate coordinate vs sequence length here because sequence is usually parsed later.
+                    // Validation is deferred to PruneOutOfRangeSequenceVariants() during ParseEntryEndElement.
+
+                    if (OneBasedBeginPosition != null && OneBasedEndPosition != null)
+                    {
+                        SequenceVariations.Add(
+                            new SequenceVariation(
+                                (int)OneBasedBeginPosition,
+                                (int)OneBasedEndPosition,
+                                OriginalValue,
+                                VariationValue,
+                                FeatureDescription,
+                                //variantCallFormatDataString: null,
+                                oneBasedModifications: OneBasedVariantModifications));
+                    }
+                    else if (OneBasedFeaturePosition >= 1)
+                    {
+                        SequenceVariations.Add(
+                            new SequenceVariation(
+                                OneBasedFeaturePosition,
+                                OriginalValue,
+                                VariationValue,
+                                FeatureDescription,
+                                //variantCallFormatDataString: null,
+                                oneBasedModifications: OneBasedVariantModifications));
+                    }
+
+                    AnnotatedVariantMods = new List<(int, string)>();
+                    OneBasedVariantModifications = new Dictionary<int, List<Modification>>();
                 }
-                AnnotatedVariantMods = new List<(int, string)>();
-                OneBasedVariantModifications = new Dictionary<int, List<Modification>>();
             }
             else if (FeatureType == "disulfide bond")
             {
@@ -491,64 +643,7 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable<string> modTypesTo
             OneBasedFeaturePosition = -1;
             OriginalValue = "";
             VariationValue = "";
-        }
-
-        private static void ParseAnnotatedMods(Dictionary<int, List<Modification>> destination, IEnumerable<string> modTypesToExclude,
-            Dictionary<string, Modification> unknownModifications, List<(int, string)> annotatedMods)
-        {
-            foreach (var annotatedMod in annotatedMods)
-            {
-                string annotatedId = annotatedMod.Item2;
-                int annotatedModLocation = annotatedMod.Item1;
-
-                if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod)
-                    || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod))
-                {
-                    // if the list of known mods contains this IdWithMotif
-                    if (!modTypesToExclude.Contains(foundMod.ModificationType))
-                    {
-                        if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
-                        {
-                            listOfModsAtThisLocation.Add(foundMod);
-                        }
-                        else
-                        {
-                            destination.Add(annotatedModLocation, new List<Modification> { foundMod });
-                        }
-                    }
-                    // else - the mod ID was found but the motif didn't fit the annotated location
-                }
-
-                // no known mod - try looking it up in the dictionary of mods without motif appended
-                else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList<Modification> mods)
-                         || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods))
-                {
-                    foreach (Modification mod in mods)
-                    {
-                        if (!modTypesToExclude.Contains(mod.ModificationType))
-                        {
-                            if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
-                            {
-                                listOfModsAtThisLocation.Add(mod);
-                            }
-                            else
-                            {
-                                destination.Add(annotatedModLocation, new List<Modification> { mod });
-                            }
-                            break;
-                        }
-                    }
-                }
-                else
-                {
-                    // could not find the annotated mod's ID in our list of known mods - it's an unknown mod
-                    // I don't think this really does anything...
-                    if (!unknownModifications.ContainsKey(annotatedId))
-                    {
-                        unknownModifications.Add(annotatedId, new Modification(annotatedId));
-                    }
-                }
-            }
+            LocationSequenceId = null;
         }
 
         /// <summary>
@@ -604,6 +699,86 @@ private void Clear()
             GeneNames = new List<Tuple<string, string>>();
             ReadingGene = false;
             ReadingOrganism = false;
+            LocationSequenceId = null;
+            AnnotatedVariantMods = new List<(int, string)>();
+            OneBasedVariantModifications = new Dictionary<int, List<Modification>>();
+        }
+        /// <summary>
+        /// Resolves and attaches annotated modifications to the specified destination dictionary based on parsed feature or variant annotations.
+        /// For each annotated modification, attempts to look up the modification by its identifier (with motif) in both protein and RNA modification dictionaries.
+        /// If found and not excluded by <paramref name="modTypesToExclude"/>, the modification is added to the destination at the specified position.
+        /// If not found by identifier, attempts to resolve the modification by possible matches (without motif) and adds the first non-excluded match.
+        /// If no match is found, records the modification as unknown in <paramref name="unknownModifications"/> to avoid repeated warnings.
+        /// This method is used to populate the protein or variant modification dictionaries during XML parsing.
+        /// </summary>
+        /// <param name="destination">Dictionary mapping one-based positions to lists of modifications to be populated.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from assignment.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved by identifier or type.</param>
+        /// <param name="annotatedMods">List of (position, modification identifier) tuples parsed from XML features or subfeatures.</param>
+        private static void ParseAnnotatedMods(
+            Dictionary<int, List<Modification>> destination,
+            IEnumerable<string> modTypesToExclude,
+            Dictionary<string, Modification> unknownModifications,
+            List<(int position, string originalModificationID)> annotatedMods)
+        {
+            foreach (var (annotatedModLocation, annotatedId) in annotatedMods)
+            {
+                if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod)
+                    || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod))
+                {
+                    if (!modTypesToExclude.Contains(foundMod.ModificationType))
+                    {
+                        if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
+                        {
+                            listOfModsAtThisLocation.Add(foundMod);
+                        }
+                        else
+                        {
+                            destination.Add(annotatedModLocation, new List<Modification> { foundMod });
+                        }
+                    }
+                }
+                else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList<Modification> mods)
+                         || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods))
+                {
+                    foreach (Modification mod in mods)
+                    {
+                        if (!modTypesToExclude.Contains(mod.ModificationType))
+                        {
+                            if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
+                            {
+                                listOfModsAtThisLocation.Add(mod);
+                            }
+                            else
+                            {
+                                destination.Add(annotatedModLocation, new List<Modification> { mod });
+                            }
+                            break;
+                        }
+                    }
+                }
+                else
+                {
+                    if (!unknownModifications.ContainsKey(annotatedId))
+                    {
+                        unknownModifications.Add(annotatedId, new Modification(annotatedId));
+                    }
+                }
+            }
+        }
+        private void PruneOutOfRangeSequenceVariants()
+        {
+            if (string.IsNullOrEmpty(Sequence) || SequenceVariations.Count == 0)
+                return;
+
+            int len = Sequence.Length;
+            int removed = SequenceVariations.RemoveAll(v =>
+                v.OneBasedBeginPosition > len || v.OneBasedEndPosition > len);
+
+            if (removed > 0)
+            {
+                Trace.TraceWarning($"Pruned {removed} out-of-range sequence variant(s) for accession {Accession} (protein length {len}).");
+            }
         }
     }
 }
\ No newline at end of file