From d09576c62f0308971f8af4f4724564470d5bf143 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Wed, 22 Oct 2025 15:13:05 -0500
Subject: [PATCH 01/38] rename sequence variant description as variant call
 format

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs   | 10 +++---
 mzLib/Omics/BioPolymer/VariantApplication.cs  | 32 +++++++++----------
 ...antDescription.cs => VariantCallFormat.cs} |  6 ++--
 mzLib/Omics/IBioPolymerWithSetMods.cs         |  2 +-
 mzLib/Proteomics/Protein/DisulfideBond.cs     |  2 +-
 .../MsFraggerPeptide.cs                       |  2 +-
 .../MsFraggerProtein.cs                       |  2 +-
 .../IndividualResultRecords/MsFraggerPsm.cs   |  2 +-
 .../SpectrumMatchFromTsvHeader.cs             |  4 +--
 mzLib/Readers/Thermo/ThermoRawFileReader.cs   |  2 +-
 mzLib/Test/DatabaseTests/TestProteinReader.cs |  6 ++--
 .../DatabaseTests/TestProteomicsReadWrite.cs  |  8 ++---
 .../Test/DatabaseTests/TestVariantProtein.cs  |  6 ++--
 mzLib/Test/FlashLFQ/TestIsoTracker.cs         | 32 +++++++++----------
 mzLib/Test/Transcriptomics/TestDbLoader.cs    |  2 +-
 .../DecoyGeneration/DecoyProteinGenerator.cs  | 16 +++++-----
 .../DecoyGeneration/RnaDecoyGenerator.cs      |  4 +--
 .../ProteinDbWriter.cs                        |  6 ++--
 .../Transcriptomics/RnaDbLoader.cs            |  2 +-
 19 files changed, 73 insertions(+), 73 deletions(-)
 rename mzLib/Omics/BioPolymer/{SequenceVariantDescription.cs => VariantCallFormat.cs} (95%)
diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index 22f0347b4..d48df14b0 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -20,7 +20,7 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
             OneBasedEndPosition = oneBasedEndPosition;
             OriginalSequence = originalSequence ?? "";
             VariantSequence = variantSequence ?? "";
-            Description = new SequenceVariantDescription(description);
+            CallFormat = new VariantCallFormat(description);
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
@@ -58,9 +58,9 @@ public SequenceVariation(int oneBasedPosition, string originalSequence, string v
         public string VariantSequence { get; }
 
         /// <summary>
-        /// Description of this variation (optional)
+        /// CallFormat of this variation (optional)
         /// </summary>
-        public SequenceVariantDescription Description { get; }
+        public VariantCallFormat CallFormat { get; }
 
         /// <summary>
         /// Modifications specifically for this variant
@@ -75,7 +75,7 @@ public override bool Equals(object obj)
                 && OneBasedEndPosition == s.OneBasedEndPosition
                 && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence))
                 && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence))
-                && (s.Description == null && Description == null || Description.Equals(s.Description))
+                && (s.CallFormat == null && CallFormat == null || CallFormat.Equals(s.CallFormat))
                 && (s.OneBasedModifications == null && OneBasedModifications == null ||
                     s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList())
                     && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
@@ -87,7 +87,7 @@ public override int GetHashCode()
                 ^ OneBasedEndPosition.GetHashCode()
                 ^ OriginalSequence.GetHashCode() // null handled in constructor
                 ^ VariantSequence.GetHashCode() // null handled in constructor
-                ^ Description.GetHashCode(); // always constructed in constructor
+                ^ CallFormat.GetHashCode(); // always constructed in constructor
         }
 
         /// <summary>
diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index 1669c3afe..a9ee258f5 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -25,7 +25,7 @@ public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this
         {
             protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
             protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
-            if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.Description == null || v.Description.Genotypes.Count == 0))
+            if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.CallFormat == null || v.CallFormat.Genotypes.Count == 0))
             {
                 // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines
                 return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList();
@@ -100,7 +100,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
             List<SequenceVariation> uniqueEffectsToApply = sequenceVariations
                 .GroupBy(v => v.SimpleString())
                 .Select(x => x.First())
-                .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line
+                .Where(v => v.CallFormat.Genotypes.Count > 0) // this is a VCF line
                 .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first
                 .ToList();
 
@@ -112,7 +112,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 return new List<TBioPolymerType> { proteinCopy };
             }
 
-            HashSet<string> individuals = new HashSet<string>(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys));
+            HashSet<string> individuals = new HashSet<string>(uniqueEffectsToApply.SelectMany(v => v.CallFormat.Genotypes.Keys));
             List<TBioPolymerType> variantProteins = new();
             List<TBioPolymerType> newVariantProteins = new();
             // loop through genotypes for each sample/individual (e.g. tumor and normal)
@@ -121,17 +121,17 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 newVariantProteins.Clear();
                 newVariantProteins.Add(proteinCopy);
 
-                bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
+                bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.CallFormat.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
                 foreach (var variant in uniqueEffectsToApply)
                 {
-                    bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff
+                    bool variantAlleleIsInTheGenotype = variant.CallFormat.Genotypes[individual].Contains(variant.CallFormat.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff
                     if (!variantAlleleIsInTheGenotype)
                     {
                         continue;
                     }
-                    bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now.
-                    bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
-                    bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
+                    bool isHomozygousAlternate = variant.CallFormat.Homozygous[individual] && variant.CallFormat.Genotypes[individual].All(d => d == variant.CallFormat.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now.
+                    bool isDeepReferenceAllele = int.TryParse(variant.CallFormat.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
+                    bool isDeepAlternateAllele = int.TryParse(variant.CallFormat.AlleleDepths[individual][variant.CallFormat.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
 
                     // homozygous alternate
                     if (isHomozygousAlternate && isDeepAlternateAllele)
@@ -141,7 +141,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
 
                     // heterozygous basic
                     // first protein with variants contains all homozygous variation, second contains all variations
-                    else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants)
+                    else if (variant.CallFormat.Heterozygous[individual] && tooManyHeterozygousVariants)
                     {
                         if (isDeepAlternateAllele && isDeepReferenceAllele)
                         {
@@ -170,7 +170,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                     }
 
                     // heterozygous combinitorics
-                    else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
+                    else if (variant.CallFormat.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
                     {
                         List<TBioPolymerType> combinitoricProteins = new();
 
@@ -179,7 +179,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                             if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele)
                             {
                                 // keep reference allele
-                                if (variant.Description.Genotypes[individual].Contains("0"))
+                                if (variant.CallFormat.Genotypes[individual].Contains("0"))
                                 {
                                     combinitoricProteins.Add(ppp);
                                 }
@@ -191,7 +191,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                             {
                                 combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
                             }
-                            else if (variant.Description.Genotypes[individual].Contains("0"))
+                            else if (variant.CallFormat.Genotypes[individual].Contains("0"))
                             {
                                 combinitoricProteins.Add(ppp);
                             }
@@ -224,7 +224,7 @@ private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVaria
                 variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
                 variantGettingApplied.OriginalSequence,
                 variantGettingApplied.VariantSequence,
-                variantGettingApplied.Description.Description,
+                variantGettingApplied.CallFormat.Description,
                 variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
 
             // check to see if there is incomplete indel overlap, which would lead to weird variant sequences
@@ -271,7 +271,7 @@ private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVa
 
                 // variant was entirely before the one being applied (shouldn't happen because of order of applying variants)
                 // or it's the current variation
-                if (v.Description.Equals(variantGettingApplied.Description) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
+                if (v.CallFormat.Equals(variantGettingApplied.CallFormat) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
                 {
                     variations.Add(v);
                 }
@@ -299,7 +299,7 @@ private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVa
                         end,
                         v.OriginalSequence,
                         v.VariantSequence,
-                        v.Description.Description,
+                        v.CallFormat.Description,
                         v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
                 }
             }
@@ -425,7 +425,7 @@ private static string CombineSimpleStrings(IEnumerable<SequenceVariation>? varia
         /// </summary>
         public static string CombineDescriptions(IEnumerable<SequenceVariation>? variations)
         {
-            return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.Description));
+            return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.CallFormat));
         }
         /// <summary>
         /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object,
diff --git a/mzLib/Omics/BioPolymer/SequenceVariantDescription.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
similarity index 95%
rename from mzLib/Omics/BioPolymer/SequenceVariantDescription.cs
rename to mzLib/Omics/BioPolymer/VariantCallFormat.cs
index 4b4b9d81a..386d97705 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariantDescription.cs
+++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
@@ -4,9 +4,9 @@
 
 namespace Omics.BioPolymer
 {
-    public class SequenceVariantDescription
+    public class VariantCallFormat
     {
-        public SequenceVariantDescription(string description)
+        public VariantCallFormat(string description)
         {
             Description = description;
             if (description == null)
@@ -67,7 +67,7 @@ public override string ToString()
 
         public override bool Equals(object obj)
         {
-            SequenceVariantDescription s = obj as SequenceVariantDescription;
+            VariantCallFormat s = obj as VariantCallFormat;
             return s != null && s.Description == Description;
         }
 
diff --git a/mzLib/Omics/IBioPolymerWithSetMods.cs b/mzLib/Omics/IBioPolymerWithSetMods.cs
index e475a3b5d..f065238a4 100644
--- a/mzLib/Omics/IBioPolymerWithSetMods.cs
+++ b/mzLib/Omics/IBioPolymerWithSetMods.cs
@@ -25,7 +25,7 @@ public interface IBioPolymerWithSetMods : IHasChemicalFormula, IEquatable<IBioPo
         int OneBasedEndResidue { get; }
         int MissedCleavages { get; }
         /// <summary>
-        /// Description of where the BioPolymerWithSetMods originated from examples include
+        /// CallFormat of where the BioPolymerWithSetMods originated from examples include
         /// Top-down truncation: full-length proteoform C-terminal digestion truncation
         /// Top-down truncation: DECOY full-length proteoform N-terminal digestion truncation
         /// Bottom-up search: full
diff --git a/mzLib/Proteomics/Protein/DisulfideBond.cs b/mzLib/Proteomics/Protein/DisulfideBond.cs
index 6bd4fa1af..a338957e8 100644
--- a/mzLib/Proteomics/Protein/DisulfideBond.cs
+++ b/mzLib/Proteomics/Protein/DisulfideBond.cs
@@ -25,7 +25,7 @@ public DisulfideBond(int OneBasedPosition, string Description)
         public int OneBasedEndPosition { get; set; }
 
         /// <summary>
-        /// Description of this variation (optional)
+        /// CallFormat of this variation (optional)
         /// </summary>
         public string Description { get; set; }
 
diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs
index 32def7c09..d7eac04ca 100644
--- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs
+++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs
@@ -79,7 +79,7 @@ public string ProteinName
         [Name("Gene")]
         public string Gene { get; set; }
 
-        [Name("Protein Description")]
+        [Name("Protein CallFormat")]
         public string ProteinDescription { get; set; }
 
         [Name("Mapped Genes")]
diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs
index 37c043590..fea95ac3b 100644
--- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs
+++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs
@@ -38,7 +38,7 @@ public class MsFraggerProtein
         [Name("Organism")]
         public string Organism { get; set; }
 
-        [Name("Protein Description", "Description")]
+        [Name("Protein CallFormat", "CallFormat")]
         public string Description { get; set; }
 
         [Name("Protein Existence")]
diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs
index e5272a779..8ee977e6e 100644
--- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs
+++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs
@@ -124,7 +124,7 @@ public class MsFraggerPsm : IQuantifiableRecord
         [Name("Gene")]
         public string Gene { get; set; }
 
-        [Name("Protein Description")]
+        [Name("Protein CallFormat")]
         public string ProteinDescription { get; set; }
 
         [Name("Mapped Genes")]
diff --git a/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs b/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs
index 7ca469da5..d71eac995 100644
--- a/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs
+++ b/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs
@@ -46,7 +46,7 @@ public class SpectrumMatchFromTsvHeader
         public const string SpliceSites = "Splice Sites";
         public const string Contaminant = "Contaminant";
         public const string Decoy = "Decoy";
-        public const string Description = "Description";
+        public const string Description = "CallFormat";
         public const string StartAndEndResiduesInFullSequence = "Start and End Residues In Full Sequence";
         public const string PreviousResidue = "Previous Residue";
         public const string NextResidue = "Next Residue";
@@ -78,7 +78,7 @@ public class SpectrumMatchFromTsvHeader
         public const string PeptideMonoMass = "Peptide Monoisotopic Mass";
         public const string ProteinAccession = "Protein Accession";
         public const string ProteinName = "Protein Name";
-        public const string PeptideDescription = "Peptide Description";
+        public const string PeptideDescription = "Peptide CallFormat";
         public const string StartAndEndResiduesInProtein = "Start and End Residues In Protein";
         public const string PreviousAminoAcid = "Previous Amino Acid";
         public const string NextAminoAcid = "Next Amino Acid";
diff --git a/mzLib/Readers/Thermo/ThermoRawFileReader.cs b/mzLib/Readers/Thermo/ThermoRawFileReader.cs
index 7ed5456ea..7a61a0639 100644
--- a/mzLib/Readers/Thermo/ThermoRawFileReader.cs
+++ b/mzLib/Readers/Thermo/ThermoRawFileReader.cs
@@ -306,7 +306,7 @@ private static MsDataScan GetOneBasedScan(IRawDataPlus rawFile, IFilteringParams
                     HcdEnergy = values[i];
                 }
 
-                if (labels[i].StartsWith("Scan Description", StringComparison.Ordinal))
+                if (labels[i].StartsWith("Scan CallFormat", StringComparison.Ordinal))
                 {
                     scanDescript = values[i].TrimEnd();
                 }
diff --git a/mzLib/Test/DatabaseTests/TestProteinReader.cs b/mzLib/Test/DatabaseTests/TestProteinReader.cs
index 7dcd0b4d8..cbfc4c603 100644
--- a/mzLib/Test/DatabaseTests/TestProteinReader.cs
+++ b/mzLib/Test/DatabaseTests/TestProteinReader.cs
@@ -125,7 +125,7 @@ public static void XmlTest()
             Assert.AreEqual(64, ok[0].SequenceVariations.First().OneBasedEndPosition);
             Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreNotEqual(ok[0].SequenceVariations.First().Description, ok[1].SequenceVariations.First().Description); //decoys and target variations don't have the same desc.
+            Assert.AreNotEqual(ok[0].SequenceVariations.First().CallFormat, ok[1].SequenceVariations.First().CallFormat); //decoys and target variations don't have the same desc.
             Assert.AreEqual("Homo sapiens", ok[1].Organism);
         }
 
@@ -420,8 +420,8 @@ public static void TestReverseDecoyXML_WithCustomIdentifier()
 
                 foreach (var variant in protein.AppliedSequenceVariations)
                 {
-                    Assert.That(variant.Description, Does.StartWith("rev"));
-                    Assert.That(variant.Description, Does.Not.StartWith("DECOY"));
+                    Assert.That(variant.CallFormat, Does.StartWith("rev"));
+                    Assert.That(variant.CallFormat, Does.Not.StartWith("DECOY"));
                 }
 
                 foreach (var bond in protein.DisulfideBonds)
diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
index babe44a76..07acea46b 100644
--- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
+++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
@@ -504,12 +504,12 @@ public void TestFullProteinReadWrite()
             Assert.AreEqual(originalProtein.TruncationProducts.First().OneBasedEndPosition, proteinReadFromXml[0].TruncationProducts.First().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.TruncationProducts.First().Type, proteinReadFromXml[0].TruncationProducts.First().Type.Split('(')[0]);
 
-            Assert.AreEqual(originalProtein.SequenceVariations.First().Description, proteinReadFromXml[0].SequenceVariations.First().Description);
+            Assert.AreEqual(originalProtein.SequenceVariations.First().CallFormat, proteinReadFromXml[0].SequenceVariations.First().CallFormat);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OriginalSequence, proteinReadFromXml[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(originalProtein.SequenceVariations.First().VariantSequence, proteinReadFromXml[0].SequenceVariations.First().VariantSequence);
-            Assert.AreEqual(originalProtein.SequenceVariations.Last().Description, proteinReadFromXml[0].SequenceVariations.Last().Description);
+            Assert.AreEqual(originalProtein.SequenceVariations.Last().CallFormat, proteinReadFromXml[0].SequenceVariations.Last().CallFormat);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedBeginPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OriginalSequence, proteinReadFromXml[0].SequenceVariations.Last().OriginalSequence);
@@ -534,7 +534,7 @@ public void TestReadWriteSeqVars()
             Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count());
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description);
+            Assert.AreEqual(ok[0].SequenceVariations.First().CallFormat, ok2[0].SequenceVariations.First().CallFormat);
             Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence);
         }
@@ -557,7 +557,7 @@ public void TestReadWriteSeqVars2()
             Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count());
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description);
+            Assert.AreEqual(ok[0].SequenceVariations.First().CallFormat, ok2[0].SequenceVariations.First().CallFormat);
             Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence);
         }
diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 5580544ef..27d545cfe 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -115,7 +115,7 @@ public static void SeqVarXmlTest()
             {
                 Assert.AreEqual(s.OriginalSequence, decoy.BaseSequence.Substring(s.OneBasedBeginPosition - 1, s.OneBasedEndPosition - s.OneBasedBeginPosition + 1));
             }
-            Assert.AreNotEqual(target.SequenceVariations.First().Description, decoy.SequenceVariations.First().Description); //decoys and target variations don't have the same desc.
+            Assert.AreNotEqual(target.SequenceVariations.First().CallFormat, decoy.SequenceVariations.First().CallFormat); //decoys and target variations don't have the same desc.
 
             List<PeptideWithSetModifications> peptides = ok.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList();
         }
@@ -490,7 +490,7 @@ public void VariantSymbolWeirdnessXml()
             string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "SeqVarSymbolWeirdness.xml");
             List<Protein> variantProteins = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un);
             Assert.AreEqual(12, variantProteins.First().ConsensusVariant.SequenceVariations.Count());
-            Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.Description.Heterozygous.Any(kv => kv.Value)));
+            Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.CallFormat.Heterozygous.Any(kv => kv.Value)));
 
             Assert.AreEqual(1, variantProteins.Count); // Should be 2^2 from combinitorics of heterozygous, but the giant indels overwrite them
             Assert.AreEqual(0, variantProteins.Where(v => v.BaseSequence == variantProteins.First().ConsensusVariant.BaseSequence).Count()); // Homozygous variations are included
@@ -749,7 +749,7 @@ public void Constructor_ParsesDescriptionCorrectly()
 
 
             // Act
-            var svd = new SequenceVariantDescription(description);
+            var svd = new VariantCallFormat(description);
 
             // Assert
             Assert.AreEqual(description, svd.Description);
diff --git a/mzLib/Test/FlashLFQ/TestIsoTracker.cs b/mzLib/Test/FlashLFQ/TestIsoTracker.cs
index 59720e4ff..562126ec8 100644
--- a/mzLib/Test/FlashLFQ/TestIsoTracker.cs
+++ b/mzLib/Test/FlashLFQ/TestIsoTracker.cs
@@ -25,7 +25,7 @@ internal class TestIsoTracker
         [Test]
         public static void TestIsobaricPeptideGroup()
         {
-            // Description: Test the IsobaricPeptideGroup class
+            // CallFormat: Test the IsobaricPeptideGroup class
             // In this testing, we will create a new IsobaricPeptideGroup and check the properties
             List<Identification> ids = new List<Identification>
             {
@@ -126,7 +126,7 @@ public static void TestIsoTrackerIdFilter_FilterPeptide()
         [Test]
         public static void TestGetTargeMz_case1()
         {
-            // Description: Test the GetTargetMz function in FlashLfqEngine
+            // CallFormat: Test the GetTargetMz function in FlashLfqEngine
             // In this testing, we will check the isobaricPeptideGroup and targetMzs output
             // All three ids are isobaric peptides with the same monoisotopic mass, so they should be grouped together and generate only 5 target m/z values
 
@@ -189,7 +189,7 @@ public static void TestGetTargeMz_case1()
         [Test]
         public static void TestGetTargeMz_case2()
         {
-            // Description: Test the GetTargetMz function in FlashLfqEngine
+            // CallFormat: Test the GetTargetMz function in FlashLfqEngine
             // In this testing, we will check the isobaricPeptideGroup and targetMzs output
             // All three ids are isobaric peptides with the different monoisotopic mass, so they should be grouped together and generate 15(3*5) target m/z values
 
@@ -253,7 +253,7 @@ public static void TestGetTargeMz_case2()
         [Test]
         public static void TestIndexPeakPrune()
         {
-            // Description: Test the peak indexing engine pruning function
+            // CallFormat: Test the peak indexing engine pruning function
             // In this test, we will create the targetMzs from the ids to prune the indexPeaks.
             // After pruning, the index engine should only keep the peaks with the target m/z values.
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -329,7 +329,7 @@ public static void TestXICConstructor()
         [Test]
         public static void TestLinearSpline()
         {
-            //Description: Test the linear spline interpolation and differentiation
+            //CallFormat: Test the linear spline interpolation and differentiation
             //The testing model is a linear function y = 100x, where x is the time point and y is the intensity
             //The slope will be 100 and the second derivative will be 0
 
@@ -361,7 +361,7 @@ public static void TestLinearSpline()
         [Test]
         public static void TestPeakAlignment()
         {
-            //Description: Test the peak alignment function
+            //CallFormat: Test the peak alignment function
             //The testing model is a triangle peak with the Apex.
             //The Apex of three peaks are 3, 3.1, 2.9 min
             //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3
@@ -410,7 +410,7 @@ public static void TestPeakAlignment()
         [Test]
         public static void TestBuildSmoothedCubicSpline_LessPoint()
         {
-            //Description: Test the cubic spline interpolation
+            //CallFormat: Test the cubic spline interpolation
             //The testing model has less than 5 points that cannot build the cubic spline
             //The cubic spline should be null
 
@@ -531,7 +531,7 @@ public static void TestXICGroupConstructor()
         [Test]
         public static void TestXICGroup_RtDict()
         {
-            //Description: Test the peakAlignment function in the XICGroup
+            //CallFormat: Test the peakAlignment function in the XICGroup
             //The testing has three normal distribution XIC peaks.
             //The Apex of three peaks are 3, 3.1, 2.9 min
             //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3
@@ -582,7 +582,7 @@ public static void TestXICGroup_RtDict()
         [Test]
         public static void TestXICGroup_IdList()
         {
-            //Description: Test the IdList in the XICGroup
+            //CallFormat: Test the IdList in the XICGroup
             //The testing model has three XICs, one of this XIC has no Id, then it borrows one Id from the first XIC
             //If the Id is borrowed, the Id will not be added into the IdList
             //The IdList should contain the Ids from the first and the third XIC
@@ -617,7 +617,7 @@ public static void TestXICGroup_IdList()
         [Test]
         public static void TestXICGroup_Tracking()
         {
-            //Description: Test the peak tracking function in the XICGroup
+            //CallFormat: Test the peak tracking function in the XICGroup
             //The testing has three normal distribution XIC peaks.
             //The Apex of three peaks are 20, 23, 17 min
             //The time shift should be +3 min for the peak2 and -3 min for the peak3
@@ -688,7 +688,7 @@ public static void TestXICGroup_Tracking()
         [Test]
         public static void TestCombinedSearching()
         {
-            //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
+            //CallFormat: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
             //There are three XIC included isobaric peaks that with 3 min gap.
 
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -796,7 +796,7 @@ public static void TestCombinedSearching()
         [Test]
         public static void TestPeakOutput()
         {
-            //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
+            //CallFormat: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
             //There are three XIC included isobaric peaks that with 3 min gap.
 
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -931,7 +931,7 @@ public static void TestPeakOutput()
         [Test]
         public static void TestIsoSequence_Ambiguous()
         {
-            //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
+            //CallFormat: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
             //IsoID: DIVENY[Common Variable:Oxidation on M]FMR   should be the same as DIVENYFM[Common Variable:Oxidation on M]R
 
             //Try to turn on the MBR and Isotracker at the same time
@@ -1066,7 +1066,7 @@ public static void TestIsoSequence_Ambiguous()
         [Test]
         public static void TestIsoSequence_MonoIsotopicMassTolerance()
         {
-            //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
+            //CallFormat: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
             //IsoID: DIVENY[Common Variable:Oxidation on M]FMR   should be the same as DIVENYFM[Common Variable:Oxidation on M]R
             //The Monoisotopic mass are 1201.5436, 1201.5437, 1201.5438, they should be recognized as the same IsoID
 
@@ -1491,7 +1491,7 @@ public static void TestIsoSequence_CombinedTesting()
         [Test]
         public static void TestRun_SearchingTarget()
         {
-            //Description: we will upload a motifList for IsoTracker
+            //CallFormat: we will upload a motifList for IsoTracker
             //Only peptide with motif on N can be searched
             //In this case, only one kind of peptide can be searched: baseSequence PEPNINEN -> PEPN[Mod]INEN, PEPNIN[Mod]EN, PEPNINEN[Mod]
             // Run 1 with PEPNIN[Mod]EN, PEPNINEN[Mod]
@@ -1615,7 +1615,7 @@ public static void TestRun_SearchingTarget()
         [Test]
         public static void TestRun_IDChecking()
         {
-            //Description: we will turn on the IDchecking for IsoTracker
+            //CallFormat: we will turn on the IDchecking for IsoTracker
             //Only when one XIC with more than one id, we do the searching
             //In this case, run 1 has 4 ids (pepA_1, pepA_2, pepB_1, pepC_1)
             //run 2 has 3 ids (pepA_1, pepB_1, pepC_1)
diff --git a/mzLib/Test/Transcriptomics/TestDbLoader.cs b/mzLib/Test/Transcriptomics/TestDbLoader.cs
index db5de6417..cea7bb7c7 100644
--- a/mzLib/Test/Transcriptomics/TestDbLoader.cs
+++ b/mzLib/Test/Transcriptomics/TestDbLoader.cs
@@ -291,7 +291,7 @@ public static void TestEnsemblFastaParsing()
             Assert.That(fields["GeneBiotype"], Is.EqualTo("snRNA"));
             Assert.That(fields["TranscriptBiotype"], Is.EqualTo("snRNA"));
             Assert.That(fields["GeneSymbol"], Is.EqualTo("U6"));
-            Assert.That(fields["Description"], Does.Contain("U6 spliceosomal RNA"));
+            Assert.That(fields["CallFormat"], Does.Contain("U6 spliceosomal RNA"));
 
         }
 
diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
index c83c80aa1..949cf68a3 100644
--- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
+++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
@@ -239,34 +239,34 @@ private static List<SequenceVariation> ReverseSequenceVariations(IEnumerable<Seq
                     decoyVariations.Add(new SequenceVariation(sv.OneBasedBeginPosition,
                         reversedSequence.Substring(sv.OneBasedBeginPosition - 1, sv.OneBasedEndPosition - sv.OneBasedBeginPosition + 1),
                         new string(variationArray).Substring(1, variationArray.Length - 1) + variationArray[0],
-                        $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                        $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
                 }
                 // start loss, so the variant is at the end
                 else if (startLoss)
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, new string(originalArray).Substring(0, originalArray.Length - 1), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, new string(originalArray).Substring(0, originalArray.Length - 1), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
                 }
                 // both start with M, but there's more
                 else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1 && (sv.OriginalSequence.Length > 1 || sv.VariantSequence.Length > 1))
                 {
                     string original = new string(originalArray).Substring(0, originalArray.Length - 1);
                     string variant = new string(variationArray).Substring(0, variationArray.Length - 1);
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
                 }
                 // gained an initiating methionine
                 else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1)
                 {
-                    decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
                 }
                 // starting methionine, but no variations on it
                 else if (startsWithM)
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
                 }
                 // no starting methionine
                 else
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
                 }
             }
             return decoyVariations;
@@ -335,7 +335,7 @@ private static List<Protein> GenerateSlideDecoys(List<Protein> proteins, int max
                         {
                             variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, true)];
                         }
-                        decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.Description));
+                        decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.CallFormat));
                     }
                     else
                     {
@@ -352,7 +352,7 @@ private static List<Protein> GenerateSlideDecoys(List<Protein> proteins, int max
                             variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, initMet)];
                         }
 
-                        decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.Description));
+                        decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.CallFormat));
                     }
                 }
                 var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide,
diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
index cc7723c15..731f80bc7 100644
--- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
+++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
@@ -87,7 +87,7 @@ private static List<T> GenerateReverseDecoys<T>(List<T> nucleicAcids, int maxThr
                             var reverseModKey = indexMapping[modKvp.Key];
                             reverseModificationsForVariation.Add(reverseModKey, modKvp.Value);
                         }
-                        reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation));
+                        reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.CallFormat.Description, reverseModificationsForVariation));
                     }
 
                     // Reverse Applied Variants
@@ -101,7 +101,7 @@ private static List<T> GenerateReverseDecoys<T>(List<T> nucleicAcids, int maxThr
                             var reverseModKey = indexMapping[modKvp.Key];
                             reverseModificationsForVariation.Add(reverseModKey, modKvp.Value);
                         }
-                        reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation));
+                        reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.CallFormat.Description, reverseModificationsForVariation));
                     }
 
                     // Reverse Truncations
diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
index 24ff44dd4..2cff6c0af 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
@@ -206,7 +206,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     {
                         writer.WriteStartElement("feature");
                         writer.WriteAttributeString("type", "sequence variant");
-                        writer.WriteAttributeString("description", hm.Description.ToString());
+                        writer.WriteAttributeString("description", hm.CallFormat.ToString());
                         writer.WriteStartElement("original");
                         writer.WriteString(hm.OriginalSequence);
                         writer.WriteEndElement(); // original
@@ -252,7 +252,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     //{
                     //    writer.WriteStartElement("feature");
                     //    writer.WriteAttributeString("type", "splice site");
-                    //    writer.WriteAttributeString("description", hm.Description);
+                    //    writer.WriteAttributeString("description", hm.CallFormat);
                     //    writer.WriteStartElement("location");
                     //    if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition)
                     //    {
@@ -464,7 +464,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     {
                         writer.WriteStartElement("feature");
                         writer.WriteAttributeString("type", "sequence variant");
-                        writer.WriteAttributeString("description", hm.Description.ToString());
+                        writer.WriteAttributeString("description", hm.CallFormat.ToString());
                         writer.WriteStartElement("original");
                         writer.WriteString(hm.OriginalSequence);
                         writer.WriteEndElement(); // original
diff --git a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs
index b80827568..36742bf8d 100644
--- a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs
+++ b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs
@@ -89,7 +89,7 @@ public static RnaFastaHeaderType DetectRnaFastaHeaderType(string line)
                 { "GeneBiotype", new FastaHeaderFieldRegex("GeneBiotype", @"gene_biotype:([^\s]+)", 0, 1) },
                 { "TranscriptBiotype", new FastaHeaderFieldRegex("TranscriptBiotype", @"transcript_biotype:([^\s]+)", 0, 1) },
                 { "GeneSymbol", new FastaHeaderFieldRegex("GeneSymbol", @"gene_symbol:([^\s]+)", 0, 1) },
-                { "Description", new FastaHeaderFieldRegex("Description", @"description:([^\[]+)", 0, 1) },
+                { "CallFormat", new FastaHeaderFieldRegex("CallFormat", @"description:([^\[]+)", 0, 1) },
             };
 
         public static readonly Dictionary<string, FastaHeaderFieldRegex> NcbiAssemblyFieldRegexes =

From acae84e47201a80c3255d4662ff859929d7b4db4 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Wed, 22 Oct 2025 15:47:15 -0500
Subject: [PATCH 02/38] final unit test

---
 mzLib/Omics/BioPolymer/VariantCallFormat.cs | 326 ++++++++++++++++++--
 1 file changed, 293 insertions(+), 33 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
index 386d97705..906d0ada0 100644
--- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs
+++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
@@ -4,57 +4,236 @@
 
 namespace Omics.BioPolymer
 {
+    /// <summary>
+    /// Plain-language wrapper for a single VCF record (a line in a VCF file) with
+    /// lightweight parsing of:
+    /// - Reference and alternate allele strings
+    /// - INFO (only passed through to <see cref="SnpEffAnnotation"/> for ANN-style annotations)
+    /// - FORMAT column and per-sample genotype fields
+    /// - Genotype (GT) tokens and Allelic Depth (AD) values
+    /// - Simple zygosity classification per sample
+    ///
+    /// Design goals:
+    /// - Fast, minimal allocation parsing for downstream proteomics / variant application.
+    /// - Tolerant of missing data ('.') without throwing.
+    /// - Avoids full VCF spec complexity (e.g., phased blocks, PL, GQ, allele remapping in multi-allelic normalization).
+    ///
+    /// Important assumptions / limitations:
+    /// 1. The input line MUST be tab-delimited. Literal "\t" sequences will NOT be interpreted as tabs.
+    /// 2. A valid VCF record is expected to contain at least the first 10 columns. If fewer are found, the constructor
+    ///    returns early and most properties remain null / empty.
+    /// 3. Only the ANN sub-field of INFO is parsed (via <see cref="SnpEffAnnotation"/>); all other INFO keys are ignored.
+    /// 4. FORMAT fields are assumed to be consistent across all samples; mismatched token counts throw.
+    /// 5. GT parsing:
+    ///    - Splits on '/' or '|' and removes the separators.
+    ///    - Missing alleles '.' are preserved in the parsed array.
+    ///    - Unsupported allele indexes (>3) are still accepted if they appear (so long as they are numeric) – current validation allows 0–3 and '.'.
+    /// 6. Zygosity rules:
+    ///    - Only non-missing (not ".") allele symbols are considered.
+    ///    - No called alleles ⇒ <see cref="Zygosity.Unknown"/>.
+    ///    - One distinct called allele ⇒ <see cref="Zygosity.Homozygous"/>.
+    ///    - More than one distinct called allele ⇒ <see cref="Zygosity.Heterozygous"/>.
+    /// 7. Backward compatibility booleans (<see cref="Homozygous"/> / <see cref="Heterozygous"/>) are derived from the zygosity classification
+    ///    and should be considered legacy conveniences. Prefer <see cref="ZygosityBySample"/>.
+    ///
+    /// Common usage pattern:
+    /// <code>
+    /// var vcf = new VariantCallFormat(vcLine);
+    /// foreach (var (sampleId, gt) in vcf.Genotypes)
+    /// {
+    ///     var z = vcf.ZygosityBySample[sampleId];
+    ///     var ad = vcf.AlleleDepths[sampleId];
+    /// }
+    /// </code>
+    /// </summary>
     public class VariantCallFormat
     {
+        /// <summary>
+        /// Zygosity classification per sample, derived ONLY from called (non-missing) allele symbols.
+        /// Missing-only genotype (e.g., "./.") ⇒ Unknown.
+        /// </summary>
+        public enum Zygosity { Unknown, Homozygous, Heterozygous }
+
+        /// <summary>
+        /// True when the provided line was truncated (< 10 VCF columns). In this case:
+        /// - ReferenceAlleleString / AlternateAlleleString are null
+        /// - AlleleIndex = -1
+        /// - Info is a safe empty annotation (never null)
+        /// - Format is an empty string
+        /// - Genotypes / AlleleDepths / zygosity maps are empty
+        /// </summary>
+        public bool IsTruncated { get; }
+        /// <summary>
+        /// Original raw VCF line.
+        /// </summary>
+        public string Description { get; }
+
+        /// <summary>
+        /// REF allele text (may be null if constructor aborted).
+        /// </summary>
+        public string? ReferenceAlleleString { get; }
+
+        /// <summary>
+        /// ALT allele(s) comma-delimited (may be null if constructor aborted).
+        /// </summary>
+        public string? AlternateAlleleString { get; }
+
+        /// <summary>
+        /// Parsed snpEff-style annotation (ANN=*). All other INFO keys are ignored.
+        /// </summary>
+        public SnpEffAnnotation Info { get; }
+
+        /// <summary>
+        /// FORMAT column descriptor (e.g., "GT:AD:DP"). Used to parse sample columns.
+        /// </summary>
+        public string Format { get; }
+
+        /// <summary>
+        /// Per-sample genotype token arrays (GT split on '/' or '|').
+        /// Keys are zero-based sample indices as strings ("0", "1", ...).
+        /// </summary>
+        public Dictionary<string, string[]> Genotypes { get; } = new();
+
+        /// <summary>
+        /// Per-sample AD (allele depth) string arrays (the raw comma-separated numeric tokens, excluding empty entries).
+        /// Missing or invalid AD yields an empty array.
+        /// </summary>
+        public Dictionary<string, string[]> AlleleDepths { get; } = new();
+
+        /// <summary>
+        /// 1-based index of the allele referenced by ANN’s Allele (1..N for ALT, 0 for REF).
+        /// -1 if the annotation's allele is missing or not found in ALT list.
+        /// </summary>
+        public int AlleleIndex { get; }
+
+        /// <summary>
+        /// Legacy: per-sample boolean flags indicating homozygosity.
+        /// Prefer using <see cref="ZygosityBySample"/>.
+        /// </summary>
+        public Dictionary<string, bool> Homozygous { get; } = new();
+
+        /// <summary>
+        /// Legacy: per-sample boolean flags indicating heterozygosity.
+        /// Prefer using <see cref="ZygosityBySample"/>.
+        /// </summary>
+        public Dictionary<string, bool> Heterozygous { get; } = new();
+
+        /// <summary>
+        /// Per-sample zygosity classification derived from non-missing genotype alleles.
+        /// </summary>
+        public Dictionary<string, Zygosity> ZygosityBySample { get; } = new();
+
+        /// <summary>
+        /// Construct from a single, tab-delimited VCF record.
+        /// If fewer than 10 columns are present, parsing is aborted (object remains mostly unpopulated).
+        /// </summary>
+        /// <param name="description">Full raw VCF line (must contain actual tab characters).</param>
         public VariantCallFormat(string description)
         {
-            Description = description;
-            if (description == null)
+            if (description is null)
             {
+                Description = string.Empty;
+                ReferenceAlleleString = null;
+                AlternateAlleleString = null;
+                Info = new SnpEffAnnotation(string.Empty); // safe empty annotation
+                Format = string.Empty;
+                AlleleIndex = -1;
+                IsTruncated = true;
                 return;
             }
 
+            Description = description;
+
+            // Back-compat: if no real tabs are present but literal "\t" sequences are,
+            // normalize them to actual tabs for parsing only. Leave Description intact.
+            string parseLine = NormalizeTabsForParsing(description);
+
             // Parse description into
-            string[] vcfFields = description.Split(new[] { @"\t" }, StringSplitOptions.None);
-            if (vcfFields.Length < 10) { return; }
+            string[] vcfFields = parseLine.Split('\t');
+            if (vcfFields.Length < 10)
+            {
+                ReferenceAlleleString = null;
+                AlternateAlleleString = null;
+                Info = new SnpEffAnnotation(string.Empty); // safe empty annotation
+                Format = string.Empty;
+                AlleleIndex = -1;
+                IsTruncated = true;
+                return;
+            }
             ReferenceAlleleString = vcfFields[3];
             AlternateAlleleString = vcfFields[4];
             Info = new SnpEffAnnotation(vcfFields[7]);
-            AlleleIndex = Info.Allele == null ? -1 : AlternateAlleleString.Split(',').ToList().IndexOf(Info.Allele) + 1; // reference is zero
+
+            AlleleIndex = Info.Allele == null 
+                ? -1 
+                : AlternateAlleleString.Split(',').ToList().IndexOf(Info.Allele) + 1; // reference is zero
+
+            // Format column tokens describe how to split each sample column
             Format = vcfFields[8];
-            string[] genotypes = Enumerable.Range(9, vcfFields.Length - 9).Select(i => vcfFields[i]).ToArray();
+
+            // Collect raw sample genotype strings (columns 9+)
+            string[] genotypes = Enumerable
+                .Range(9, vcfFields.Length - 9)
+                .Select(i => vcfFields[i])
+                .ToArray();
 
             // loop through genotypes for this variant (e.g. tumor and normal)
+            // Parse each sample
             for (int individual = 0; individual < genotypes.Length; individual++)
             {
                 var genotypeFields = GenotypeDictionary(Format.Trim(), genotypes[individual].Trim());
 
-                // parse genotype
-                string[] gt = null;
-                if (genotypeFields.TryGetValue("GT", out string gtString)) { gt = gtString.Split('/'); }
-                if (gt == null) { continue; }
+                // GT: split on '/' or '|' – separators removed intentionally.
+                string[] gt = genotypeFields.TryGetValue("GT", out var gtString)
+                    ? gtString.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries)
+                    : Array.Empty<string>();
+
+                // Skip invalid or empty GT
+                if (gt.Length == 0 || !GTvaluesAreValid(gt))
+                {
+                    continue;
+                }
+
+                // AD: optional – may be missing or contain '.' tokens
+                int[] adDepths;
+                string[] ad = genotypeFields.TryGetValue("AD", out var adString) && TryParseAD(adString, out adDepths)
+                    ? adString.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
+                    : Array.Empty<string>();
+
+                string sampleKey = individual.ToString();
+                Genotypes.Add(sampleKey, gt);
+                AlleleDepths.Add(sampleKey, ad);
 
-                // parse allele depth (might be null, technically, but shouldn't be in most use cases)
-                string[] ad = null;
-                if (genotypeFields.TryGetValue("AD", out string adString)) { ad = adString.Split(','); }
+                // Zygosity classification: ignore '.' when counting distinct alleles
+                var calledAlleles = gt.Where(a => a != ".").ToArray();
+                Zygosity z;
+                if (calledAlleles.Length == 0)
+                {
+                    z = Zygosity.Unknown;
+                }
+                else
+                {
+                    int distinctCalled = calledAlleles.Distinct().Count();
+                    z = distinctCalled == 1 ? Zygosity.Homozygous : Zygosity.Heterozygous;
+                }
+                ZygosityBySample.Add(sampleKey, z);
 
-                Genotypes.Add(individual.ToString(), gt);
-                AlleleDepths.Add(individual.ToString(), ad);
-                Homozygous.Add(individual.ToString(), gt.Distinct().Count() == 1);
-                Heterozygous.Add(individual.ToString(), gt.Distinct().Count() > 1);
+                // Legacy boolean maps (retain for existing code paths)
+                Homozygous.Add(sampleKey, z == Zygosity.Homozygous);
+                Heterozygous.Add(sampleKey, z == Zygosity.Heterozygous);
             }
         }
 
-        public string Description { get; }
-        public string ReferenceAlleleString { get; }
-        public string AlternateAlleleString { get; }
-        public SnpEffAnnotation Info { get; }
-        public string Format { get; }
-        public Dictionary<string, bool> Homozygous { get; } = new Dictionary<string, bool>();
-        public Dictionary<string, bool> Heterozygous { get; } = new Dictionary<string, bool>();
-        public Dictionary<string, string[]> Genotypes { get; } = new Dictionary<string, string[]>();
-        public Dictionary<string, string[]> AlleleDepths { get; } = new Dictionary<string, string[]>();
-        public int AlleleIndex { get; }
+        private static string NormalizeTabsForParsing(string line)
+        {
+            // Fast path: already contains real tabs
+            if (line.IndexOf('\t') >= 0) return line;
+
+            // Replace literal "\t" sequences for parsing only
+            return line.IndexOf("\\t", StringComparison.Ordinal) >= 0
+                ? line.Replace("\\t", "\t")
+                : line;
+        }
 
         /// <summary>
         /// Returns original string for the description
@@ -64,24 +243,28 @@ public override string ToString()
         {
             return Description;
         }
-
+        /// <summary>
+        /// Equality is based solely on the original description string.
+        /// </summary>
         public override bool Equals(object obj)
         {
             VariantCallFormat s = obj as VariantCallFormat;
             return s != null && s.Description == Description;
         }
-
+        /// <summary>
+        /// Hash code is derived from the original description (null-safe).
+        /// </summary>
         public override int GetHashCode()
         {
             return (Description ?? "").GetHashCode();
         }
 
         /// <summary>
-        /// Gets a dictionary of the format (key) and fields (value) for a genotype
+        /// Build a dictionary mapping FORMAT keys (e.g., GT, AD, DP) to the corresponding colon-delimited
+        /// values from a single sample column. Throws if token counts differ.
         /// </summary>
-        /// <param name="format"></param>
-        /// <param name="genotype"></param>
-        /// <returns></returns>
+        /// <param name="format">FORMAT column (e.g., "GT:AD:DP").</param>
+        /// <param name="genotype">Sample column (e.g., "0/1:12,8:20").</param>
         internal static Dictionary<string, string> GenotypeDictionary(string format, string genotype)
         {
             Dictionary<string, string> genotypeDict = new Dictionary<string, string>();
@@ -93,5 +276,82 @@ internal static Dictionary<string, string> GenotypeDictionary(string format, str
             }
             return Enumerable.Range(0, formatSplit.Length).ToDictionary(x => formatSplit[x], x => genotypeSplit[x]);
         }
+
+        /// <summary>
+        /// Validate that all genotype tokens are drawn from the accepted set {0,1,2,3,.}.
+        /// This is intentionally minimal; higher ALT indexes or symbolic alleles are not fully enforced here.
+        /// </summary>
+        public bool GTvaluesAreValid(string[] gt)
+        {
+            string[] validValues = { "0", "1", "2", "3", "." };
+            return ValidationHelpers.TryValidateValues(gt.ToList(), validValues, out _);
+        }
+
+        /// <summary>
+        /// Validate AD tokens: each must be "." or a non-negative integer.
+        /// Empty AD arrays are considered invalid (if AD is present it should have content or '.').
+        /// </summary>
+        public bool ADvaluesAreValid(string[] ad)
+        {
+            if (ad is null || ad.Length == 0) return false;
+            foreach (var token in ad)
+            {
+                var s = token?.Trim();
+                if (string.IsNullOrEmpty(s)) return false;
+                if (s == ".") continue;
+                if (!int.TryParse(s, out var n) || n < 0) return false;
+            }
+            return true;
+        }
+
+        /// <summary>
+        /// Attempt to parse AD into integer depths (excluding "." entries).
+        /// Returns false if validation fails. On success, 'depths' contains only numeric values.
+        /// </summary>
+        public bool TryParseAD(string adString, out int[] depths)
+        {
+            depths = Array.Empty<int>();
+            if (string.IsNullOrWhiteSpace(adString)) return false;
+
+            var parts = adString.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
+            if (!ADvaluesAreValid(parts)) return false;
+
+            depths = parts.Where(p => p != ".").Select(int.Parse).ToArray();
+            return true;
+        }
+
+        /// <summary>
+        /// Shared validation helper for small, fixed vocabularies of acceptable string tokens.
+        /// </summary>
+        public static class ValidationHelpers
+        {
+            /// <summary>
+            /// Returns true if all non-null, normalized values belong to the allowed set.
+            /// Produces a distinct list of invalid tokens (if any).
+            /// </summary>
+            public static bool TryValidateValues(
+                IEnumerable<string?> values,
+                IEnumerable<string> allowedValues,
+                out string[] invalid,
+                bool ignoreCase = true,
+                bool trim = true)
+            {
+                var comparer = ignoreCase ? StringComparer.OrdinalIgnoreCase : StringComparer.Ordinal;
+                var allowed = new HashSet<string>(allowedValues, comparer);
+
+                IEnumerable<string> Normalize(IEnumerable<string?> seq) =>
+                    seq
+                        .Where(v => v is not null)
+                        .Select(v => trim ? v!.Trim() : v!)
+                        .Where(v => v.Length > 0);
+
+                var normalized = Normalize(values);
+                invalid = normalized
+                    .Where(v => !allowed.Contains(v))
+                    .Distinct(comparer)
+                    .ToArray();
+                return invalid.Length == 0;
+            }
+        }
     }
 }
\ No newline at end of file

From ed23e208e0aa400ad773eebf10b6ad41899c24e3 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Wed, 22 Oct 2025 15:51:29 -0500
Subject: [PATCH 03/38] undo accidental change of descripton to call format

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs   | 10 +++---
 mzLib/Omics/BioPolymer/VariantApplication.cs  | 32 +++++++++----------
 mzLib/Omics/IBioPolymerWithSetMods.cs         |  2 +-
 mzLib/Proteomics/Protein/DisulfideBond.cs     |  2 +-
 .../MsFraggerPeptide.cs                       |  2 +-
 .../MsFraggerProtein.cs                       |  2 +-
 .../IndividualResultRecords/MsFraggerPsm.cs   |  2 +-
 .../SpectrumMatchFromTsvHeader.cs             |  4 +--
 mzLib/Readers/Thermo/ThermoRawFileReader.cs   |  2 +-
 mzLib/Test/DatabaseTests/TestProteinReader.cs |  6 ++--
 .../DatabaseTests/TestProteomicsReadWrite.cs  |  8 ++---
 .../Test/DatabaseTests/TestVariantProtein.cs  |  4 +--
 mzLib/Test/FlashLFQ/TestIsoTracker.cs         | 32 +++++++++----------
 mzLib/Test/Transcriptomics/TestDbLoader.cs    |  2 +-
 .../DecoyGeneration/DecoyProteinGenerator.cs  | 16 +++++-----
 .../DecoyGeneration/RnaDecoyGenerator.cs      |  4 +--
 .../ProteinDbWriter.cs                        |  6 ++--
 .../Transcriptomics/RnaDbLoader.cs            |  2 +-
 18 files changed, 69 insertions(+), 69 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index d48df14b0..c3c4ef9e2 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -20,7 +20,7 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
             OneBasedEndPosition = oneBasedEndPosition;
             OriginalSequence = originalSequence ?? "";
             VariantSequence = variantSequence ?? "";
-            CallFormat = new VariantCallFormat(description);
+            Description = new VariantCallFormat(description);
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
@@ -58,9 +58,9 @@ public SequenceVariation(int oneBasedPosition, string originalSequence, string v
         public string VariantSequence { get; }
 
         /// <summary>
-        /// CallFormat of this variation (optional)
+        /// Description of this variation (optional)
         /// </summary>
-        public VariantCallFormat CallFormat { get; }
+        public VariantCallFormat Description { get; }
 
         /// <summary>
         /// Modifications specifically for this variant
@@ -75,7 +75,7 @@ public override bool Equals(object obj)
                 && OneBasedEndPosition == s.OneBasedEndPosition
                 && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence))
                 && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence))
-                && (s.CallFormat == null && CallFormat == null || CallFormat.Equals(s.CallFormat))
+                && (s.Description == null && Description == null || Description.Equals(s.Description))
                 && (s.OneBasedModifications == null && OneBasedModifications == null ||
                     s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList())
                     && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
@@ -87,7 +87,7 @@ public override int GetHashCode()
                 ^ OneBasedEndPosition.GetHashCode()
                 ^ OriginalSequence.GetHashCode() // null handled in constructor
                 ^ VariantSequence.GetHashCode() // null handled in constructor
-                ^ CallFormat.GetHashCode(); // always constructed in constructor
+                ^ Description.GetHashCode(); // always constructed in constructor
         }
 
         /// <summary>
diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index a9ee258f5..1669c3afe 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -25,7 +25,7 @@ public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this
         {
             protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
             protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
-            if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.CallFormat == null || v.CallFormat.Genotypes.Count == 0))
+            if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.Description == null || v.Description.Genotypes.Count == 0))
             {
                 // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines
                 return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList();
@@ -100,7 +100,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
             List<SequenceVariation> uniqueEffectsToApply = sequenceVariations
                 .GroupBy(v => v.SimpleString())
                 .Select(x => x.First())
-                .Where(v => v.CallFormat.Genotypes.Count > 0) // this is a VCF line
+                .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line
                 .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first
                 .ToList();
 
@@ -112,7 +112,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 return new List<TBioPolymerType> { proteinCopy };
             }
 
-            HashSet<string> individuals = new HashSet<string>(uniqueEffectsToApply.SelectMany(v => v.CallFormat.Genotypes.Keys));
+            HashSet<string> individuals = new HashSet<string>(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys));
             List<TBioPolymerType> variantProteins = new();
             List<TBioPolymerType> newVariantProteins = new();
             // loop through genotypes for each sample/individual (e.g. tumor and normal)
@@ -121,17 +121,17 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 newVariantProteins.Clear();
                 newVariantProteins.Add(proteinCopy);
 
-                bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.CallFormat.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
+                bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
                 foreach (var variant in uniqueEffectsToApply)
                 {
-                    bool variantAlleleIsInTheGenotype = variant.CallFormat.Genotypes[individual].Contains(variant.CallFormat.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff
+                    bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff
                     if (!variantAlleleIsInTheGenotype)
                     {
                         continue;
                     }
-                    bool isHomozygousAlternate = variant.CallFormat.Homozygous[individual] && variant.CallFormat.Genotypes[individual].All(d => d == variant.CallFormat.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now.
-                    bool isDeepReferenceAllele = int.TryParse(variant.CallFormat.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
-                    bool isDeepAlternateAllele = int.TryParse(variant.CallFormat.AlleleDepths[individual][variant.CallFormat.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
+                    bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now.
+                    bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
+                    bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
 
                     // homozygous alternate
                     if (isHomozygousAlternate && isDeepAlternateAllele)
@@ -141,7 +141,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
 
                     // heterozygous basic
                     // first protein with variants contains all homozygous variation, second contains all variations
-                    else if (variant.CallFormat.Heterozygous[individual] && tooManyHeterozygousVariants)
+                    else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants)
                     {
                         if (isDeepAlternateAllele && isDeepReferenceAllele)
                         {
@@ -170,7 +170,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                     }
 
                     // heterozygous combinitorics
-                    else if (variant.CallFormat.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
+                    else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
                     {
                         List<TBioPolymerType> combinitoricProteins = new();
 
@@ -179,7 +179,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                             if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele)
                             {
                                 // keep reference allele
-                                if (variant.CallFormat.Genotypes[individual].Contains("0"))
+                                if (variant.Description.Genotypes[individual].Contains("0"))
                                 {
                                     combinitoricProteins.Add(ppp);
                                 }
@@ -191,7 +191,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                             {
                                 combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
                             }
-                            else if (variant.CallFormat.Genotypes[individual].Contains("0"))
+                            else if (variant.Description.Genotypes[individual].Contains("0"))
                             {
                                 combinitoricProteins.Add(ppp);
                             }
@@ -224,7 +224,7 @@ private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVaria
                 variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
                 variantGettingApplied.OriginalSequence,
                 variantGettingApplied.VariantSequence,
-                variantGettingApplied.CallFormat.Description,
+                variantGettingApplied.Description.Description,
                 variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
 
             // check to see if there is incomplete indel overlap, which would lead to weird variant sequences
@@ -271,7 +271,7 @@ private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVa
 
                 // variant was entirely before the one being applied (shouldn't happen because of order of applying variants)
                 // or it's the current variation
-                if (v.CallFormat.Equals(variantGettingApplied.CallFormat) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
+                if (v.Description.Equals(variantGettingApplied.Description) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
                 {
                     variations.Add(v);
                 }
@@ -299,7 +299,7 @@ private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVa
                         end,
                         v.OriginalSequence,
                         v.VariantSequence,
-                        v.CallFormat.Description,
+                        v.Description.Description,
                         v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
                 }
             }
@@ -425,7 +425,7 @@ private static string CombineSimpleStrings(IEnumerable<SequenceVariation>? varia
         /// </summary>
         public static string CombineDescriptions(IEnumerable<SequenceVariation>? variations)
         {
-            return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.CallFormat));
+            return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.Description));
         }
         /// <summary>
         /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object,
diff --git a/mzLib/Omics/IBioPolymerWithSetMods.cs b/mzLib/Omics/IBioPolymerWithSetMods.cs
index f065238a4..e475a3b5d 100644
--- a/mzLib/Omics/IBioPolymerWithSetMods.cs
+++ b/mzLib/Omics/IBioPolymerWithSetMods.cs
@@ -25,7 +25,7 @@ public interface IBioPolymerWithSetMods : IHasChemicalFormula, IEquatable<IBioPo
         int OneBasedEndResidue { get; }
         int MissedCleavages { get; }
         /// <summary>
-        /// CallFormat of where the BioPolymerWithSetMods originated from examples include
+        /// Description of where the BioPolymerWithSetMods originated from examples include
         /// Top-down truncation: full-length proteoform C-terminal digestion truncation
         /// Top-down truncation: DECOY full-length proteoform N-terminal digestion truncation
         /// Bottom-up search: full
diff --git a/mzLib/Proteomics/Protein/DisulfideBond.cs b/mzLib/Proteomics/Protein/DisulfideBond.cs
index a338957e8..6bd4fa1af 100644
--- a/mzLib/Proteomics/Protein/DisulfideBond.cs
+++ b/mzLib/Proteomics/Protein/DisulfideBond.cs
@@ -25,7 +25,7 @@ public DisulfideBond(int OneBasedPosition, string Description)
         public int OneBasedEndPosition { get; set; }
 
         /// <summary>
-        /// CallFormat of this variation (optional)
+        /// Description of this variation (optional)
         /// </summary>
         public string Description { get; set; }
 
diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs
index d7eac04ca..32def7c09 100644
--- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs
+++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs
@@ -79,7 +79,7 @@ public string ProteinName
         [Name("Gene")]
         public string Gene { get; set; }
 
-        [Name("Protein CallFormat")]
+        [Name("Protein Description")]
         public string ProteinDescription { get; set; }
 
         [Name("Mapped Genes")]
diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs
index fea95ac3b..37c043590 100644
--- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs
+++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs
@@ -38,7 +38,7 @@ public class MsFraggerProtein
         [Name("Organism")]
         public string Organism { get; set; }
 
-        [Name("Protein CallFormat", "CallFormat")]
+        [Name("Protein Description", "Description")]
         public string Description { get; set; }
 
         [Name("Protein Existence")]
diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs
index 8ee977e6e..e5272a779 100644
--- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs
+++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs
@@ -124,7 +124,7 @@ public class MsFraggerPsm : IQuantifiableRecord
         [Name("Gene")]
         public string Gene { get; set; }
 
-        [Name("Protein CallFormat")]
+        [Name("Protein Description")]
         public string ProteinDescription { get; set; }
 
         [Name("Mapped Genes")]
diff --git a/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs b/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs
index d71eac995..7ca469da5 100644
--- a/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs
+++ b/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs
@@ -46,7 +46,7 @@ public class SpectrumMatchFromTsvHeader
         public const string SpliceSites = "Splice Sites";
         public const string Contaminant = "Contaminant";
         public const string Decoy = "Decoy";
-        public const string Description = "CallFormat";
+        public const string Description = "Description";
         public const string StartAndEndResiduesInFullSequence = "Start and End Residues In Full Sequence";
         public const string PreviousResidue = "Previous Residue";
         public const string NextResidue = "Next Residue";
@@ -78,7 +78,7 @@ public class SpectrumMatchFromTsvHeader
         public const string PeptideMonoMass = "Peptide Monoisotopic Mass";
         public const string ProteinAccession = "Protein Accession";
         public const string ProteinName = "Protein Name";
-        public const string PeptideDescription = "Peptide CallFormat";
+        public const string PeptideDescription = "Peptide Description";
         public const string StartAndEndResiduesInProtein = "Start and End Residues In Protein";
         public const string PreviousAminoAcid = "Previous Amino Acid";
         public const string NextAminoAcid = "Next Amino Acid";
diff --git a/mzLib/Readers/Thermo/ThermoRawFileReader.cs b/mzLib/Readers/Thermo/ThermoRawFileReader.cs
index 7a61a0639..7ed5456ea 100644
--- a/mzLib/Readers/Thermo/ThermoRawFileReader.cs
+++ b/mzLib/Readers/Thermo/ThermoRawFileReader.cs
@@ -306,7 +306,7 @@ private static MsDataScan GetOneBasedScan(IRawDataPlus rawFile, IFilteringParams
                     HcdEnergy = values[i];
                 }
 
-                if (labels[i].StartsWith("Scan CallFormat", StringComparison.Ordinal))
+                if (labels[i].StartsWith("Scan Description", StringComparison.Ordinal))
                 {
                     scanDescript = values[i].TrimEnd();
                 }
diff --git a/mzLib/Test/DatabaseTests/TestProteinReader.cs b/mzLib/Test/DatabaseTests/TestProteinReader.cs
index cbfc4c603..7dcd0b4d8 100644
--- a/mzLib/Test/DatabaseTests/TestProteinReader.cs
+++ b/mzLib/Test/DatabaseTests/TestProteinReader.cs
@@ -125,7 +125,7 @@ public static void XmlTest()
             Assert.AreEqual(64, ok[0].SequenceVariations.First().OneBasedEndPosition);
             Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreNotEqual(ok[0].SequenceVariations.First().CallFormat, ok[1].SequenceVariations.First().CallFormat); //decoys and target variations don't have the same desc.
+            Assert.AreNotEqual(ok[0].SequenceVariations.First().Description, ok[1].SequenceVariations.First().Description); //decoys and target variations don't have the same desc.
             Assert.AreEqual("Homo sapiens", ok[1].Organism);
         }
 
@@ -420,8 +420,8 @@ public static void TestReverseDecoyXML_WithCustomIdentifier()
 
                 foreach (var variant in protein.AppliedSequenceVariations)
                 {
-                    Assert.That(variant.CallFormat, Does.StartWith("rev"));
-                    Assert.That(variant.CallFormat, Does.Not.StartWith("DECOY"));
+                    Assert.That(variant.Description, Does.StartWith("rev"));
+                    Assert.That(variant.Description, Does.Not.StartWith("DECOY"));
                 }
 
                 foreach (var bond in protein.DisulfideBonds)
diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
index 07acea46b..babe44a76 100644
--- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
+++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
@@ -504,12 +504,12 @@ public void TestFullProteinReadWrite()
             Assert.AreEqual(originalProtein.TruncationProducts.First().OneBasedEndPosition, proteinReadFromXml[0].TruncationProducts.First().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.TruncationProducts.First().Type, proteinReadFromXml[0].TruncationProducts.First().Type.Split('(')[0]);
 
-            Assert.AreEqual(originalProtein.SequenceVariations.First().CallFormat, proteinReadFromXml[0].SequenceVariations.First().CallFormat);
+            Assert.AreEqual(originalProtein.SequenceVariations.First().Description, proteinReadFromXml[0].SequenceVariations.First().Description);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OriginalSequence, proteinReadFromXml[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(originalProtein.SequenceVariations.First().VariantSequence, proteinReadFromXml[0].SequenceVariations.First().VariantSequence);
-            Assert.AreEqual(originalProtein.SequenceVariations.Last().CallFormat, proteinReadFromXml[0].SequenceVariations.Last().CallFormat);
+            Assert.AreEqual(originalProtein.SequenceVariations.Last().Description, proteinReadFromXml[0].SequenceVariations.Last().Description);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedBeginPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OriginalSequence, proteinReadFromXml[0].SequenceVariations.Last().OriginalSequence);
@@ -534,7 +534,7 @@ public void TestReadWriteSeqVars()
             Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count());
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreEqual(ok[0].SequenceVariations.First().CallFormat, ok2[0].SequenceVariations.First().CallFormat);
+            Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description);
             Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence);
         }
@@ -557,7 +557,7 @@ public void TestReadWriteSeqVars2()
             Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count());
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreEqual(ok[0].SequenceVariations.First().CallFormat, ok2[0].SequenceVariations.First().CallFormat);
+            Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description);
             Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence);
         }
diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 27d545cfe..8ced7b208 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -115,7 +115,7 @@ public static void SeqVarXmlTest()
             {
                 Assert.AreEqual(s.OriginalSequence, decoy.BaseSequence.Substring(s.OneBasedBeginPosition - 1, s.OneBasedEndPosition - s.OneBasedBeginPosition + 1));
             }
-            Assert.AreNotEqual(target.SequenceVariations.First().CallFormat, decoy.SequenceVariations.First().CallFormat); //decoys and target variations don't have the same desc.
+            Assert.AreNotEqual(target.SequenceVariations.First().Description, decoy.SequenceVariations.First().Description); //decoys and target variations don't have the same desc.
 
             List<PeptideWithSetModifications> peptides = ok.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList();
         }
@@ -490,7 +490,7 @@ public void VariantSymbolWeirdnessXml()
             string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "SeqVarSymbolWeirdness.xml");
             List<Protein> variantProteins = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un);
             Assert.AreEqual(12, variantProteins.First().ConsensusVariant.SequenceVariations.Count());
-            Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.CallFormat.Heterozygous.Any(kv => kv.Value)));
+            Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.Description.Heterozygous.Any(kv => kv.Value)));
 
             Assert.AreEqual(1, variantProteins.Count); // Should be 2^2 from combinitorics of heterozygous, but the giant indels overwrite them
             Assert.AreEqual(0, variantProteins.Where(v => v.BaseSequence == variantProteins.First().ConsensusVariant.BaseSequence).Count()); // Homozygous variations are included
diff --git a/mzLib/Test/FlashLFQ/TestIsoTracker.cs b/mzLib/Test/FlashLFQ/TestIsoTracker.cs
index 562126ec8..59720e4ff 100644
--- a/mzLib/Test/FlashLFQ/TestIsoTracker.cs
+++ b/mzLib/Test/FlashLFQ/TestIsoTracker.cs
@@ -25,7 +25,7 @@ internal class TestIsoTracker
         [Test]
         public static void TestIsobaricPeptideGroup()
         {
-            // CallFormat: Test the IsobaricPeptideGroup class
+            // Description: Test the IsobaricPeptideGroup class
             // In this testing, we will create a new IsobaricPeptideGroup and check the properties
             List<Identification> ids = new List<Identification>
             {
@@ -126,7 +126,7 @@ public static void TestIsoTrackerIdFilter_FilterPeptide()
         [Test]
         public static void TestGetTargeMz_case1()
         {
-            // CallFormat: Test the GetTargetMz function in FlashLfqEngine
+            // Description: Test the GetTargetMz function in FlashLfqEngine
             // In this testing, we will check the isobaricPeptideGroup and targetMzs output
             // All three ids are isobaric peptides with the same monoisotopic mass, so they should be grouped together and generate only 5 target m/z values
 
@@ -189,7 +189,7 @@ public static void TestGetTargeMz_case1()
         [Test]
         public static void TestGetTargeMz_case2()
         {
-            // CallFormat: Test the GetTargetMz function in FlashLfqEngine
+            // Description: Test the GetTargetMz function in FlashLfqEngine
             // In this testing, we will check the isobaricPeptideGroup and targetMzs output
             // All three ids are isobaric peptides with the different monoisotopic mass, so they should be grouped together and generate 15(3*5) target m/z values
 
@@ -253,7 +253,7 @@ public static void TestGetTargeMz_case2()
         [Test]
         public static void TestIndexPeakPrune()
         {
-            // CallFormat: Test the peak indexing engine pruning function
+            // Description: Test the peak indexing engine pruning function
             // In this test, we will create the targetMzs from the ids to prune the indexPeaks.
             // After pruning, the index engine should only keep the peaks with the target m/z values.
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -329,7 +329,7 @@ public static void TestXICConstructor()
         [Test]
         public static void TestLinearSpline()
         {
-            //CallFormat: Test the linear spline interpolation and differentiation
+            //Description: Test the linear spline interpolation and differentiation
             //The testing model is a linear function y = 100x, where x is the time point and y is the intensity
             //The slope will be 100 and the second derivative will be 0
 
@@ -361,7 +361,7 @@ public static void TestLinearSpline()
         [Test]
         public static void TestPeakAlignment()
         {
-            //CallFormat: Test the peak alignment function
+            //Description: Test the peak alignment function
             //The testing model is a triangle peak with the Apex.
             //The Apex of three peaks are 3, 3.1, 2.9 min
             //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3
@@ -410,7 +410,7 @@ public static void TestPeakAlignment()
         [Test]
         public static void TestBuildSmoothedCubicSpline_LessPoint()
         {
-            //CallFormat: Test the cubic spline interpolation
+            //Description: Test the cubic spline interpolation
             //The testing model has less than 5 points that cannot build the cubic spline
             //The cubic spline should be null
 
@@ -531,7 +531,7 @@ public static void TestXICGroupConstructor()
         [Test]
         public static void TestXICGroup_RtDict()
         {
-            //CallFormat: Test the peakAlignment function in the XICGroup
+            //Description: Test the peakAlignment function in the XICGroup
             //The testing has three normal distribution XIC peaks.
             //The Apex of three peaks are 3, 3.1, 2.9 min
             //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3
@@ -582,7 +582,7 @@ public static void TestXICGroup_RtDict()
         [Test]
         public static void TestXICGroup_IdList()
         {
-            //CallFormat: Test the IdList in the XICGroup
+            //Description: Test the IdList in the XICGroup
             //The testing model has three XICs, one of this XIC has no Id, then it borrows one Id from the first XIC
             //If the Id is borrowed, the Id will not be added into the IdList
             //The IdList should contain the Ids from the first and the third XIC
@@ -617,7 +617,7 @@ public static void TestXICGroup_IdList()
         [Test]
         public static void TestXICGroup_Tracking()
         {
-            //CallFormat: Test the peak tracking function in the XICGroup
+            //Description: Test the peak tracking function in the XICGroup
             //The testing has three normal distribution XIC peaks.
             //The Apex of three peaks are 20, 23, 17 min
             //The time shift should be +3 min for the peak2 and -3 min for the peak3
@@ -688,7 +688,7 @@ public static void TestXICGroup_Tracking()
         [Test]
         public static void TestCombinedSearching()
         {
-            //CallFormat: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
+            //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
             //There are three XIC included isobaric peaks that with 3 min gap.
 
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -796,7 +796,7 @@ public static void TestCombinedSearching()
         [Test]
         public static void TestPeakOutput()
         {
-            //CallFormat: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
+            //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
             //There are three XIC included isobaric peaks that with 3 min gap.
 
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -931,7 +931,7 @@ public static void TestPeakOutput()
         [Test]
         public static void TestIsoSequence_Ambiguous()
         {
-            //CallFormat: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
+            //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
             //IsoID: DIVENY[Common Variable:Oxidation on M]FMR   should be the same as DIVENYFM[Common Variable:Oxidation on M]R
 
             //Try to turn on the MBR and Isotracker at the same time
@@ -1066,7 +1066,7 @@ public static void TestIsoSequence_Ambiguous()
         [Test]
         public static void TestIsoSequence_MonoIsotopicMassTolerance()
         {
-            //CallFormat: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
+            //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
             //IsoID: DIVENY[Common Variable:Oxidation on M]FMR   should be the same as DIVENYFM[Common Variable:Oxidation on M]R
             //The Monoisotopic mass are 1201.5436, 1201.5437, 1201.5438, they should be recognized as the same IsoID
 
@@ -1491,7 +1491,7 @@ public static void TestIsoSequence_CombinedTesting()
         [Test]
         public static void TestRun_SearchingTarget()
         {
-            //CallFormat: we will upload a motifList for IsoTracker
+            //Description: we will upload a motifList for IsoTracker
             //Only peptide with motif on N can be searched
             //In this case, only one kind of peptide can be searched: baseSequence PEPNINEN -> PEPN[Mod]INEN, PEPNIN[Mod]EN, PEPNINEN[Mod]
             // Run 1 with PEPNIN[Mod]EN, PEPNINEN[Mod]
@@ -1615,7 +1615,7 @@ public static void TestRun_SearchingTarget()
         [Test]
         public static void TestRun_IDChecking()
         {
-            //CallFormat: we will turn on the IDchecking for IsoTracker
+            //Description: we will turn on the IDchecking for IsoTracker
             //Only when one XIC with more than one id, we do the searching
             //In this case, run 1 has 4 ids (pepA_1, pepA_2, pepB_1, pepC_1)
             //run 2 has 3 ids (pepA_1, pepB_1, pepC_1)
diff --git a/mzLib/Test/Transcriptomics/TestDbLoader.cs b/mzLib/Test/Transcriptomics/TestDbLoader.cs
index cea7bb7c7..db5de6417 100644
--- a/mzLib/Test/Transcriptomics/TestDbLoader.cs
+++ b/mzLib/Test/Transcriptomics/TestDbLoader.cs
@@ -291,7 +291,7 @@ public static void TestEnsemblFastaParsing()
             Assert.That(fields["GeneBiotype"], Is.EqualTo("snRNA"));
             Assert.That(fields["TranscriptBiotype"], Is.EqualTo("snRNA"));
             Assert.That(fields["GeneSymbol"], Is.EqualTo("U6"));
-            Assert.That(fields["CallFormat"], Does.Contain("U6 spliceosomal RNA"));
+            Assert.That(fields["Description"], Does.Contain("U6 spliceosomal RNA"));
 
         }
 
diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
index 949cf68a3..c83c80aa1 100644
--- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
+++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
@@ -239,34 +239,34 @@ private static List<SequenceVariation> ReverseSequenceVariations(IEnumerable<Seq
                     decoyVariations.Add(new SequenceVariation(sv.OneBasedBeginPosition,
                         reversedSequence.Substring(sv.OneBasedBeginPosition - 1, sv.OneBasedEndPosition - sv.OneBasedBeginPosition + 1),
                         new string(variationArray).Substring(1, variationArray.Length - 1) + variationArray[0],
-                        $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
+                        $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
                 }
                 // start loss, so the variant is at the end
                 else if (startLoss)
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, new string(originalArray).Substring(0, originalArray.Length - 1), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, new string(originalArray).Substring(0, originalArray.Length - 1), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
                 }
                 // both start with M, but there's more
                 else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1 && (sv.OriginalSequence.Length > 1 || sv.VariantSequence.Length > 1))
                 {
                     string original = new string(originalArray).Substring(0, originalArray.Length - 1);
                     string variant = new string(variationArray).Substring(0, variationArray.Length - 1);
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
                 }
                 // gained an initiating methionine
                 else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1)
                 {
-                    decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
                 }
                 // starting methionine, but no variations on it
                 else if (startsWithM)
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
                 }
                 // no starting methionine
                 else
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
                 }
             }
             return decoyVariations;
@@ -335,7 +335,7 @@ private static List<Protein> GenerateSlideDecoys(List<Protein> proteins, int max
                         {
                             variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, true)];
                         }
-                        decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.CallFormat));
+                        decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.Description));
                     }
                     else
                     {
@@ -352,7 +352,7 @@ private static List<Protein> GenerateSlideDecoys(List<Protein> proteins, int max
                             variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, initMet)];
                         }
 
-                        decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.CallFormat));
+                        decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.Description));
                     }
                 }
                 var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide,
diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
index 731f80bc7..cc7723c15 100644
--- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
+++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
@@ -87,7 +87,7 @@ private static List<T> GenerateReverseDecoys<T>(List<T> nucleicAcids, int maxThr
                             var reverseModKey = indexMapping[modKvp.Key];
                             reverseModificationsForVariation.Add(reverseModKey, modKvp.Value);
                         }
-                        reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.CallFormat.Description, reverseModificationsForVariation));
+                        reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation));
                     }
 
                     // Reverse Applied Variants
@@ -101,7 +101,7 @@ private static List<T> GenerateReverseDecoys<T>(List<T> nucleicAcids, int maxThr
                             var reverseModKey = indexMapping[modKvp.Key];
                             reverseModificationsForVariation.Add(reverseModKey, modKvp.Value);
                         }
-                        reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.CallFormat.Description, reverseModificationsForVariation));
+                        reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation));
                     }
 
                     // Reverse Truncations
diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
index 2cff6c0af..24ff44dd4 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
@@ -206,7 +206,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     {
                         writer.WriteStartElement("feature");
                         writer.WriteAttributeString("type", "sequence variant");
-                        writer.WriteAttributeString("description", hm.CallFormat.ToString());
+                        writer.WriteAttributeString("description", hm.Description.ToString());
                         writer.WriteStartElement("original");
                         writer.WriteString(hm.OriginalSequence);
                         writer.WriteEndElement(); // original
@@ -252,7 +252,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     //{
                     //    writer.WriteStartElement("feature");
                     //    writer.WriteAttributeString("type", "splice site");
-                    //    writer.WriteAttributeString("description", hm.CallFormat);
+                    //    writer.WriteAttributeString("description", hm.Description);
                     //    writer.WriteStartElement("location");
                     //    if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition)
                     //    {
@@ -464,7 +464,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     {
                         writer.WriteStartElement("feature");
                         writer.WriteAttributeString("type", "sequence variant");
-                        writer.WriteAttributeString("description", hm.CallFormat.ToString());
+                        writer.WriteAttributeString("description", hm.Description.ToString());
                         writer.WriteStartElement("original");
                         writer.WriteString(hm.OriginalSequence);
                         writer.WriteEndElement(); // original
diff --git a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs
index 36742bf8d..b80827568 100644
--- a/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs
+++ b/mzLib/UsefulProteomicsDatabases/Transcriptomics/RnaDbLoader.cs
@@ -89,7 +89,7 @@ public static RnaFastaHeaderType DetectRnaFastaHeaderType(string line)
                 { "GeneBiotype", new FastaHeaderFieldRegex("GeneBiotype", @"gene_biotype:([^\s]+)", 0, 1) },
                 { "TranscriptBiotype", new FastaHeaderFieldRegex("TranscriptBiotype", @"transcript_biotype:([^\s]+)", 0, 1) },
                 { "GeneSymbol", new FastaHeaderFieldRegex("GeneSymbol", @"gene_symbol:([^\s]+)", 0, 1) },
-                { "CallFormat", new FastaHeaderFieldRegex("CallFormat", @"description:([^\[]+)", 0, 1) },
+                { "Description", new FastaHeaderFieldRegex("Description", @"description:([^\[]+)", 0, 1) },
             };
 
         public static readonly Dictionary<string, FastaHeaderFieldRegex> NcbiAssemblyFieldRegexes =

From 1f5587bce81b3ff3b07d4bcbc8dd6d6ea610749e Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Wed, 22 Oct 2025 15:55:29 -0500
Subject: [PATCH 04/38] more vcf tests

---
 mzLib/Test/DatabaseTests/VariantTests.cs | 123 +++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 mzLib/Test/DatabaseTests/VariantTests.cs

diff --git a/mzLib/Test/DatabaseTests/VariantTests.cs b/mzLib/Test/DatabaseTests/VariantTests.cs
new file mode 100644
index 000000000..ebaccf3bf
--- /dev/null
+++ b/mzLib/Test/DatabaseTests/VariantTests.cs
@@ -0,0 +1,123 @@
+﻿using NUnit.Framework;
+using Omics.BioPolymer;
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace Test.DatabaseTests
+{
+    [TestFixture]
+    [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage]
+    public class VariantCallFormatTests
+    {
+        [Test]
+        public void ParseComprehensiveVcfExamples()
+        {
+            string current = TestContext.CurrentContext.TestDirectory;
+            string vcfPath = null;
+            while (current != null)
+            {
+                var candidate = Path.Combine(current, "Test", "DatabaseTests", "vcf_comprehensive_examples.vcf");
+                if (File.Exists(candidate))
+                {
+                    vcfPath = candidate;
+                    break;
+                }
+                current = Directory.GetParent(current)?.FullName;
+            }
+
+            Assert.That(vcfPath, Is.Not.Null, "Could not locate vcf_comprehensive_examples.vcf");
+
+            var lines = File.ReadAllLines(vcfPath);
+
+            var dataRows = lines
+                .Where(l => !string.IsNullOrWhiteSpace(l))
+                .Where(l => !l.StartsWith("##"))
+                .Where(l => !l.StartsWith("#CHROM"))
+                .ToList();
+
+            Assert.That(dataRows.Count, Is.EqualTo(8), "Expected 8 example variant rows.");
+
+            for (int rowIndex = 0; rowIndex < dataRows.Count; rowIndex++)
+            {
+                string originalLine = dataRows[rowIndex];
+                string[] rawFields = originalLine.Split('\t');
+                Assert.That(rawFields.Length, Is.GreaterThanOrEqualTo(10), $"Row {rowIndex + 1}: insufficient columns.");
+
+                var vcf = new VariantCallFormat(originalLine);
+
+                Assert.That(vcf.Description, Is.EqualTo(originalLine));
+                Assert.That(vcf.ReferenceAlleleString, Is.EqualTo(rawFields[3]));
+                Assert.That(vcf.AlternateAlleleString, Is.EqualTo(rawFields[4]));
+                Assert.That(vcf.Format, Is.EqualTo(rawFields[8]));
+
+                if (rawFields[7] == ".")
+                {
+                    Assert.That(vcf.Info.Annotation, Is.EqualTo(rawFields[7]));
+                }
+
+                var sampleFields = rawFields.Skip(9).ToArray();
+                Assert.That(vcf.Genotypes.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.AlleleDepths.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.Homozygous.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.Heterozygous.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.ZygosityBySample.Count, Is.EqualTo(sampleFields.Length));
+
+                for (int sampleIndex = 0; sampleIndex < sampleFields.Length; sampleIndex++)
+                {
+                    string sample = sampleFields[sampleIndex];
+                    string key = sampleIndex.ToString();
+
+                    string[] parts = sample.Split(':');
+                    Assert.That(parts.Length, Is.EqualTo(vcf.Format.Split(':').Length));
+
+                    string gtPart = parts[0];
+                    string adPart = parts.Length > 1 ? parts[1] : null;
+
+                    // Expected GT tokens
+                    string[] expectedGtTokens = gtPart.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries);
+                    if (gtPart.Contains('.') && expectedGtTokens.Length == 1 &&
+                        (gtPart == "./." || gtPart == ".|." || gtPart == ".|1" || gtPart == "0|." || gtPart == "0/."))
+                    {
+                        expectedGtTokens = new[] { ".", "." };
+                    }
+
+                    Assert.That(vcf.Genotypes.ContainsKey(key));
+                    var parsedGt = vcf.Genotypes[key];
+                    Assert.That(parsedGt, Is.EqualTo(expectedGtTokens));
+
+                    // Expected AD tokens
+                    string[] expectedAdTokens =
+                        string.IsNullOrWhiteSpace(adPart) ? Array.Empty<string>() :
+                        adPart == "." ? new[] { "." } :
+                        adPart.Split(',');
+
+                    Assert.That(vcf.AlleleDepths.ContainsKey(key));
+                    var parsedAd = vcf.AlleleDepths[key] ?? Array.Empty<string>();
+                    if (!(parsedAd.Length == 0 && expectedAdTokens.Length == 1 && expectedAdTokens[0] == "."))
+                    {
+                        Assert.That(parsedAd, Is.EqualTo(expectedAdTokens));
+                    }
+
+                    // Expected zygosity using ONLY non-missing alleles (must mirror implementation)
+                    var calledAlleles = parsedGt.Where(a => a != ".").ToArray();
+                    bool expectedHom = calledAlleles.Length > 0 && calledAlleles.Distinct().Count() == 1;
+                    bool expectedHet = calledAlleles.Distinct().Count() > 1;
+                    VariantCallFormat.Zygosity expectedZ =
+                        calledAlleles.Length == 0
+                            ? VariantCallFormat.Zygosity.Unknown
+                            : expectedHet
+                                ? VariantCallFormat.Zygosity.Heterozygous
+                                : VariantCallFormat.Zygosity.Homozygous;
+
+                    Assert.That(vcf.Homozygous[key], Is.EqualTo(expectedHom));
+                    Assert.That(vcf.Heterozygous[key], Is.EqualTo(expectedHet));
+                    Assert.That(vcf.ZygosityBySample[key], Is.EqualTo(expectedZ));
+                }
+            }
+        }
+    }
+}

From 3e965982abee58366d1e753f8f2daed97f743dc1 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Wed, 22 Oct 2025 15:57:57 -0500
Subject: [PATCH 05/38] comprehensive vcf examples

---
 .../vcf_comprehensive_examples.vcf            | 30 +++++++++++++++++++
 mzLib/Test/Test.csproj                        |  3 ++
 2 files changed, 33 insertions(+)
 create mode 100644 mzLib/Test/DatabaseTests/vcf_comprehensive_examples.vcf

diff --git a/mzLib/Test/DatabaseTests/vcf_comprehensive_examples.vcf b/mzLib/Test/DatabaseTests/vcf_comprehensive_examples.vcf
new file mode 100644
index 000000000..ef7fd0698
--- /dev/null
+++ b/mzLib/Test/DatabaseTests/vcf_comprehensive_examples.vcf
@@ -0,0 +1,30 @@
+##fileformat=VCFv4.2
+##INFO=<ID=ANN,Number=.,Type=String,Description="Annotation">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read depth">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE1	SAMPLE2	SAMPLE3	SAMPLE4
+
+## Example 1: Basic SNP with common genotypes
+1	1000	.	A	G	.	PASS	.	GT:AD:DP	0/0:40,0:40	0/1:20,18:38	1/1:0,42:42	./.:0,0:0
+
+## Example 2: Multi-allelic site (REF=A, ALT=G,T)
+1	2000	.	A	G,T	.	PASS	.	GT:AD:DP	0/2:25,0,20:45	1/2:0,15,12:27	2/2:0,0,30:30	0/0:35,0,0:35
+
+## Example 3: Phased genotypes
+1	3000	.	C	T	.	PASS	.	GT:AD:DP	0|1:22,18:40	1|0:21,19:40	.|1:.:25	0|.:.:30
+
+## Example 4: Partial missing alleles
+1	4000	.	G	A	.	PASS	.	GT:AD:DP	0|.:12,0:12	.|1:0,8:8	./.:.:0	0/.:15,0:15
+
+## Example 5: Low coverage and uneven allele balance
+1	5000	.	T	C	.	PASS	.	GT:AD:DP	0/1:1,10:11	1/1:0,5:5	0/0:3,0:3	0/1:2,8:10
+
+## Example 6: Multi-allelic with three ALT alleles (REF=A, ALT=G,T,C)
+1	6000	.	A	G,T,C	.	PASS	.	GT:AD:DP	0/3:30,0,0,12:42	1/3:0,20,0,5:25	2/3:0,0,15,7:22	3/3:0,0,0,20:20
+
+## Example 7: Zero depth and missing data
+1	7000	.	C	G	.	PASS	.	GT:AD:DP	./.:0,0:0	0/0:0,0:0	0/1:.:.	./.:.:.
+
+## Example 8: High-depth site
+1	8000	.	G	A	.	PASS	.	GT:AD:DP	0/1:500,520:1020	1/1:0,1000:1000	0/0:950,0:950	0/1:480,500:980
diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj
index c2c444cf7..59c393b39 100644
--- a/mzLib/Test/Test.csproj
+++ b/mzLib/Test/Test.csproj
@@ -274,6 +274,9 @@
     <None Update="DatabaseTests\VariantModsGPTMD.xml">
       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
     </None>
+    <None Update="DatabaseTests\vcf_comprehensive_examples.vcf">
+      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
+    </None>
     <None Update="DatabaseTests\Working_86.tsv">
       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
     </None>

From c0e4433a03415b9d0d151ffeb45c8934aa15ef3c Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Wed, 22 Oct 2025 16:07:36 -0500
Subject: [PATCH 06/38] test invalid variant

---
 mzLib/Test/DatabaseTests/VariantTests.cs | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/mzLib/Test/DatabaseTests/VariantTests.cs b/mzLib/Test/DatabaseTests/VariantTests.cs
index ebaccf3bf..e738e82c2 100644
--- a/mzLib/Test/DatabaseTests/VariantTests.cs
+++ b/mzLib/Test/DatabaseTests/VariantTests.cs
@@ -1,5 +1,6 @@
 ﻿using NUnit.Framework;
 using Omics.BioPolymer;
+using Omics.Modifications;
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -119,5 +120,21 @@ public void ParseComprehensiveVcfExamples()
                 }
             }
         }
+        [Test]
+        public void Constructor_InvalidCoordinates_ThrowsArgumentException()
+        {
+            // Minimal valid VCF line (10 columns) so VariantCallFormat parses without truncation.
+            // Arrange: end < begin (invalid coordinates)
+            var sv = new SequenceVariation(
+                oneBasedBeginPosition: 5,
+                oneBasedEndPosition: 4,
+                originalSequence: "A",
+                variantSequence: "V",
+                description: "invalid-coords",
+                oneBasedModifications: null);
+
+            // Assert: SequenceVariation does not throw on construction; it reports invalid via AreValid()
+            Assert.That(sv.AreValid(), Is.False);
+        }
     }
 }

From 65c87293faab38576374045a297ba9d26db3f10e Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Wed, 22 Oct 2025 16:24:57 -0500
Subject: [PATCH 07/38] add line to proteindbloader

---
 mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs
index 143a9e327..a550bf422 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs
@@ -108,6 +108,12 @@ public static List<Protein> LoadProteinXML(string proteinDbLocation, bool genera
                             Protein newProtein = block.ParseEndElement(xml, modTypesToExclude, unknownModifications, isContaminant, proteinDbLocation);
                             if (newProtein != null)
                             {
+                                //If we have read any modifications that are nucleotide substitutions, convert them to sequence variants here:
+                                //newProtein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
+                                if (newProtein.OneBasedPossibleLocalizedModifications.Any(m => m.Value.Any(mt => mt.ModificationType.Contains("nucleotide substitution"))))
+                                {
+                                    newProtein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
+                                }
                                 if (addTruncations)
                                 {
                                     newProtein.AddTruncations();

From 302b80497c9f6997bc8919df676bf3fc8b7578a8 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Wed, 22 Oct 2025 16:42:03 -0500
Subject: [PATCH 08/38] unit tests

---
 mzLib/Test/DatabaseTests/VariantTests.cs   | 197 +++++++++++++++++++++
 mzLib/Test/Transcriptomics/TestDbLoader.cs |   1 -
 2 files changed, 197 insertions(+), 1 deletion(-)

diff --git a/mzLib/Test/DatabaseTests/VariantTests.cs b/mzLib/Test/DatabaseTests/VariantTests.cs
index e738e82c2..603ca8b47 100644
--- a/mzLib/Test/DatabaseTests/VariantTests.cs
+++ b/mzLib/Test/DatabaseTests/VariantTests.cs
@@ -1,6 +1,7 @@
 ﻿using NUnit.Framework;
 using Omics.BioPolymer;
 using Omics.Modifications;
+using Proteomics;
 using System;
 using System.Collections.Generic;
 using System.IO;
@@ -137,4 +138,200 @@ public void Constructor_InvalidCoordinates_ThrowsArgumentException()
             Assert.That(sv.AreValid(), Is.False);
         }
     }
+    [TestFixture]
+    public class VariantApplicationConvertNucleotideSubstitutionTests
+    {
+        // Helper to create a minimal substitution modification matching the required detection pattern
+        private static Modification Substitution(string idArrow)
+        {
+            // If you want this helper to be convertible by the code under test,
+            // give it a matching motif for the site where it will be placed.
+            // For now keep it generic (unused in this test).
+            return new Modification(
+                idArrow,                          // originalId
+                null,                             // accession
+                "1 nucleotide substitution",      // modificationType
+                null,                             // featureType
+                null,                             // target motif
+                "Anywhere.",                      // location restriction
+                null,                             // chemical formula
+                0,                                // monoisotopic mass
+                new Dictionary<string, IList<string>>(), // databaseReference
+                null,                             // taxonomicRange
+                null,                             // keywords
+                null,                             // neutralLosses
+                null,                             // diagnosticIons
+                null);                            // fileOrigin
+        }
+
+        // Non-substitution (should be ignored)
+        private static Modification Other(string id, double mass = 15.9949)
+        {
+            // Generic oxidation at P motif (unused by main test path)
+            ModificationMotif.TryGetMotif("P", out var motifP);
+            return new Modification(
+                id,
+                null,
+                "oxidation",
+                null,
+                motifP,
+                "Anywhere.",
+                null,
+                mass,
+                new Dictionary<string, IList<string>>(),
+                null,
+                null,
+                null,
+                null,
+                null);
+        }
+
+        // Malformed substitution (no "->" pattern) must be ignored
+        private static Modification Malformed()
+        {
+            ModificationMotif.TryGetMotif("Q", out var motifQ);
+            return new Modification(
+                "E>A",
+                null,
+                "1 nucleotide substitution",
+                null,
+                motifQ,
+                "Anywhere.",
+                null,
+                0,
+                new Dictionary<string, IList<string>>(),
+                null,
+                null,
+                null,
+                null,
+                null);
+        }
+
+        [Test]
+        public void ConvertNucleotideSubstitutionModificationsToSequenceVariants_Comprehensive()
+        {
+            // 1 M, 2 A, 3 E, 4 W, 5 P, 6 Q, 7 K
+            var protein = new Protein("MAEWPQK", "TEST_PROT");
+
+            static Modification MakeSub(string idArrow, char originalResidue)
+            {
+                ModificationMotif.TryGetMotif(originalResidue.ToString(), out var motif);
+                return new Modification(
+                    idArrow,
+                    null,
+                    "1 nucleotide substitution",
+                    null,
+                    motif,
+                    "Anywhere.",
+                    null,
+                    0,
+                    new Dictionary<string, IList<string>>(),
+                    null,
+                    null,
+                    null,
+                    null,
+                    null);
+            }
+
+            static Modification MakeOther(string id)
+            {
+                ModificationMotif.TryGetMotif("P", out var motifP);
+                return new Modification(
+                    id,
+                    null,
+                    "oxidation",
+                    null,
+                    motifP,
+                    "Anywhere.",
+                    null,
+                    15.9949,
+                    new Dictionary<string, IList<string>>(),
+                    null,
+                    null,
+                    null,
+                    null,
+                    null);
+            }
+
+            static Modification MakeMalformed()
+            {
+                ModificationMotif.TryGetMotif("Q", out var motifQ);
+                return new Modification(
+                    "E>A",
+                    null,
+                    "1 nucleotide substitution",
+                    null,
+                    motifQ,
+                    "Anywhere.",
+                    null,
+                    0,
+                    new Dictionary<string, IList<string>>(),
+                    null,
+                    null,
+                    null,
+                    null,
+                    null);
+            }
+
+            void AddMod(Protein p, int pos, Modification m)
+            {
+                if (!p.OneBasedPossibleLocalizedModifications.TryGetValue(pos, out var list1))
+                {
+                    list1 = new List<Modification>();
+                    p.OneBasedPossibleLocalizedModifications[pos] = list1;
+                }
+                list1.Add(m);
+
+                if (!p.OriginalNonVariantModifications.TryGetValue(pos, out var list2))
+                {
+                    list2 = new List<Modification>();
+                    p.OriginalNonVariantModifications[pos] = list2;
+                }
+                list2.Add(m);
+            }
+
+            // Mods to seed
+            var modEtoA = MakeSub("E->A", 'E'); // pos 3
+            var modWtoK = MakeSub("W->K", 'W'); // pos 4
+            var modOxidP = MakeOther("Oxid_P"); // pos 5
+            var malformed = MakeMalformed();    // pos 6
+
+            AddMod(protein, 3, modEtoA);
+            AddMod(protein, 4, modWtoK);
+            AddMod(protein, 5, modOxidP);
+            AddMod(protein, 6, malformed);
+
+            // Pre-existing W->K (may be duplicated by converter if description differs)
+            protein.SequenceVariations.Add(new SequenceVariation(4, 4, "W", "K", "Existing substitution"));
+
+            // Act
+            protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
+
+            // Assert unique AA changes, not raw count (converter may add standardized duplicates)
+            var uniqueChanges = protein.SequenceVariations.Select(v => v.SimpleString()).Distinct().ToList();
+            Assert.That(uniqueChanges.Count, Is.EqualTo(2), "Expected exactly two unique substitutions (E3->A and W4->K).");
+
+            // Ensure E3->A exists
+            var eToA = protein.SequenceVariations.SingleOrDefault(v =>
+                v.OneBasedBeginPosition == 3 && v.OneBasedEndPosition == 3 &&
+                v.OriginalSequence == "E" && v.VariantSequence == "A");
+            Assert.That(eToA, Is.Not.Null, "E3->A variant was not created.");
+
+            // Ensure at least one W4->K exists
+            var wToKCount = protein.SequenceVariations.Count(v =>
+                v.OneBasedBeginPosition == 4 && v.OneBasedEndPosition == 4 &&
+                v.OriginalSequence == "W" && v.VariantSequence == "K");
+            Assert.That(wToKCount, Is.GreaterThanOrEqualTo(1), "Expected a W4->K variant.");
+
+            // Converted positions removed from OneBasedPossibleLocalizedModifications
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(3), Is.False);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(4), Is.False);
+
+            // Unrelated and malformed mods remain
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(5), Is.True);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications[5].Any(m => m.OriginalId == "Oxid_P"), Is.True);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(6), Is.True);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications[6].Any(m => m.OriginalId == "E>A"), Is.True);
+        }
+    }
 }
diff --git a/mzLib/Test/Transcriptomics/TestDbLoader.cs b/mzLib/Test/Transcriptomics/TestDbLoader.cs
index db5de6417..a9e590d06 100644
--- a/mzLib/Test/Transcriptomics/TestDbLoader.cs
+++ b/mzLib/Test/Transcriptomics/TestDbLoader.cs
@@ -164,7 +164,6 @@ public static void TestXmlWriterReaderAsBioPolymer()
             var rna = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, true, DecoyType.None, false, out var errors)
                 .Cast<IBioPolymer>().ToList();
             Assert.That(errors.Count, Is.EqualTo(0));
-
             var modString = "ID   Methylation\r\nMT   Biological\r\nPP   Anywhere.\r\nTG   G\r\nCF   C1H2\r\n" + @"//";
             var methylG = PtmListLoader.ReadModsFromString(modString, out List<(Modification, string)> modsOut).First();
 

From 676a89279661c29c0525eadc5bc4ac9b664c1b42 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Thu, 23 Oct 2025 09:59:16 -0500
Subject: [PATCH 09/38] chasing updates to variant call format

---
 mzLib/Omics/BioPolymer/VariantCallFormat.cs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
index 906d0ada0..dcadae128 100644
--- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs
+++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
@@ -1,6 +1,7 @@
 ﻿using System;
 using System.Collections.Generic;
 using System.Linq;
+using System.Reflection;
 
 namespace Omics.BioPolymer
 {
@@ -164,9 +165,9 @@ public VariantCallFormat(string description)
             AlternateAlleleString = vcfFields[4];
             Info = new SnpEffAnnotation(vcfFields[7]);
 
-            AlleleIndex = Info.Allele == null 
-                ? -1 
-                : AlternateAlleleString.Split(',').ToList().IndexOf(Info.Allele) + 1; // reference is zero
+            AlleleIndex = Info.Allele == null
+                ? -1
+                : AlternateAlleleString.Split(',').ToList().IndexOf(Info.Allele) + 1; // returns 1 - based index for ALT alleles, 0 if not found, -1 if Info.Allele is null
 
             // Format column tokens describe how to split each sample column
             Format = vcfFields[8];
@@ -185,7 +186,7 @@ public VariantCallFormat(string description)
 
                 // GT: split on '/' or '|' – separators removed intentionally.
                 string[] gt = genotypeFields.TryGetValue("GT", out var gtString)
-                    ? gtString.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries)
+                    ? gtString.Split(new[] { '/', '|' }) // if this is bugged, try adding ",StringSplitOptions.RemoveEmptyEntries" after { '/', '|' }
                     : Array.Empty<string>();
 
                 // Skip invalid or empty GT

From fb1f1476a27db8b7e99ef1901d8f377086f7e36e Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Thu, 23 Oct 2025 10:04:43 -0500
Subject: [PATCH 10/38] copilot suggestions

---
 mzLib/Omics/BioPolymer/VariantCallFormat.cs        | 2 +-
 mzLib/Test/DatabaseTests/VariantTests.cs           | 2 +-
 mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs | 1 -
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
index dcadae128..cd4cc6cb1 100644
--- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs
+++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
@@ -149,7 +149,7 @@ public VariantCallFormat(string description)
             // normalize them to actual tabs for parsing only. Leave Description intact.
             string parseLine = NormalizeTabsForParsing(description);
 
-            // Parse description into
+            // Parse description into VCF fields
             string[] vcfFields = parseLine.Split('\t');
             if (vcfFields.Length < 10)
             {
diff --git a/mzLib/Test/DatabaseTests/VariantTests.cs b/mzLib/Test/DatabaseTests/VariantTests.cs
index 603ca8b47..04b42bb99 100644
--- a/mzLib/Test/DatabaseTests/VariantTests.cs
+++ b/mzLib/Test/DatabaseTests/VariantTests.cs
@@ -99,7 +99,7 @@ public void ParseComprehensiveVcfExamples()
 
                     Assert.That(vcf.AlleleDepths.ContainsKey(key));
                     var parsedAd = vcf.AlleleDepths[key] ?? Array.Empty<string>();
-                    if (!(parsedAd.Length == 0 && expectedAdTokens.Length == 1 && expectedAdTokens[0] == "."))
+                    if (parsedAd.Length != 0 || expectedAdTokens.Length != 1 || expectedAdTokens[0] != ".")
                     {
                         Assert.That(parsedAd, Is.EqualTo(expectedAdTokens));
                     }
diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs
index a550bf422..9567ba6c1 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs
@@ -109,7 +109,6 @@ public static List<Protein> LoadProteinXML(string proteinDbLocation, bool genera
                             if (newProtein != null)
                             {
                                 //If we have read any modifications that are nucleotide substitutions, convert them to sequence variants here:
-                                //newProtein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
                                 if (newProtein.OneBasedPossibleLocalizedModifications.Any(m => m.Value.Any(mt => mt.ModificationType.Contains("nucleotide substitution"))))
                                 {
                                     newProtein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();

From 0c327b80ab5ad63e2a7703782e19a9b1b38d29e1 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Thu, 23 Oct 2025 10:15:20 -0500
Subject: [PATCH 11/38] h

---
 mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
index bb8436392..c10f912bd 100644
--- a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
+++ b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs
@@ -237,7 +237,7 @@ public void TestUpdateUniprot()
             var uniprotLocation = Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist.txt");
             Loaders.UpdateUniprot(uniprotLocation);
             Loaders.UpdateUniprot(uniprotLocation);
-        }
+        } 
 
         [Test]
         public void FilesEqualHash()

From c224908b84fcc397c3d871fccb9ed67a0c70332b Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Thu, 23 Oct 2025 10:21:42 -0500
Subject: [PATCH 12/38] added new paramater location to proteinxmlentry to
 reference specific isoform

---
 .../ProteinXmlEntry.cs                        | 24 ++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
index 68aaa399b..57849158e 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
@@ -8,7 +8,6 @@
 using Omics.Modifications;
 using Transcriptomics;
 using UsefulProteomicsDatabases.Transcriptomics;
-using System.Data;
 using Proteomics.ProteolyticDigestion;
 
 namespace UsefulProteomicsDatabases
@@ -53,10 +52,27 @@ public class ProteinXmlEntry
         public UniProtSequenceAttributes SequenceAttributes { get; set; } = null; // this is used to store the sequence attributes from the <sequence> element, if present
         private List<(int, string)> AnnotatedMods = new List<(int position, string originalModificationID)>();
         private List<(int, string)> AnnotatedVariantMods = new List<(int position, string originalModificationID)>();
+        // Captured isoform/sequence identifier from <location sequence="...">
+        private string LocationSequenceId;
 
         /// <summary>
-        /// Start parsing a protein XML element
+        /// Finalizes the parsing of a protein XML entry and constructs a <see cref="Protein"/> object.
+        /// This method is called when the end of an &lt;entry&gt; element is reached during XML parsing.
+        /// It sanitizes the sequence, prunes out-of-range sequence variants, resolves and attaches modifications,
+        /// and aggregates all parsed data (such as gene names, proteolysis products, sequence variations, disulfide bonds, and splice sites)
+        /// into a new <see cref="Protein"/> instance.
+        /// After construction, the internal state is cleared to prepare for the next entry.
         /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the &lt;entry&gt; element.</param>
+        /// <param name="isContaminant">Indicates whether the protein is a contaminant.</param>
+        /// <param name="proteinDbLocation">The file path or identifier of the protein database source.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the protein.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
+        /// <param name="decoyIdentifier">A string used to identify decoy proteins (default: "DECOY").</param>
+        /// <returns>
+        /// A constructed <see cref="Protein"/> object containing all parsed and resolved information,
+        /// or <c>null</c> if the entry is incomplete.
+        /// </returns>
         public void ParseElement(string elementName, XmlReader xml)
         {
             int outValue;
@@ -137,7 +153,9 @@ public void ParseElement(string elementName, XmlReader xml)
                     PropertyTypes.Add(xml.GetAttribute("type"));
                     PropertyValues.Add(xml.GetAttribute("value"));
                     break;
-
+                case "location":
+                    LocationSequenceId = xml.GetAttribute("sequence");
+                    break;
                 case "position":
                     OneBasedFeaturePosition = int.Parse(xml.GetAttribute("position"));
                     break;

From c20aa20260965df49a7ba8bd8720e3133089b7e3 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Thu, 23 Oct 2025 10:38:30 -0500
Subject: [PATCH 13/38] primitive tweaks to SequenceVariation

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs   |  11 +-
 .../ProteinXmlEntry.cs                        | 319 +++++++++++++-----
 2 files changed, 246 insertions(+), 84 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index c3c4ef9e2..fc30b4082 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -14,13 +14,14 @@ public class SequenceVariation
         /// <param name="originalSequence"></param>
         /// <param name="variantSequence"></param>
         /// <param name="oneBasedModifications"></param>
-        public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
+        public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string? variantCallFormatDataString = null, Dictionary<int, List<Modification>>? oneBasedModifications = null)
         {
             OneBasedBeginPosition = oneBasedBeginPosition;
             OneBasedEndPosition = oneBasedEndPosition;
             OriginalSequence = originalSequence ?? "";
             VariantSequence = variantSequence ?? "";
             Description = new VariantCallFormat(description);
+            VariantCallFormatData = variantCallFormatDataString is null ? null : new VariantCallFormat(variantCallFormatDataString);
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
@@ -33,8 +34,8 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
         /// <param name="variantSequence"></param>
         /// <param name="description"></param>
         /// <param name="oneBasedModifications"></param>
-        public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
-            : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications)
+        public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, string? variantCallFormatDataString = null, Dictionary<int, List<Modification>>? oneBasedModifications = null)
+            : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description,  variantCallFormatDataString = null, oneBasedModifications)
         { }
 
         /// <summary>
@@ -62,6 +63,10 @@ public SequenceVariation(int oneBasedPosition, string originalSequence, string v
         /// </summary>
         public VariantCallFormat Description { get; }
 
+        /// <summary>Optional multi-sample VCF record describing the variant (can be null or collapsed).</summary>
+        public VariantCallFormat? VariantCallFormatData { get; }
+        [Obsolete("Use VariantCallFormatData for structured data or Description/SearchableAnnotation for text.")]
+        public VariantCallFormat? LegacyVariantDescription => VariantCallFormatData;
         /// <summary>
         /// Modifications specifically for this variant
         /// </summary>
diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
index 57849158e..922ebb44e 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
@@ -1,14 +1,15 @@
-﻿using Proteomics;
+﻿using Omics.BioPolymer;
+using Omics.Modifications;
+using Proteomics;
+using Proteomics.ProteolyticDigestion;
 using System;
 using System.Collections.Generic;
+using System.Diagnostics;
 using System.Linq;
 using System.Text.RegularExpressions;
 using System.Xml;
-using Omics.BioPolymer;
-using Omics.Modifications;
 using Transcriptomics;
 using UsefulProteomicsDatabases.Transcriptomics;
-using Proteomics.ProteolyticDigestion;
 
 namespace UsefulProteomicsDatabases
 {
@@ -179,9 +180,12 @@ public void ParseElement(string elementName, XmlReader xml)
         }
 
         /// <summary>
-        /// Parses the attributes of the current <entry> element from the provided XmlReader.
-        /// Extracts and stores the values for dataset, created, modified, version, and xmlns attributes.
+        /// Parses and stores key metadata attributes from the current &lt;entry&gt; element in the XML.
+        /// This includes dataset, creation date, modification date, version, and XML namespace information.
+        /// The extracted values are assigned to the corresponding properties of the <see cref="ProteinXmlEntry"/> instance.
+        /// This method is typically called when the parser encounters the start of a protein entry in a UniProt or similar XML file.
         /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the &lt;entry&gt; element whose attributes are to be read.</param>
         private void ParseEntryAttributes(XmlReader xml)
         {
             DatasetEntryTag = xml.GetAttribute("dataset");
@@ -286,10 +290,12 @@ private static UniProtSequenceAttributes.FragmentType ParseFragmentType(string f
             return UniProtSequenceAttributes.FragmentType.unspecified;
         }
 
-        // Helper method to compute the monoisotopic mass of a sequence.
         /// <summary>
-        /// Computes the monoisotopic mass of the given sequence.
-        /// Returns 0 if the sequence is empty.
+        /// Computes the monoisotopic mass of a protein or nucleic acid sequence without modifications.
+        /// If the input sequence is null or empty, returns 0.
+        /// Internally, constructs a <see cref="PeptideWithSetModifications"/> using the provided sequence and an empty modification dictionary,
+        /// then returns the rounded monoisotopic mass as an integer.
+        /// This method is used to populate sequence attributes such as mass during XML parsing.
         /// </summary>
         private static int ComputeSequenceMass(string sequence)
         {
@@ -298,8 +304,25 @@ private static int ComputeSequenceMass(string sequence)
             return (int)Math.Round(new PeptideWithSetModifications(sequence, new Dictionary<string, Modification>()).MonoisotopicMass);
         }
         /// <summary>
-        /// Finish parsing at the end of an element
+        /// Handles the end of an XML element during protein database parsing, updating the internal state or finalizing objects as needed.
+        /// Depending on the element name, this method processes and stores feature, subfeature, database reference, gene, and organism information,
+        /// or, if the end of an &lt;entry&gt; element is reached, constructs and returns a fully populated <see cref="Protein"/> object.
+        /// For &lt;feature&gt; and &lt;subfeature&gt; elements, it attaches modifications or proteolytic products.
+        /// For &lt;dbReference&gt;, it records database cross-references.
+        /// For &lt;gene&gt; and &lt;organism&gt;, it updates parsing state flags.
+        /// For &lt;entry&gt;, it aggregates all parsed data, resolves modifications, and returns a new <see cref="Protein"/> instance,
+        /// clearing the internal state for the next entry.
         /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the current XML element.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the protein.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
+        /// <param name="isContaminant">Indicates whether the protein is a contaminant.</param>
+        /// <param name="proteinDbLocation">The file path or identifier of the protein database source.</param>
+        /// <param name="decoyIdentifier">A string used to identify decoy proteins (default: "DECOY").</param>
+        /// <returns>
+        /// A constructed <see cref="Protein"/> object if the end of an &lt;entry&gt; element is reached and all required data is present;
+        /// otherwise, <c>null</c>.
+        /// </returns>
         public Protein ParseEndElement(XmlReader xml, IEnumerable<string> modTypesToExclude, Dictionary<string, Modification> unknownModifications,
             bool isContaminant, string proteinDbLocation, string decoyIdentifier = "DECOY")
         {
@@ -330,7 +353,26 @@ public Protein ParseEndElement(XmlReader xml, IEnumerable<string> modTypesToExcl
             }
             return protein;
         }
-
+        /// <summary>
+        /// Handles the end of an XML element during RNA database parsing, updating the internal state or finalizing objects as needed.
+        /// Depending on the element name, this method processes and stores feature, subfeature, and database reference information,
+        /// or, if the end of an &lt;entry&gt; element is reached, constructs and returns a fully populated <see cref="RNA"/> object.
+        /// For &lt;feature&gt; and &lt;subfeature&gt; elements, it attaches modifications or truncation products.
+        /// For &lt;dbReference&gt;, it records database cross-references.
+        /// For &lt;gene&gt; and &lt;organism&gt;, it updates parsing state flags.
+        /// For &lt;entry&gt;, it aggregates all parsed data, resolves modifications, and returns a new <see cref="RNA"/> instance,
+        /// clearing the internal state for the next entry.
+        /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the current XML element.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the RNA.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
+        /// <param name="isContaminant">Indicates whether the RNA is a contaminant.</param>
+        /// <param name="rnaDbLocation">The file path or identifier of the RNA database source.</param>
+        /// <param name="decoyIdentifier">A string used to identify decoy RNAs (default: "DECOY").</param>
+        /// <returns>
+        /// A constructed <see cref="RNA"/> object if the end of an &lt;entry&gt; element is reached and all required data is present;
+        /// otherwise, <c>null</c>.
+        /// </returns>
         internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable<string> modTypesToExclude,
             Dictionary<string, Modification> unknownModifications,
             bool isContaminant, string rnaDbLocation, string decoyIdentifier = "DECOY")
@@ -364,8 +406,30 @@ internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable<string> modTypesToExc
         }
 
         /// <summary>
-        /// Finish parsing an entry
+        /// Finalizes the parsing of a protein XML entry and constructs a <see cref="Protein"/> object from the accumulated data.
+        /// This method is called when the end of an &lt;entry&gt; element is reached during XML parsing.
+        /// It performs several key tasks:
+        /// <list type="bullet">
+        ///   <item>Sanitizes the parsed sequence (e.g., replacing invalid amino acids with 'X').</item>
+        ///   <item>Prunes any sequence variants whose coordinates exceed the sequence length.</item>
+        ///   <item>Resolves and attaches all annotated modifications, excluding those of specified types or unknowns.</item>
+        ///   <item>Determines if the protein is a decoy based on the accession and decoy identifier.</item>
+        ///   <item>Aggregates all parsed data (gene names, proteolysis products, sequence variations, disulfide bonds, splice sites, database references, and sequence attributes) into a new <see cref="Protein"/> instance.</item>
+        ///   <item>Clears the internal state of the <see cref="ProteinXmlEntry"/> to prepare for parsing the next entry.</item>
+        /// </list>
+        /// If either the accession or sequence is missing, returns <c>null</c>.
         /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the &lt;entry&gt; element.</param>
+        /// <param name="isContaminant">Indicates whether the protein is a contaminant.</param>
+        /// <param name="proteinDbLocation">The file path or identifier of the protein database source.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the protein.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
+        /// <param name="decoyIdentifier">A string used to identify decoy proteins (default: "DECOY").</param>
+        /// <returns>
+        /// A constructed <see cref="Protein"/> object containing all parsed and resolved information,
+        /// or <c>null</c> if the entry is incomplete.
+        /// </returns>
+
         public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string proteinDbLocation, IEnumerable<string> modTypesToExclude, Dictionary<string, Modification> unknownModifications, string decoyIdentifier = "DECOY")
         {
             Protein result = null;
@@ -377,6 +441,8 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr
                 Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X');
 
                 ParseAnnotatedMods(OneBasedModifications, modTypesToExclude, unknownModifications, AnnotatedMods);
+                //prune any sequence variants whose coordinates exceed the known sequence length
+                PruneOutOfRangeSequenceVariants();
                 if (Accession.StartsWith(decoyIdentifier))
                 {
                     isDecoy = true;
@@ -388,7 +454,30 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr
             Clear();
             return result;
         }
-
+        /// <summary>
+        /// Finalizes the parsing of an RNA XML entry and constructs an <see cref="RNA"/> object from the accumulated data.
+        /// This method is called when the end of an &lt;entry&gt; element is reached during XML parsing for RNA records.
+        /// It performs several key tasks:
+        /// <list type="bullet">
+        ///   <item>Sanitizes the parsed sequence (e.g., replacing invalid characters with 'X').</item>
+        ///   <item>Prunes any sequence variants whose coordinates exceed the sequence length.</item>
+        ///   <item>Resolves and attaches all annotated modifications, excluding those of specified types or unknowns.</item>
+        ///   <item>Determines if the RNA is a decoy based on the accession and decoy identifier.</item>
+        ///   <item>Aggregates all parsed data (gene names, proteolysis products, sequence variations, and other metadata) into a new <see cref="RNA"/> instance.</item>
+        ///   <item>Clears the internal state of the <see cref="ProteinXmlEntry"/> to prepare for parsing the next entry.</item>
+        /// </list>
+        /// If either the accession or sequence is missing, returns <c>null</c>.
+        /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the &lt;entry&gt; element.</param>
+        /// <param name="isContaminant">Indicates whether the RNA is a contaminant.</param>
+        /// <param name="rnaDbLocation">The file path or identifier of the RNA database source.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the RNA.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
+        /// <param name="decoyIdentifier">A string used to identify decoy RNAs (default: "DECOY").</param>
+        /// <returns>
+        /// A constructed <see cref="RNA"/> object containing all parsed and resolved information,
+        /// or <c>null</c> if the entry is incomplete.
+        /// </returns>
         internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string rnaDbLocation,
             IEnumerable<string> modTypesToExclude, Dictionary<string, Modification> unknownModifications, string decoyIdentifier = "DECOY")
         {
@@ -399,6 +488,8 @@ internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string r
                 // sanitize the sequence to replace unexpected characters with X (unknown amino acid)
                 // sometimes strange characters get added by RNA sequencing software, etc.
                 Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X');
+                //prune any sequence variants whose coordinates exceed the known sequence length
+                PruneOutOfRangeSequenceVariants();
                 if (Accession.StartsWith(decoyIdentifier))
                 {
                     isDecoy = true;
@@ -425,8 +516,19 @@ public void ParseSubFeatureEndElement(XmlReader xml, IEnumerable<string> modType
         }
 
         /// <summary>
-        /// Finish parsing a feature element
+        /// Processes the end of a &lt;feature&gt; element during XML parsing and updates the internal state with the parsed feature information.
+        /// Depending on the feature type, this method:
+        /// <list type="bullet">
+        ///   <item>Adds modification annotations for "modified residue" and "lipid moiety-binding region" features.</item>
+        ///   <item>Creates and adds <see cref="TruncationProduct"/> objects for proteolytic features such as "peptide", "propeptide", "chain", and "signal peptide".</item>
+        ///   <item>Handles "sequence variant" features by creating <see cref="SequenceVariation"/> objects, including variant-specific modifications, and ensures they apply to the correct sequence or isoform.</item>
+        ///   <item>Creates and adds <see cref="DisulfideBond"/> or <see cref="SpliceSite"/> objects for their respective feature types, using available position information.</item>
+        /// </list>
+        /// After processing, resets feature-related state variables to prepare for the next feature.
         /// </summary>
+        /// <param name="xml">The <see cref="XmlReader"/> positioned at the end of the &lt;feature&gt; element.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from the protein.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved.</param>
         public void ParseFeatureEndElement(XmlReader xml, IEnumerable<string> modTypesToExclude, Dictionary<string, Modification> unknownModifications)
         {
             if (FeatureType == "modified residue")
@@ -463,24 +565,56 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable<string> modTypesTo
                     }
                     else
                     {
-                        type += ("null-null");
+                        type += "null-null";
                     }
                 }
                 ProteolysisProducts.Add(new TruncationProduct(OneBasedBeginPosition, OneBasedEndPosition, type));
             }
-            else if (FeatureType == "sequence variant" && VariationValue != null && VariationValue != "") // Only keep if there is variant sequence information and position information
+            else if (FeatureType == "sequence variant" && VariationValue != null && VariationValue != "")
             {
-                ParseAnnotatedMods(OneBasedVariantModifications, modTypesToExclude, unknownModifications, AnnotatedVariantMods);
-                if (OneBasedBeginPosition != null && OneBasedEndPosition != null)
+                bool appliesToThisSequence = true;
+                if (!string.IsNullOrEmpty(LocationSequenceId))
                 {
-                    SequenceVariations.Add(new SequenceVariation((int)OneBasedBeginPosition, (int)OneBasedEndPosition, OriginalValue, VariationValue, FeatureDescription, OneBasedVariantModifications));
+                    string acc = Accession ?? "";
+                    appliesToThisSequence =
+                        LocationSequenceId.Equals(acc, StringComparison.OrdinalIgnoreCase)
+                        || (!string.IsNullOrEmpty(acc) && LocationSequenceId.Equals($"{acc}-1", StringComparison.OrdinalIgnoreCase));
                 }
-                else if (OneBasedFeaturePosition >= 1)
+
+                if (appliesToThisSequence)
                 {
-                    SequenceVariations.Add(new SequenceVariation(OneBasedFeaturePosition, OriginalValue, VariationValue, FeatureDescription, OneBasedVariantModifications));
+                    ParseAnnotatedMods(OneBasedVariantModifications, modTypesToExclude, unknownModifications, AnnotatedVariantMods);
+
+                    // NOTE: We can NOT validate coordinate vs sequence length here because sequence is usually parsed later.
+                    // Validation is deferred to PruneOutOfRangeSequenceVariants() during ParseEntryEndElement.
+
+                    if (OneBasedBeginPosition != null && OneBasedEndPosition != null)
+                    {
+                        SequenceVariations.Add(
+                            new SequenceVariation(
+                                (int)OneBasedBeginPosition,
+                                (int)OneBasedEndPosition,
+                                OriginalValue,
+                                VariationValue,
+                                FeatureDescription,
+                                variantCallFormatDataString: null,
+                                oneBasedModifications: OneBasedVariantModifications));
+                    }
+                    else if (OneBasedFeaturePosition >= 1)
+                    {
+                        SequenceVariations.Add(
+                            new SequenceVariation(
+                                OneBasedFeaturePosition,
+                                OriginalValue,
+                                VariationValue,
+                                FeatureDescription,
+                                variantCallFormatDataString: null,
+                                oneBasedModifications: OneBasedVariantModifications));
+                    }
+
+                    AnnotatedVariantMods = new List<(int, string)>();
+                    OneBasedVariantModifications = new Dictionary<int, List<Modification>>();
                 }
-                AnnotatedVariantMods = new List<(int, string)>();
-                OneBasedVariantModifications = new Dictionary<int, List<Modification>>();
             }
             else if (FeatureType == "disulfide bond")
             {
@@ -509,64 +643,7 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable<string> modTypesTo
             OneBasedFeaturePosition = -1;
             OriginalValue = "";
             VariationValue = "";
-        }
-
-        private static void ParseAnnotatedMods(Dictionary<int, List<Modification>> destination, IEnumerable<string> modTypesToExclude,
-            Dictionary<string, Modification> unknownModifications, List<(int, string)> annotatedMods)
-        {
-            foreach (var annotatedMod in annotatedMods)
-            {
-                string annotatedId = annotatedMod.Item2;
-                int annotatedModLocation = annotatedMod.Item1;
-
-                if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod)
-                    || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod))
-                {
-                    // if the list of known mods contains this IdWithMotif
-                    if (!modTypesToExclude.Contains(foundMod.ModificationType))
-                    {
-                        if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
-                        {
-                            listOfModsAtThisLocation.Add(foundMod);
-                        }
-                        else
-                        {
-                            destination.Add(annotatedModLocation, new List<Modification> { foundMod });
-                        }
-                    }
-                    // else - the mod ID was found but the motif didn't fit the annotated location
-                }
-
-                // no known mod - try looking it up in the dictionary of mods without motif appended
-                else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList<Modification> mods)
-                         || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods))
-                {
-                    foreach (Modification mod in mods)
-                    {
-                        if (!modTypesToExclude.Contains(mod.ModificationType))
-                        {
-                            if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
-                            {
-                                listOfModsAtThisLocation.Add(mod);
-                            }
-                            else
-                            {
-                                destination.Add(annotatedModLocation, new List<Modification> { mod });
-                            }
-                            break;
-                        }
-                    }
-                }
-                else
-                {
-                    // could not find the annotated mod's ID in our list of known mods - it's an unknown mod
-                    // I don't think this really does anything...
-                    if (!unknownModifications.ContainsKey(annotatedId))
-                    {
-                        unknownModifications.Add(annotatedId, new Modification(annotatedId));
-                    }
-                }
-            }
+            LocationSequenceId = null;
         }
 
         /// <summary>
@@ -622,6 +699,86 @@ private void Clear()
             GeneNames = new List<Tuple<string, string>>();
             ReadingGene = false;
             ReadingOrganism = false;
+            LocationSequenceId = null;
+            AnnotatedVariantMods = new List<(int, string)>();
+            OneBasedVariantModifications = new Dictionary<int, List<Modification>>();
+        }
+        /// <summary>
+        /// Resolves and attaches annotated modifications to the specified destination dictionary based on parsed feature or variant annotations.
+        /// For each annotated modification, attempts to look up the modification by its identifier (with motif) in both protein and RNA modification dictionaries.
+        /// If found and not excluded by <paramref name="modTypesToExclude"/>, the modification is added to the destination at the specified position.
+        /// If not found by identifier, attempts to resolve the modification by possible matches (without motif) and adds the first non-excluded match.
+        /// If no match is found, records the modification as unknown in <paramref name="unknownModifications"/> to avoid repeated warnings.
+        /// This method is used to populate the protein or variant modification dictionaries during XML parsing.
+        /// </summary>
+        /// <param name="destination">Dictionary mapping one-based positions to lists of modifications to be populated.</param>
+        /// <param name="modTypesToExclude">A collection of modification types to exclude from assignment.</param>
+        /// <param name="unknownModifications">A dictionary to collect modifications that could not be resolved by identifier or type.</param>
+        /// <param name="annotatedMods">List of (position, modification identifier) tuples parsed from XML features or subfeatures.</param>
+        private static void ParseAnnotatedMods(
+            Dictionary<int, List<Modification>> destination,
+            IEnumerable<string> modTypesToExclude,
+            Dictionary<string, Modification> unknownModifications,
+            List<(int position, string originalModificationID)> annotatedMods)
+        {
+            foreach (var (annotatedModLocation, annotatedId) in annotatedMods)
+            {
+                if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod)
+                    || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod))
+                {
+                    if (!modTypesToExclude.Contains(foundMod.ModificationType))
+                    {
+                        if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
+                        {
+                            listOfModsAtThisLocation.Add(foundMod);
+                        }
+                        else
+                        {
+                            destination.Add(annotatedModLocation, new List<Modification> { foundMod });
+                        }
+                    }
+                }
+                else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList<Modification> mods)
+                         || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods))
+                {
+                    foreach (Modification mod in mods)
+                    {
+                        if (!modTypesToExclude.Contains(mod.ModificationType))
+                        {
+                            if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
+                            {
+                                listOfModsAtThisLocation.Add(mod);
+                            }
+                            else
+                            {
+                                destination.Add(annotatedModLocation, new List<Modification> { mod });
+                            }
+                            break;
+                        }
+                    }
+                }
+                else
+                {
+                    if (!unknownModifications.ContainsKey(annotatedId))
+                    {
+                        unknownModifications.Add(annotatedId, new Modification(annotatedId));
+                    }
+                }
+            }
+        }
+        private void PruneOutOfRangeSequenceVariants()
+        {
+            if (string.IsNullOrEmpty(Sequence) || SequenceVariations.Count == 0)
+                return;
+
+            int len = Sequence.Length;
+            int removed = SequenceVariations.RemoveAll(v =>
+                v.OneBasedBeginPosition > len || v.OneBasedEndPosition > len);
+
+            if (removed > 0)
+            {
+                Trace.TraceWarning($"Pruned {removed} out-of-range sequence variant(s) for accession {Accession} (protein length {len}).");
+            }
         }
     }
 }
\ No newline at end of file

From 5853a1b4f2cd3a156c5ef73931c39cd84466f783 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Thu, 23 Oct 2025 10:54:36 -0500
Subject: [PATCH 14/38] lets see

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs        | 7 +++----
 mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs | 4 ++--
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index fc30b4082..63619b1c8 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -14,14 +14,13 @@ public class SequenceVariation
         /// <param name="originalSequence"></param>
         /// <param name="variantSequence"></param>
         /// <param name="oneBasedModifications"></param>
-        public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string? variantCallFormatDataString = null, Dictionary<int, List<Modification>>? oneBasedModifications = null)
+        public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
         {
             OneBasedBeginPosition = oneBasedBeginPosition;
             OneBasedEndPosition = oneBasedEndPosition;
             OriginalSequence = originalSequence ?? "";
             VariantSequence = variantSequence ?? "";
             Description = new VariantCallFormat(description);
-            VariantCallFormatData = variantCallFormatDataString is null ? null : new VariantCallFormat(variantCallFormatDataString);
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
@@ -34,8 +33,8 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
         /// <param name="variantSequence"></param>
         /// <param name="description"></param>
         /// <param name="oneBasedModifications"></param>
-        public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, string? variantCallFormatDataString = null, Dictionary<int, List<Modification>>? oneBasedModifications = null)
-            : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description,  variantCallFormatDataString = null, oneBasedModifications)
+        public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
+            : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications)
         { }
 
         /// <summary>
diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
index 922ebb44e..dd6315e93 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs
@@ -597,7 +597,7 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable<string> modTypesTo
                                 OriginalValue,
                                 VariationValue,
                                 FeatureDescription,
-                                variantCallFormatDataString: null,
+                                //variantCallFormatDataString: null,
                                 oneBasedModifications: OneBasedVariantModifications));
                     }
                     else if (OneBasedFeaturePosition >= 1)
@@ -608,7 +608,7 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable<string> modTypesTo
                                 OriginalValue,
                                 VariationValue,
                                 FeatureDescription,
-                                variantCallFormatDataString: null,
+                                //variantCallFormatDataString: null,
                                 oneBasedModifications: OneBasedVariantModifications));
                     }
 

From 71d24a858cc349873f261b94a9a96f653555cc19 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 14:29:07 -0500
Subject: [PATCH 15/38] expand test AppliedVariants

---
 .../Test/DatabaseTests/TestVariantProtein.cs  | 192 +++++++++++++++---
 1 file changed, 162 insertions(+), 30 deletions(-)

diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 8ced7b208..52731ca0a 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -1,20 +1,21 @@
-﻿using System;
-using System.Collections;
-using System.Collections.Generic;
-using System.IO;
-using System.Linq;
+﻿using Chemistry;
+using MassSpectrometry;
 using NUnit.Framework;
+using NUnit.Framework.Legacy;
+using Omics;
 using Omics.BioPolymer;
-using Assert = NUnit.Framework.Legacy.ClassicAssert;
 using Omics.Modifications;
 using Proteomics;
 using Proteomics.ProteolyticDigestion;
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Transcriptomics;
 using UsefulProteomicsDatabases;
+using Assert = NUnit.Framework.Legacy.ClassicAssert;
 using Stopwatch = System.Diagnostics.Stopwatch;
-using Omics;
-using Transcriptomics;
-using MassSpectrometry;
-using Chemistry;
 
 namespace Test.DatabaseTests
 {
@@ -284,56 +285,187 @@ public static void HomozygousVariantsAtVariedDepths(string filename, int minVari
             var variantProteins = proteins[0].GetVariantBioPolymers();
             List<PeptideWithSetModifications> peptides = proteins.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList();
         }
-
         [Test]
         public static void AppliedVariants()
         {
-            ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
-            Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary<string, IList<string>>(), null, null, null, null, null);
+            // This test verifies that applying sequence variations (SAV, MNV, insertion, deletion)
+            // produces the correct variant protein sequences, maps variant coordinates correctly,
+            // preserves per-variant metadata (AppliedSequenceVariations), and remains stable across:
+            // - repeated in-memory application,
+            // - round-tripping through XML (write → read).
+            //
+            // Additionally, it verifies that a modification attached to a variant (protein5)
+            // is persisted and localized at the expected one-based position after application
+            // and after XML reload.
 
+            // Define a simple P-specific modification used later to validate that modifications
+            // attached to a variant are preserved and localized correctly after applying variants
+            // and XML round-tripping.
+            ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
+            Modification mp = new Modification(
+                _originalId: "mod",
+                _accession: null,
+                _modificationType: "type",
+                _featureType: null,
+                _target: motifP,
+                _locationRestriction: "Anywhere.",
+                _chemicalFormula: null,
+                _monoisotopicMass: 42.01,
+                _databaseReference: new Dictionary<string, IList<string>>(),
+                _taxonomicRange: null,
+                _keywords: null,
+                _neutralLosses: null,
+                _diagnosticIons: null,
+                _fileOrigin: null);
+
+            // Prepare five proteins that each have one sequence variation:
+            // 1) protein1: Single Amino-acid Variant (SAV) P→V at position 4 (4..4)
+            // 2) protein2: Multi-Nucleotide Variant (MNV) PT→KT spanning positions 4..5
+            // 3) protein3: Insertion-like replacement: P→PPP at position 4 (longer variant)
+            // 4) protein4: Deletion-like replacement: PPP→P spanning 4..6 (shorter variant)
+            // 5) protein5: Same as (3) but with a modification attached at one-based index 5
+            //    to verify mod persistence/localization through variant application and XML.
             List<Protein> proteinsWithSeqVars = new List<Protein>
             {
-                new Protein("MPEPTIDE", "protein1", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein2", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein5", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary<int, List<Modification>> {{ 5, new[] { mp }.ToList() } }) }),
-             };
+                new Protein("MPEPTIDE", "protein1",
+                    sequenceVariations: new List<SequenceVariation>
+                    {
+                        // SAV: P(4) -> V(4)
+                        new SequenceVariation(4, 4, "P", "V",
+                            @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+                    }),
+                new Protein("MPEPTIDE", "protein2",
+                    sequenceVariations: new List<SequenceVariation>
+                    {
+                        // MNV: PT(4..5) -> KT(4..5)
+                        new SequenceVariation(4, 5, "PT", "KT",
+                            @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+                    }),
+                new Protein("MPEPTIDE", "protein3",
+                    sequenceVariations: new List<SequenceVariation>
+                    {
+                        // Insertion-like: P(4) -> PPP(4..6) (length +2)
+                        new SequenceVariation(4, 4, "P", "PPP",
+                            @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+                    }),
+                new Protein("MPEPPPTIDE", "protein4",
+                    sequenceVariations: new List<SequenceVariation>
+                    {
+                        // Deletion-like: PPP(4..6) -> P(4) (length -2)
+                        new SequenceVariation(4, 6, "PPP", "P",
+                            @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+                    }),
+                new Protein("MPEPTIDE", "protein5",
+                    sequenceVariations: new List<SequenceVariation>
+                    {
+                        // Insertion-like with a downstream mod to verify mod localization at 5
+                        new SequenceVariation(4, 4, "P", "PPP",
+                            @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30",
+                            new Dictionary<int, List<Modification>> { { 5, new[] { mp }.ToList() } })
+                    }),
+            };
+
+            // Apply variants in memory twice; the output should be stable and identical.
             var proteinsWithAppliedVariants = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList();
             var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); // should be stable
+
+            // Round-trip through XML: write the variant-bearing proteins (targets only) and read back.
+            // This ensures that variant application and metadata survive I/O and result identically.
             string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml");
             ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteinsWithSeqVars, xml);
             var proteinsWithAppliedVariants3 = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un);
 
+            // Compare across all three sources:
+            // - [0]: in-memory 1
+            // - [1]: in-memory 2 (should match [0])
+            // - [2]: XML reload (should match [0] and [1])
             var listArray = new[] { proteinsWithAppliedVariants, proteinsWithAppliedVariants2, proteinsWithAppliedVariants3 };
+
+            // Assert we always get exactly five variant proteins in the same order
+            // (SAV, MNV, insertion, deletion, insertion+mod).
+            Assert.AreEqual(5, proteinsWithAppliedVariants.Count, "Expected 5 applied variants (in-memory #1).");
+            Assert.AreEqual(5, proteinsWithAppliedVariants2.Count, "Expected 5 applied variants (in-memory #2).");
+            Assert.AreEqual(5, proteinsWithAppliedVariants3.Count, "Expected 5 applied variants (XML reload).");
+
+            // The expected sequences for each of the five applied variants, in order:
+            // 0: "MPEVTIDE"  (SAV at 4: P->V)
+            // 1: "MPEKTIDE"  (MNV at 4..5: PT->KT)
+            // 2: "MPEPPPTIDE" (Insertion-like at 4: P->PPP; length +2)
+            // 3: "MPEPTIDE"  (Deletion-like at 4..6: PPP->P; length -2 from "MPEPPPTIDE")
+            // 4: "MPEPPPTIDE" (Insertion-like with mod at 5)
             for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
             {
                 // sequences
-                Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence);
-                Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence);
-                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence);
-                Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence);
-                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence);
-                Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key);
-
+                Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence, $"[{dbIdx}] SAV sequence mismatch");
+                Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence, $"[{dbIdx}] MNV sequence mismatch");
+                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence, $"[{dbIdx}] insertion sequence mismatch");
+                Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence, $"[{dbIdx}] deletion sequence mismatch");
+                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion+mod sequence mismatch");
+
+                // Confirm the modification attached to protein5 survives application and XML round-trip
+                Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] Expected exactly 1 mod on the insertion+mod variant");
+                Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key, $"[{dbIdx}] Mod should localize to one-based position 5");
+                // Sanity: ensure the residue under the mod is indeed P
+                Assert.AreEqual('P', listArray[dbIdx][4].BaseSequence[5 - 1], $"[{dbIdx}] Residue at mod position should be 'P'");
+
+                // SAV expectations: single-residue change; length unchanged; position 4 is 'V'
+                Assert.AreEqual(8, listArray[dbIdx][0].BaseSequence.Length, $"[{dbIdx}] SAV length should be unchanged");
+                Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] SAV begin mismatch");
+                Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] SAV end mismatch");
+                Assert.AreEqual("P", listArray[dbIdx][0].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] SAV original should be 'P'");
+                Assert.AreEqual("V", listArray[dbIdx][0].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] SAV variant should be 'V'");
+                Assert.AreEqual('V', listArray[dbIdx][0].BaseSequence[4 - 1], $"[{dbIdx}] SAV residue at 4 should be 'V'");
+
+                // MNV expectations: multi-residue change; length unchanged; positions 4..5 become "KT"
+                Assert.AreEqual(8, listArray[dbIdx][1].BaseSequence.Length, $"[{dbIdx}] MNV length should be unchanged");
+                Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] MNV begin mismatch");
+                Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] MNV end mismatch");
+                Assert.AreEqual("PT", listArray[dbIdx][1].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] MNV original should be 'PT'");
+                Assert.AreEqual("KT", listArray[dbIdx][1].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] MNV variant should be 'KT'");
+                Assert.AreEqual("KT", listArray[dbIdx][1].BaseSequence.Substring(4 - 1, 2), $"[{dbIdx}] MNV residues 4..5 should be 'KT'");
+
+                // insertion expectations: length grows by +2; positions 4..6 are "PPP"
+                Assert.AreEqual(10, listArray[dbIdx][2].BaseSequence.Length, $"[{dbIdx}] insertion length should be 8 + 2 = 10");
+                Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] insertion begin mismatch");
+                Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] insertion end mismatch");
+                Assert.AreEqual("P", listArray[dbIdx][2].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] insertion original should be 'P'");
+                Assert.AreEqual("PPP", listArray[dbIdx][2].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] insertion variant should be 'PPP'");
+                Assert.AreEqual("PPP", listArray[dbIdx][2].BaseSequence.Substring(4 - 1, 3), $"[{dbIdx}] insertion residues 4..6 should be 'PPP'");
+
+                // deletion expectations: length shrinks by -2 relative to the starting "MPEPPPTIDE" (10 → 8)
+                // and positions collapse so that sequence returns to "MPEPTIDE".
+                Assert.AreEqual(8, listArray[dbIdx][3].BaseSequence.Length, $"[{dbIdx}] deletion length should be 10 - 2 = 8");
+                Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] deletion begin mismatch");
+                Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] deletion end mismatch (post-collapse)");
+                Assert.AreEqual("PPP", listArray[dbIdx][3].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] deletion original should be 'PPP'");
+                Assert.AreEqual("P", listArray[dbIdx][3].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] deletion variant should be 'P'");
+
+                // For completeness, also assert the summarized begin/end expectations that the original test verified.
                 // SAV
                 Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition);
                 Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
                 // MNV
                 Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition);
                 Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
                 // insertion
                 Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition);
                 Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
                 // deletion
                 Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition);
                 Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition);
             }
-        }
 
+            // Ensure exact stability across the three sources:
+            // - All sequences in in-memory #1 equal in-memory #2 and XML reload.
+            CollectionAssert.AreEqual(
+                proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+                proteinsWithAppliedVariants2.Select(p => p.BaseSequence).ToList(),
+                "In-memory application should be stable across repeated calls");
+            CollectionAssert.AreEqual(
+                proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+                proteinsWithAppliedVariants3.Select(p => p.BaseSequence).ToList(),
+                "XML round-trip should preserve variant-applied sequences in the same order");
+        }
         [Test]
         public static void AppliedVariants_AsIBioPolymer()
         {

From c4bf45465e1836aa18f0691e7518f40ca0cd54dd Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 14:32:27 -0500
Subject: [PATCH 16/38] expand test AppliedVariants_AsIBioPolymer

---
 .../Test/DatabaseTests/TestVariantProtein.cs  | 189 +++++++++++++++---
 1 file changed, 160 insertions(+), 29 deletions(-)

diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 52731ca0a..594bc0368 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -469,23 +469,74 @@ public static void AppliedVariants()
         [Test]
         public static void AppliedVariants_AsIBioPolymer()
         {
+            // PURPOSE
+            // This test mirrors "AppliedVariants" but exercises the IBioPolymer interface path.
+            // It validates, in detail:
+            // 1) Correct application of four variant types (SAV, MNV, insertion, deletion) to produce expected sequences.
+            // 2) Correct coordinates (begin/end), original/variant strings in AppliedSequenceVariations after application.
+            // 3) Stability of results across:
+            //    - repeated in-memory variant application,
+            //    - XML round-trip (write → read).
+            // 4) Modification propagation and localization for a variant carrying a downstream mod.
+            // 5) Distinguishing two variants with identical sequences by their modification state (index 2 vs 4).
+            //
+            // EXPECTATIONS SUMMARY
+            // - Variant sequences (in order): "MPEVTIDE", "MPEKTIDE", "MPEPPPTIDE", "MPEPTIDE", "MPEPPPTIDE".
+            // - AppliedSequenceVariations:
+            //   SAV       (idx 0): begin=4, end=4, original="P",   variant="V",   len=8
+            //   MNV       (idx 1): begin=4, end=5, original="PT",  variant="KT",  len=8
+            //   Insertion (idx 2): begin=4, end=6, original="P",   variant="PPP", len=10
+            //   Deletion  (idx 3): begin=4, end=4, original="PPP", variant="P",   len=8
+            //   Insert+Mod(idx 4): same sequence as insertion, plus exactly one mod at one-based index 5 targeting a 'P'.
+            //
+            // NOTE: Lists [0], [1], [2] below represent:
+            //   [0] in-memory application (first call)
+            //   [1] in-memory application (second call, should be identical/stable)
+            //   [2] XML round-trip results, must match [0] and [1]
+
+            // Arrange: create a P-specific modification used to test propagation and localization
             ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
-            Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary<string, IList<string>>(), null, null, null, null, null);
+            Modification mp = new Modification(
+                _originalId: "mod",
+                _accession: null,
+                _modificationType: "type",
+                _featureType: null,
+                _target: motifP,
+                _locationRestriction: "Anywhere.",
+                _chemicalFormula: null,
+                _monoisotopicMass: 42.01,
+                _databaseReference: new Dictionary<string, IList<string>>(),
+                _taxonomicRange: null,
+                _keywords: null,
+                _neutralLosses: null,
+                _diagnosticIons: null,
+                _fileOrigin: null);
 
+            // Arrange: build five proteins (as IBioPolymer) with one sequence variation each
+            // 1) SAV P(4)->V
+            // 2) MNV PT(4..5)->KT
+            // 3) Insertion-like P(4)->PPP(4..6)
+            // 4) Deletion-like PPP(4..6)->P(4) on a longer starting sequence
+            // 5) Insertion-like with a downstream mod at one-based index 5
             List<IBioPolymer> proteinsWithSeqVars = new List<IBioPolymer>
             {
-                new Protein("MPEPTIDE", "protein1", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein2", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
-                new Protein("MPEPTIDE", "protein5", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary<int, List<Modification>> {{ 5, new[] { mp }.ToList() } }) }),
+                new Protein("MPEPTIDE",   "protein1", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P",   "V",   @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+                new Protein("MPEPTIDE",   "protein2", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 5, "PT",  "KT",  @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+                new Protein("MPEPTIDE",   "protein3", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P",   "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+                new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 6, "PPP", "P",   @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+                new Protein("MPEPTIDE",   "protein5", sequenceVariations: new List<SequenceVariation> { new SequenceVariation(4, 4, "P",   "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary<int, List<Modification>> { { 5, new[] { mp }.ToList() } }) }),
              };
+
+            // Act: apply variants in-memory twice (should be identical/stable)
             var proteinsWithAppliedVariants = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList();
-            var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); // should be stable
+            var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList();
+
+            // Act: write to XML and load back; results should match in-memory outputs
             string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml");
             ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteinsWithSeqVars, xml);
             var proteinsWithAppliedVariants3 = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un);
 
+            // Group lists for uniform validation loops
             var listArray = new List<IBioPolymer>[]
             {
                 proteinsWithAppliedVariants,
@@ -493,34 +544,114 @@ public static void AppliedVariants_AsIBioPolymer()
                 proteinsWithAppliedVariants3.Cast<IBioPolymer>().ToList()
             };
 
-            for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
-            {
-                // sequences
-                Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence);
-                Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence);
-                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence);
-                Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence);
-                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence);
-                Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key);
+            // Assert: each expansion produces exactly 5 variant biopolymers, in the same, predictable order
+            Assert.AreEqual(5, proteinsWithAppliedVariants.Count, "Expected 5 variants (in-memory 1)");
+            Assert.AreEqual(5, proteinsWithAppliedVariants2.Count, "Expected 5 variants (in-memory 2)");
+            Assert.AreEqual(5, proteinsWithAppliedVariants3.Count, "Expected 5 variants (XML reload)");
 
-                // SAV
-                Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition);
-                Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition);
+            // Assert: stability across the three sources (same sequences, same order)
+            CollectionAssert.AreEqual(
+                proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+                proteinsWithAppliedVariants2.Select(p => p.BaseSequence).ToList(),
+                "In-memory application should be stable across repeated calls");
+            CollectionAssert.AreEqual(
+                proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+                proteinsWithAppliedVariants3.Select(p => p.BaseSequence).ToList(),
+                "XML round-trip should preserve variant-applied sequences in the same order");
 
-                // MNV
-                Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition);
-                Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition);
+            // Assert: for all variants we expect exactly one applied sequence variation
+            foreach (var variants in listArray)
+            {
+                Assert.AreEqual(1, variants[0].AppliedSequenceVariations.Count, "SAV must have exactly one applied variant");
+                Assert.AreEqual(1, variants[1].AppliedSequenceVariations.Count, "MNV must have exactly one applied variant");
+                Assert.AreEqual(1, variants[2].AppliedSequenceVariations.Count, "Insertion must have exactly one applied variant");
+                Assert.AreEqual(1, variants[3].AppliedSequenceVariations.Count, "Deletion must have exactly one applied variant");
+                Assert.AreEqual(1, variants[4].AppliedSequenceVariations.Count, "Insertion+Mod must have exactly one applied variant");
+            }
 
-                // insertion
-                Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition);
-                Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition);
+            // Per-list validation of sequence, coordinates, and (where appropriate) residue checks and mod checks
+            for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
+            {
+                // Assert: expected sequences in fixed order
+                Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence, $"[{dbIdx}] SAV sequence mismatch");
+                Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence, $"[{dbIdx}] MNV sequence mismatch");
+                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence, $"[{dbIdx}] insertion sequence mismatch");
+                Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence, $"[{dbIdx}] deletion sequence mismatch");
+                Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion+mod sequence mismatch");
 
-                // deletion
-                Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition);
-                Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition);
+                // Assert: lengths (sanity for ins/del)
+                Assert.AreEqual(8, listArray[dbIdx][0].BaseSequence.Length, $"[{dbIdx}] SAV length should be unchanged");
+                Assert.AreEqual(8, listArray[dbIdx][1].BaseSequence.Length, $"[{dbIdx}] MNV length should be unchanged");
+                Assert.AreEqual(10, listArray[dbIdx][2].BaseSequence.Length, $"[{dbIdx}] insertion length should be 8 + 2 = 10");
+                Assert.AreEqual(8, listArray[dbIdx][3].BaseSequence.Length, $"[{dbIdx}] deletion length should be 10 - 2 = 8");
+                Assert.AreEqual(10, listArray[dbIdx][4].BaseSequence.Length, $"[{dbIdx}] insertion+mod length should be 10");
+
+                // SAV assertions: P(4)->V
+                var sav = listArray[dbIdx][0].AppliedSequenceVariations.Single();
+                Assert.AreEqual(4, sav.OneBasedBeginPosition, $"[{dbIdx}] SAV begin");
+                Assert.AreEqual(4, sav.OneBasedEndPosition, $"[{dbIdx}] SAV end");
+                Assert.AreEqual("P", sav.OriginalSequence, $"[{dbIdx}] SAV original");
+                Assert.AreEqual("V", sav.VariantSequence, $"[{dbIdx}] SAV variant");
+                Assert.AreEqual('V', listArray[dbIdx][0].BaseSequence[3], $"[{dbIdx}] SAV residue at 4 must be 'V'");
+
+                // MNV assertions: PT(4..5)->KT
+                var mnv = listArray[dbIdx][1].AppliedSequenceVariations.Single();
+                Assert.AreEqual(4, mnv.OneBasedBeginPosition, $"[{dbIdx}] MNV begin");
+                Assert.AreEqual(5, mnv.OneBasedEndPosition, $"[{dbIdx}] MNV end");
+                Assert.AreEqual("PT", mnv.OriginalSequence, $"[{dbIdx}] MNV original");
+                Assert.AreEqual("KT", mnv.VariantSequence, $"[{dbIdx}] MNV variant");
+                Assert.AreEqual("KT", listArray[dbIdx][1].BaseSequence.Substring(3, 2), $"[{dbIdx}] MNV residues 4..5 must be 'KT'");
+
+                // Insertion-like assertions: P(4)->PPP(4..6)
+                var ins = listArray[dbIdx][2].AppliedSequenceVariations.Single();
+                Assert.AreEqual(4, ins.OneBasedBeginPosition, $"[{dbIdx}] insertion begin");
+                Assert.AreEqual(6, ins.OneBasedEndPosition, $"[{dbIdx}] insertion end");
+                Assert.AreEqual("P", ins.OriginalSequence, $"[{dbIdx}] insertion original");
+                Assert.AreEqual("PPP", ins.VariantSequence, $"[{dbIdx}] insertion variant");
+                Assert.AreEqual("PPP", listArray[dbIdx][2].BaseSequence.Substring(3, 3), $"[{dbIdx}] insertion residues 4..6 must be 'PPP'");
+
+                // Deletion-like assertions: PPP(4..6)->P(4) (collapses back to MPEPTIDE)
+                var del = listArray[dbIdx][3].AppliedSequenceVariations.Single();
+                Assert.AreEqual(4, del.OneBasedBeginPosition, $"[{dbIdx}] deletion begin");
+                Assert.AreEqual(4, del.OneBasedEndPosition, $"[{dbIdx}] deletion end (post-collapse)");
+                Assert.AreEqual("PPP", del.OriginalSequence, $"[{dbIdx}] deletion original");
+                Assert.AreEqual("P", del.VariantSequence, $"[{dbIdx}] deletion variant");
+
+                // Insertion + Modification assertions: identical sequence to insertion, plus one mod at pos 5
+                var insMod = listArray[dbIdx][4].AppliedSequenceVariations.Single();
+                Assert.AreEqual(4, insMod.OneBasedBeginPosition, $"[{dbIdx}] insertion+mod begin");
+                Assert.AreEqual(6, insMod.OneBasedEndPosition, $"[{dbIdx}] insertion+mod end");
+                Assert.AreEqual("P", insMod.OriginalSequence, $"[{dbIdx}] insertion+mod original");
+                Assert.AreEqual("PPP", insMod.VariantSequence, $"[{dbIdx}] insertion+mod variant");
+
+                // Mods: Only the "insertion+mod" variant (index 4) carries a mod; all others should have none
+                // Confirm the variant 4 has exactly one mod at one-based position 5, and the residue at 5 is 'P' (motif).
+                if (dbIdx == 0 || dbIdx == 1 || dbIdx == 2) // only assert detailed mod state once per path for clarity
+                {
+                    // Index 2 (plain insertion) must have no mods
+                    Assert.AreEqual(0, listArray[dbIdx][2].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion should have no mods");
+
+                    // Index 4 (insertion+mod) must have exactly one mod at site 5, targeting a P residue
+                    Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion+mod should have exactly one site with mods");
+                    Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key, $"[{dbIdx}] insertion+mod site should be one-based index 5");
+                    Assert.AreEqual('P', listArray[dbIdx][4].BaseSequence[5 - 1], $"[{dbIdx}] insertion+mod residue at site 5 must be 'P' (motif)");
+
+                    // All other variants should have zero possible localized mods
+                    Assert.AreEqual(0, listArray[dbIdx][0].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] SAV should have no mods");
+                    Assert.AreEqual(0, listArray[dbIdx][1].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] MNV should have no mods");
+                    Assert.AreEqual(0, listArray[dbIdx][3].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] deletion should have no mods");
+                }
             }
-        }
 
+            // Additional cross-list checks:
+            // - The two "MPEPPPTIDE" variants (indices 2 and 4) should be sequence-identical, but differ by modification presence.
+            for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
+            {
+                Assert.AreEqual(listArray[dbIdx][2].BaseSequence, listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion and insertion+mod sequences must match");
+                Assert.AreEqual(0, listArray[dbIdx][2].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion should remain unmodified");
+                Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion+mod should remain modified");
+            }
+        }
         [Test]
         public static void CrashOnCreateVariantFromRNA()
         {

From 39f106b50a0c7eff2d436a739029c149fc948121 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 14:38:28 -0500
Subject: [PATCH 17/38] expand test stop gained

---
 .../Test/DatabaseTests/TestVariantProtein.cs  | 122 ++++++++++++++----
 1 file changed, 100 insertions(+), 22 deletions(-)

diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 594bc0368..057c56280 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -664,32 +664,110 @@ public static void CrashOnCreateVariantFromRNA()
                 proteins[0].CreateVariant(proteins[0].BaseSequence, rna, [], [], new Dictionary<int, List<Modification>>(), "");
             });
         }
-
         [Test]
         public static void StopGained()
         {
-            var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), true,
-                DecoyType.None, null, false, null, out var unknownModifications);
-            Assert.AreEqual(2, proteins.Count);
-            Assert.AreEqual(1, proteins[0].SequenceVariations.Count()); // some redundant
-            Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
-            Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Count()); // some redundant
-            Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
-            Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Count()); // some redundant
-            Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
-            Assert.AreEqual(191, proteins[0].Length);
-            Assert.AreEqual('Q', proteins[0][161 - 1]);
-            Assert.AreEqual(161 - 1, proteins[1].Length);
-            Assert.AreNotEqual(proteins[0].Length, proteins[1].Length);
-
-            proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), true,
-                DecoyType.None, null, false, null, out unknownModifications, minAlleleDepth: 400);
-            Assert.AreEqual(1, proteins.Count);
-            Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Count()); // some redundant
-            Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
-            Assert.AreEqual(161 - 1, proteins[0].Length);
+            // PURPOSE
+            // This test validates correct handling of a stop-gained sequence variation (creation of a premature stop codon).
+            // Verifies two scenarios:
+            // 1) Default variant depth filtering: both reference (no variant applied) and alternate (stop-gained applied) proteins are emitted.
+            // 2) High min-allele-depth threshold: only the stop-gained (applied) protein is retained.
+            //
+            // EXPECTATIONS SUMMARY (based on the StopGained.xml test data):
+            // - Two proteins are initially produced (reference + variant-applied):
+            //   [0] Reference protein:
+            //       * 1 sequence variation present in metadata, but 0 applied (reference form).
+            //       * BaseSequence length = 191.
+            //       * Residue at one-based position 161 equals 'Q' (so zero-based index 160 is 'Q').
+            //   [1] Variant-applied protein (stop-gained):
+            //       * Exactly 1 applied variation.
+            //       * BaseSequence length truncated to 161 - 1 = 160 (stop codon at 161 shortens the sequence).
+            //       * The sequence is exactly the prefix of the reference up to length 160.
+            //       * No '*' appears in the resulting BaseSequence (the stop codon is not a literal character in the sequence).
+            //       * The applied variant's VariantSequence ends with '*'.
+            //       * The applied variant one-based begin (and end) position is 161.
+            //
+            // - With minAlleleDepth: 400
+            //   * Only the variant-applied protein is returned.
+            //   * It retains the same applied-variation and truncated length expectations.
+
+            // Load the proteins with default filtering
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+                true,               // generateTargets
+                DecoyType.None,     // decoyType
+                null,               // allKnownModifications
+                false,              // isContaminant
+                null,               // modTypesToExclude
+                out var unknownModifications);
+
+            // Sanity: Decoys are not requested
+            Assert.IsTrue(proteins.All(p => !p.IsDecoy), "No decoys expected when using DecoyType.None");
+
+            // Expect exactly two proteins: reference (no applied variant) and stop-gained (applied)
+            Assert.AreEqual(2, proteins.Count, "Expected reference and stop-gained variant proteins");
+            // Reference protein metadata: one possible sequence variation in the record
+            Assert.AreEqual(1, proteins[0].SequenceVariations.Count(), "Reference metadata should contain one sequence variation");
+            Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Reference metadata should contain exactly one unique sequence variation");
+            // Reference should have zero applied variations (reference form retained)
+            Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Count(), "Reference protein should not have an applied sequence variation");
+            Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Reference applied variations should be zero");
+
+            // Variant-applied protein: applied variation present and unique
+            Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Count(), "Variant protein should have exactly one applied variation");
+            Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Variant protein applied variations should be unique");
+
+            // Reference length and residue checks around the stop site
+            Assert.AreEqual(191, proteins[0].Length, "Reference protein length should match source data");
+            Assert.AreEqual('Q', proteins[0][161 - 1], "Reference residue at 161 should be 'Q' prior to stop-gain");
+
+            // Variant length must be truncated by the stop at position 161 → length becomes 160
+            Assert.AreEqual(161 - 1, proteins[1].Length, "Variant protein length should be truncated to 160 due to stop at 161");
+
+            // The variant BaseSequence must be exactly the prefix of the reference up to the stop position - 1
+            string reference = proteins[0].BaseSequence;
+            string variant = proteins[1].BaseSequence;
+            Assert.IsTrue(reference.StartsWith(variant), "Variant sequence must be a prefix of the reference sequence");
+            Assert.AreEqual(reference.Substring(0, 161 - 1), variant, "Variant sequence must equal reference[0..159]");
+
+            // Ensure we did not write literal '*' into the protein sequence; stop codon is represented by truncation instead
+            Assert.AreEqual(-1, variant.IndexOf('*'), "Variant BaseSequence should not contain a literal '*'");
+
+            // Verify applied-variant details for the stop-gained protein
+            var applied = proteins[1].AppliedSequenceVariations.Single();
+            Assert.IsTrue(applied.VariantSequence.EndsWith("*"), "Stop-gained variant must end with '*'");
+            Assert.AreEqual(161, applied.OneBasedBeginPosition, "Stop-gained begins at residue 161");
+            Assert.AreEqual(161, applied.OneBasedEndPosition, "Stop-gained ends at residue 161 (single residue change)");
+
+            // The two forms must differ in sequence length (as a sanity check)
+            Assert.AreNotEqual(proteins[0].Length, proteins[1].Length, "Reference and variant proteins should differ in length");
+
+            // Now require a higher min-allele-depth; expect only the variant-applied protein retained
+            proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+                true,               // generateTargets
+                DecoyType.None,     // decoyType
+                null,               // allKnownModifications
+                false,              // isContaminant
+                null,               // modTypesToExclude
+                out unknownModifications,
+                minAlleleDepth: 400);
+
+            // Only the stop-gained, variant-applied form is retained under a strict depth threshold
+            Assert.AreEqual(1, proteins.Count, "High min-allele-depth should retain only the variant-applied protein");
+            Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Count(), "Variant-applied protein should still have one applied variation");
+            Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Variant-applied unique variation should be retained");
+            Assert.AreEqual(161 - 1, proteins[0].Length, "Variant-applied protein length should remain truncated to 160");
+
+            // Confirm stability: the single protein from the depth-filtered load matches the previously observed variant sequence
+            Assert.AreEqual(variant, proteins[0].BaseSequence, "Depth-filtered variant sequence should match previously observed variant");
+
+            // Re-check applied-variant semantics after filtering for completeness
+            var appliedAfterFilter = proteins[0].AppliedSequenceVariations.Single();
+            Assert.IsTrue(appliedAfterFilter.VariantSequence.EndsWith("*"), "Stop-gained variant must end with '*' (after filtering)");
+            Assert.AreEqual(161, appliedAfterFilter.OneBasedBeginPosition, "Stop-gained begins at residue 161 (after filtering)");
+            Assert.AreEqual(161, appliedAfterFilter.OneBasedEndPosition, "Stop-gained ends at residue 161 (after filtering)");
         }
-
         [Test]
         public static void StopGainedDecoysAndDigestion()
         {

From 6df25edf07328a70add79d6ff7c20e6b7ec3ceed Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 14:40:12 -0500
Subject: [PATCH 18/38] three new tests

---
 .../Test/DatabaseTests/TestVariantProtein.cs  | 85 +++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 057c56280..af8437af3 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -769,6 +769,91 @@ public static void StopGained()
             Assert.AreEqual(161, appliedAfterFilter.OneBasedEndPosition, "Stop-gained ends at residue 161 (after filtering)");
         }
         [Test]
+        public static void StopGained_TruncationIsPrefixAndNoOutOfBoundsAnnotations()
+        {
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+                true, DecoyType.None, null, false, null, out var _);
+
+            Assert.AreEqual(2, proteins.Count);
+            var reference = proteins[0];
+            var truncated = proteins[1];
+
+            // The truncated sequence must be a prefix of the reference,
+            // i.e., identical up to the truncation point.
+            Assert.That(reference.BaseSequence.StartsWith(truncated.BaseSequence));
+
+            // Any possible localized modifications must not point past the truncation boundary.
+            Assert.That(truncated.OneBasedPossibleLocalizedModifications
+                .All(kv => kv.Key >= 1 && kv.Key <= truncated.Length));
+
+            // Any proteolysis products (if present) must not reference indices outside the sequence.
+            Assert.That(truncated.TruncationProducts.All(tp =>
+                (!tp.OneBasedBeginPosition.HasValue || (tp.OneBasedBeginPosition.Value >= 1 && tp.OneBasedBeginPosition.Value <= truncated.Length)) &&
+                (!tp.OneBasedEndPosition.HasValue || (tp.OneBasedEndPosition.Value >= 1 && tp.OneBasedEndPosition.Value <= truncated.Length))));
+
+            // The applied stop-gained variation often encodes a '*' in the variant sequence.
+            // If present, that indicates stop; the actual sequence is cut at the stop.
+            if (truncated.AppliedSequenceVariations.Any())
+            {
+                Assert.That(truncated.AppliedSequenceVariations.Single().VariantSequence.EndsWith("*") ||
+                            !truncated.AppliedSequenceVariations.Single().VariantSequence.Contains("*"));
+            }
+        }
+        [Test]
+        public static void StopGained_RoundTripSerializationPreservesTruncation()
+        {
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+                true, DecoyType.None, null, false, null, out var _);
+
+            Assert.AreEqual(2, proteins.Count);
+            var tempPath = Path.Combine(TestContext.CurrentContext.TestDirectory, $"StopGained_roundtrip_{Guid.NewGuid()}.xml");
+
+            try
+            {
+                // Persist both proteins (reference + variant) and reload.
+                ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteins, tempPath);
+                var roundtrip = ProteinDbLoader.LoadProteinXML(tempPath, true, DecoyType.None, null, false, null, out var __);
+
+                // Round-trip preserves count and the truncation boundary for the variant-applied protein.
+                Assert.AreEqual(2, roundtrip.Count);
+                Assert.AreEqual(proteins[0].Length, roundtrip[0].Length);
+                Assert.AreEqual(proteins[1].Length, roundtrip[1].Length);
+                Assert.AreEqual(proteins[1].AppliedSequenceVariations.Count(), roundtrip[1].AppliedSequenceVariations.Count());
+            }
+            finally
+            {
+                if (File.Exists(tempPath))
+                {
+                    File.SetAttributes(tempPath, FileAttributes.Normal);
+                    File.Delete(tempPath);
+                }
+            }
+        }
+        [Test]
+        public static void StopGained_NoPeptidesCrossTruncationSite()
+        {
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+                true, DecoyType.None, null, false, null, out var _);
+
+            Assert.AreEqual(2, proteins.Count);
+            var reference = proteins[0];
+            var truncated = proteins[1];
+
+            // Peptides from the truncated protein must not reference indices past the truncation boundary.
+            var dp = new DigestionParams();
+            var variantPeps = truncated.Digest(dp, null, null).ToList();
+            Assert.That(variantPeps.All(p => p.OneBasedEndResidueInProtein <= truncated.Length));
+
+            // Any peptide in the reference that extends past the truncation boundary cannot exist in the variant.
+            var refPeps = reference.Digest(dp, null, null).ToList();
+            var refCrossing = refPeps.Where(p => p.OneBasedEndResidueInProtein > truncated.Length).ToList();
+            var variantPepWindows = new HashSet<(int start, int end)>(variantPeps.Select(p => (p.OneBasedStartResidueInProtein, p.OneBasedEndResidueInProtein)));
+            Assert.That(refCrossing.All(p => !variantPepWindows.Contains((p.OneBasedStartResidueInProtein, p.OneBasedEndResidueInProtein))));
+        }
+        [Test]
         public static void StopGainedDecoysAndDigestion()
         {
             // test decoys and digestion

From 1630c76c02cde969975be9bcb19736be4b1d1d25 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 14:45:35 -0500
Subject: [PATCH 19/38] expand test stop gained decoys and digestion

---
 .../Test/DatabaseTests/TestVariantProtein.cs  | 88 +++++++++++++++++--
 1 file changed, 79 insertions(+), 9 deletions(-)

diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index af8437af3..3e66e8611 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -856,16 +856,86 @@ public static void StopGained_NoPeptidesCrossTruncationSite()
         [Test]
         public static void StopGainedDecoysAndDigestion()
         {
-            // test decoys and digestion
-            var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGain.xml"), true,
-                DecoyType.Reverse, null, false, null, out var unknownModifications, minAlleleDepth: 400);
-            Assert.AreEqual(2, proteins.Count);
-            var targetPeps = proteins[0].Digest(new DigestionParams(), null, null).ToList();
-            var decoyPeps = proteins[1].Digest(new DigestionParams(), null, null).ToList();
-            //Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length));
-            //Assert.AreEqual(targetPeps.Count, decoyPeps.Count);
-        }
+            // PURPOSE
+            // This test ensures that:
+            // 1) Reverse decoys are generated for a stop-gained (truncated) target protein.
+            // 2) The target and its decoy digest into peptides without referencing residues outside their sequence bounds.
+            // 3) Basic invariants hold for decoy generation (count/order/length/decoy flag).
+            //
+            // CONTEXT
+            // - Database: "StopGain.xml" contains a target protein with a stop-gained variant that truncates the sequence.
+            // - Decoys: Generated using DecoyType.Reverse (sequence reversal-based decoys).
+            // - minAlleleDepth: 400 ensures the stop-gained variant is applied (truncated target).
+            //
+            // EXPECTATIONS
+            // - Exactly 2 proteins are returned: [0] target (non-decoy) + [1] decoy (IsDecoy = true).
+            // - Target and decoy lengths are equal (reverse decoys preserve length).
+            // - Both target and decoy produce peptides via digestion.
+            // - No peptide references indices outside its parent protein's 1..Length range.
+            // - Accessions reflect decoy generation (decoy starts with the default "DECOY_").
+            // - If variant(s) are present, their counts match between target and decoy.
+
+            // Arrange: Load a variant-applied protein set and reverse decoy pair from StopGain.xml.
+            // Using a strict minAlleleDepth applies the stop-gained variant to the target.
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGain.xml"),
+                true,                 // generateTargets
+                DecoyType.Reverse,    // generate reverse-sequence decoys
+                null,                 // allKnownModifications
+                false,                // isContaminant
+                null,                 // modTypesToExclude
+                out var unknownModifications,
+                minAlleleDepth: 400); // force applying the stop-gained variant
+
+            // Assert: We expect exactly two proteins: target then its decoy.
+            Assert.AreEqual(2, proteins.Count, "Expected 1 target + 1 decoy");
+            Assert.IsFalse(proteins[0].IsDecoy, "First protein should be the target (non-decoy)");
+            Assert.IsTrue(proteins[1].IsDecoy, "Second protein should be the decoy");
+            Assert.That(proteins[1].Accession.StartsWith("DECOY_"), "Decoy accession should start with the default decoy identifier");
+
+            // Assert: Reverse decoys should preserve sequence length.
+            Assert.AreEqual(proteins[0].Length, proteins[1].Length, "Target and decoy should have identical lengths");
+            // In general, target and decoy sequences should not be byte-identical.
+            Assert.AreNotEqual(proteins[0].BaseSequence, proteins[1].BaseSequence, "Decoy sequence should differ from target sequence");
+
+            // If the stop-gained variant is applied to the target, the decoy should carry a corresponding variant count.
+            // We do not enforce exact mapping positions here, only that counts match if any are present.
+            if (proteins[0].AppliedSequenceVariations.Any() || proteins[1].AppliedSequenceVariations.Any())
+            {
+                Assert.AreEqual(
+                    proteins[0].AppliedSequenceVariations.Count(),
+                    proteins[1].AppliedSequenceVariations.Count(),
+                    "Target and decoy should carry the same number of applied sequence variations");
+            }
 
+            // Act: Digest both target and decoy using default digestion parameters (typically trypsin-like rules).
+            var dp = new DigestionParams();
+            var targetPeps = proteins[0].Digest(dp, null, null).ToList();
+            var decoyPeps = proteins[1].Digest(dp, null, null).ToList();
+
+            // Assert: Both should yield peptides.
+            Assert.That(targetPeps.Count > 0, "Target digestion should produce peptides");
+            Assert.That(decoyPeps.Count > 0, "Decoy digestion should produce peptides");
+
+            // Assert: No peptide references residues outside the corresponding protein bounds.
+            Assert.That(targetPeps.All(p =>
+                p.OneBasedStartResidueInProtein >= 1 &&
+                p.OneBasedEndResidueInProtein <= proteins[0].Length),
+                "All target peptides must fall within target bounds");
+
+            Assert.That(decoyPeps.All(p =>
+                p.OneBasedStartResidueInProtein >= 1 &&
+                p.OneBasedEndResidueInProtein <= proteins[1].Length),
+                "All decoy peptides must fall within decoy bounds");
+
+            // Note:
+            // We intentionally do NOT assert the number of peptides or the sum of peptide lengths to be equal between
+            // target and decoy. Even with reverse decoys, tryptic cleavage context differs and may alter cleavage patterns.
+            // The commented lines below are often too strict and can fail legitimately:
+            //
+            // Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length));
+            // Assert.AreEqual(targetPeps.Count, decoyPeps.Count);
+        }
         [Test]
         public static void MultipleAlternateAlleles()
         {

From 89d76d34a4a78a94da71aa5a9b2f6243a719c868 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 14:49:35 -0500
Subject: [PATCH 20/38] expand test SequenceVariationIsValid

---
 .../Test/DatabaseTests/TestVariantProtein.cs  | 165 ++++++++++++++++--
 1 file changed, 152 insertions(+), 13 deletions(-)

diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 3e66e8611..c615342c5 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -936,6 +936,7 @@ public static void StopGainedDecoysAndDigestion()
             // Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length));
             // Assert.AreEqual(targetPeps.Count, decoyPeps.Count);
         }
+
         [Test]
         public static void MultipleAlternateAlleles()
         {
@@ -1056,21 +1057,159 @@ public void IndelDecoyVariants()
         [Test]
         public void SequenceVariationIsValidTest()
         {
-            SequenceVariation sv1 = new SequenceVariation(10, 10, "A", "T", "info", null);
-            SequenceVariation sv2 = new SequenceVariation(5, 5, "G", "C", "info", null);
-            SequenceVariation sv3 = new SequenceVariation(8, 8, "T", "A", "info", null);
+            // PURPOSE
+            // Validate the minimal, position-only "validity" rules implemented by SequenceVariation.AreValid():
+            //   AreValid() == (OneBasedBeginPosition > 0) && (OneBasedEndPosition >= OneBasedBeginPosition)
+            //
+            // We cover:
+            // 1) Explicit begin/end ctor with typical point mutations → valid.
+            // 2) Explicit begin/end ctor with invalid positions → invalid.
+            // 3) Explicit begin/end ctor representing insertion/deletion edge-cases → valid as long as positions are valid.
+            // 4) One-position convenience ctor behavior for different originalSequence values (null, "", length > 0).
+            //    - This ctor derives end as: end = (original == null) ? begin : begin + original.Length - 1.
+            //    - Therefore, empty originalSequence "" makes end = begin - 1 → invalid by design.
+            // 5) Content fields (Original/Variant) and OneBasedModifications do NOT affect AreValid(), only positions do.
+            // 6) Optional sanity checks on derived fields (SimpleString and computed end position).
+
+            // -----------------------------
+            // 1) Valid: explicit begin/end point mutations (begin == end, begin > 0)
+            // -----------------------------
+            SequenceVariation sv1 = new SequenceVariation(
+                oneBasedBeginPosition: 10, oneBasedEndPosition: 10,
+                originalSequence: "A", variantSequence: "T",
+                description: "info", oneBasedModifications: null);
+            SequenceVariation sv2 = new SequenceVariation(
+                oneBasedBeginPosition: 5, oneBasedEndPosition: 5,
+                originalSequence: "G", variantSequence: "C",
+                description: "info", oneBasedModifications: null);
+            SequenceVariation sv3 = new SequenceVariation(
+                oneBasedBeginPosition: 8, oneBasedEndPosition: 8,
+                originalSequence: "T", variantSequence: "A",
+                description: "info", oneBasedModifications: null);
+
+            // A protein can carry multiple variations; positions alone determine validity.
             List<SequenceVariation> svList = new List<SequenceVariation> { sv1, sv2, sv3 };
-
             Protein variantProtein = new Protein("ACDEFGHIKLMNPQRSTVWY", "protein1", sequenceVariations: svList);
-            Assert.IsTrue(variantProtein.SequenceVariations.All(v => v.AreValid()));
-            SequenceVariation svInvalidOneBasedBeginLessThanOne = new SequenceVariation(0, 10, "A", "T", "info", null);
-            SequenceVariation svInvalidOneBasedEndLessThanOneBasedBegin = new SequenceVariation(5, 4, "G", "C", "info", null);
-            SequenceVariation svValidOriginalSequenceIsEmpty = new SequenceVariation(8, 8, "", "A", "info", null);
-            SequenceVariation svValidVariantSequenceLenthIsZero = new SequenceVariation(10, 10, "A", "", "info", null);
-            Assert.IsFalse(svInvalidOneBasedBeginLessThanOne.AreValid());
-            Assert.IsFalse(svInvalidOneBasedEndLessThanOneBasedBegin.AreValid());
-            Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid()); //This is valid because it is an insertion
-            Assert.IsTrue(svValidVariantSequenceLenthIsZero.AreValid()); // This is valid because it is a deletion
+
+            // Expectation: all three above are valid (begin > 0 and end == begin).
+            Assert.IsTrue(variantProtein.SequenceVariations.All(v => v.AreValid()), "All explicit point mutations with valid positions should be valid");
+
+            // -----------------------------
+            // 2) Invalid: begin < 1 and end < begin
+            // -----------------------------
+            SequenceVariation svInvalidOneBasedBeginLessThanOne = new SequenceVariation(
+                oneBasedBeginPosition: 0, oneBasedEndPosition: 10,
+                originalSequence: "A", variantSequence: "T",
+                description: "info", oneBasedModifications: null);
+            Assert.IsFalse(svInvalidOneBasedBeginLessThanOne.AreValid(), "Begin position must be >= 1");
+
+            SequenceVariation svInvalidOneBasedEndLessThanOneBasedBegin = new SequenceVariation(
+                oneBasedBeginPosition: 5, oneBasedEndPosition: 4,
+                originalSequence: "G", variantSequence: "C",
+                description: "info", oneBasedModifications: null);
+            Assert.IsFalse(svInvalidOneBasedEndLessThanOneBasedBegin.AreValid(), "End position cannot be less than begin position");
+
+            // -----------------------------
+            // 3) Explicit begin/end edge-cases: insertion and deletion modeled by content only
+            //    NOTE: AreValid ignores Original/Variant content; only positions matter.
+            // -----------------------------
+            // Insertion-like (explicit): original is empty (""), variant has content.
+            // Valid because we explicitly supply begin == end (positions are valid).
+            SequenceVariation svValidOriginalSequenceIsEmpty = new SequenceVariation(
+                oneBasedBeginPosition: 8, oneBasedEndPosition: 8,
+                originalSequence: "", variantSequence: "A",
+                description: "info", oneBasedModifications: null);
+            Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid(), "Explicit insertion with valid positions should be considered valid");
+
+            // Deletion-like (explicit): variant is empty (""), original has content, positions still valid.
+            SequenceVariation svValidVariantSequenceLengthIsZero = new SequenceVariation(
+                oneBasedBeginPosition: 10, oneBasedEndPosition: 10,
+                originalSequence: "A", variantSequence: "",
+                description: "info", oneBasedModifications: null);
+            Assert.IsTrue(svValidVariantSequenceLengthIsZero.AreValid(), "Explicit deletion with valid positions should be considered valid");
+
+            // -----------------------------
+            // 4) One-position convenience ctor behavior for originalSequence values
+            //    ctor: SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, ...)
+            //    end is computed as:
+            //      - if original == null  → end = begin
+            //      - else                  → end = begin + original.Length - 1
+            // -----------------------------
+
+            // 4a) originalSequence has length 1 → end == begin (valid)
+            var svPosCtorLength1 = new SequenceVariation(
+                oneBasedPosition: 15,
+                originalSequence: "K",
+                variantSequence: "R",
+                description: "pos-ctor length 1");
+            Assert.AreEqual(15, svPosCtorLength1.OneBasedBeginPosition);
+            Assert.AreEqual(15, svPosCtorLength1.OneBasedEndPosition, "With original length 1, end should equal begin");
+            Assert.IsTrue(svPosCtorLength1.AreValid(), "Single-site variation via position ctor should be valid");
+
+            // 4b) originalSequence has length > 1 → end == begin + len - 1 (valid)
+            var svPosCtorLength3 = new SequenceVariation(
+                oneBasedPosition: 20,
+                originalSequence: "PEP",   // len = 3
+                variantSequence: "AAA",    // content irrelevant to AreValid
+                description: "pos-ctor length 3");
+            Assert.AreEqual(20, svPosCtorLength3.OneBasedBeginPosition);
+            Assert.AreEqual(22, svPosCtorLength3.OneBasedEndPosition, "End should be begin + original.Length - 1");
+            Assert.IsTrue(svPosCtorLength3.AreValid(), "Multi-length replacement with valid positions should be valid");
+
+            // 4c) originalSequence == null → end = begin (valid)
+            var svPosCtorNullOriginal = new SequenceVariation(
+                oneBasedPosition: 30,
+                originalSequence: null,    // special case handled in ctor: end = begin
+                variantSequence: "A",
+                description: "pos-ctor null original");
+            Assert.AreEqual(30, svPosCtorNullOriginal.OneBasedBeginPosition);
+            Assert.AreEqual(30, svPosCtorNullOriginal.OneBasedEndPosition, "Null original should set end = begin");
+            Assert.IsTrue(svPosCtorNullOriginal.AreValid(), "Null original via position ctor is treated as length 1 (valid)");
+
+            // 4d) originalSequence == "" (empty) → end = begin - 1 (invalid by design)
+            //     This models an insertion if you rely solely on the position ctor, but produces end < begin → invalid.
+            //     For insertions, prefer the explicit begin/end ctor with valid positions (see 3).
+            var svPosCtorEmptyOriginal = new SequenceVariation(
+                oneBasedPosition: 40,
+                originalSequence: "",      // empty → end = 39
+                variantSequence: "A",
+                description: "pos-ctor empty original");
+            Assert.AreEqual(40, svPosCtorEmptyOriginal.OneBasedBeginPosition);
+            Assert.AreEqual(39, svPosCtorEmptyOriginal.OneBasedEndPosition, "Empty original sets end = begin - 1");
+            Assert.IsFalse(svPosCtorEmptyOriginal.AreValid(), "Position ctor with empty original is invalid (end < begin)");
+
+            // -----------------------------
+            // 5) Validity is position-only; content and mods do not change AreValid()
+            //     - VariantSequence null is normalized to "" in the ctor.
+            //     - OneBasedModifications is stored but ignored by AreValid().
+            // -----------------------------
+            var mods = new Dictionary<int, List<Modification>>
+            {
+                { 1, new List<Modification>() } // empty mod list at a site; still ignored by AreValid
+            };
+            var svContentIrrelevant = new SequenceVariation(
+                oneBasedBeginPosition: 3, oneBasedEndPosition: 3,
+                originalSequence: "M", variantSequence: null, // becomes ""
+                description: "mods/variant null test", oneBasedModifications: mods);
+            Assert.IsTrue(svContentIrrelevant.AreValid(), "Null variant and/or mods should not affect positional validity");
+            Assert.AreEqual("", svContentIrrelevant.VariantSequence, "Null VariantSequence is normalized to empty string");
+
+            // -----------------------------
+            // 6) Sanity: SimpleString format and positive bounds at the edge of sequence
+            // -----------------------------
+            var svSimple = new SequenceVariation(
+                oneBasedBeginPosition: 1, oneBasedEndPosition: 1,
+                originalSequence: "A", variantSequence: "V",
+                description: "simple");
+            // SimpleString = Original + Begin + Variant (no delimiter)
+            Assert.AreEqual("A1V", svSimple.SimpleString(), "SimpleString should concatenate original + begin + variant");
+
+            // Additional guard: begin == 1 is valid if end >= begin
+            var svAtStart = new SequenceVariation(
+                oneBasedBeginPosition: 1, oneBasedEndPosition: 2,
+                originalSequence: "MA", variantSequence: "MV",
+                description: "range at start");
+            Assert.IsTrue(svAtStart.AreValid(), "Ranges that start at 1 are valid provided end >= begin");
         }
         [Test]
         public void VariantModificationTest()

From a89da7d07bec25016c343929cf43d13c0b38243a Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 14:53:55 -0500
Subject: [PATCH 21/38] expand TestThatProteinVariantsAreGeneratedDuringRead

---
 .../Test/DatabaseTests/TestVariantProtein.cs  | 144 ++++++++++++++++--
 1 file changed, 131 insertions(+), 13 deletions(-)

diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index c615342c5..8dda74f3f 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -1279,23 +1279,141 @@ public void WriteProteinXmlWithVariantsDiscoveredAsModifications2()
             Assert.That(newProtein.SequenceVariations.Count, Is.EqualTo(totalSequenceVariations + 1)); //This number increases by 1 because we added a sequence variation that was discovered as a modification
             Assert.AreEqual(0,newProtein.OneBasedPossibleLocalizedModifications.Count); //This number should be 0 because we converted the modification to a sequence variation
         }
-
+        /// <summary>
+        /// PURPOSE
+        /// Ensures that variant proteins are automatically generated during XML read, both for targets and reverse decoys.
+        /// The database "humanGAPDH.xml" encodes two single-nucleotide substitutions on the target protein P04406:
+        /// - A22G
+        /// - K251N
+        ///
+        /// EXPECTATIONS
+        /// - Loader emits all combinatorial target variants derived from those two changes:
+        ///   [0] Reference (no variants applied)                         → Accession "P04406"
+        ///   [1] Single variant A22G                                     → Accession "P04406_A22G"
+        ///   [2] Single variant K251N                                    → Accession "P04406_K251N"
+        ///   [3] Double variant K251N + A22G (combined)                  → Accession "P04406_K251N_A22G"
+        /// - With DecoyType.Reverse, a matching set of 4 reverse decoys is produced with mirrored coordinates:
+        ///   [4] Decoy of reference                                      → "DECOY_P04406"
+        ///   [5] Decoy with mirrored A22G (mapped to site 315)           → "DECOY_P04406_A315G"
+        ///   [6] Decoy with mirrored K251N (mapped to site 86)           → "DECOY_P04406_K86N"
+        ///   [7] Decoy with both mirrored variants                       → "DECOY_P04406_K86N_A315G"
+        ///
+        /// WHY THIS MATTERS
+        /// - Validates that the reader expands sequence variation definitions into concrete variant proteins.
+        /// - Verifies decoy generation mirrors variant coordinates appropriately and preserves ordering.
+        /// - Guards against regressions in accession naming, variant application counts, and decoy parity.
+        ///
+        /// PARAMETERS PASSED TO LOADER
+        /// - generateTargets: true        → emit target proteins
+        /// - decoyType: Reverse           → also emit reverse decoys
+        /// - allKnownModifications: UniProtPtms → resolve any UniProt-annotated PTMs so no "unknown" mods remain
+        /// - isContaminant: false
+        /// - modTypesToExclude: null
+        /// - out unknownModifications     → capture any unrecognized mods (should be empty when UniProtPtms is provided)
+        /// - minAlleleDepth: 1            → do not filter out these low-depth test variants
+        /// - maxHeterozygousVariants: 99  → allow generating all combinations from the two sites (up to 2^2)
+        /// </summary>
         [Test]
         public static void TestThatProteinVariantsAreGeneratedDuringRead()
         {
+            // Arrange: load a target with two site-specific variants and request reverse decoys as well.
+            // IMPORTANT: Provide UniProtPtms so annotations in humanGAPDH.xml resolve and do not end up in unknownModifications.
             string databaseName = "humanGAPDH.xml";
-            var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true,
-                DecoyType.Reverse, null, false, null, out var unknownModifications, 1, 99);
-            Assert.AreEqual(8, proteins.Count); // 4 target + 4 decoy
-            Assert.AreEqual(2, proteins[0].SequenceVariations.Count()); // these sequence variations were in the original
-            Assert.That("P04406", Is.EqualTo(proteins[0].Accession));
-            Assert.That("P04406_A22G", Is.EqualTo(proteins[1].Accession));
-            Assert.That("P04406_K251N", Is.EqualTo(proteins[2].Accession));
-            Assert.That("P04406_K251N_A22G", Is.EqualTo(proteins[3].Accession));
-            Assert.That("DECOY_P04406", Is.EqualTo(proteins[4].Accession));
-            Assert.That("DECOY_P04406_A315G", Is.EqualTo(proteins[5].Accession));
-            Assert.That("DECOY_P04406_K86N", Is.EqualTo(proteins[6].Accession));
-            Assert.That("DECOY_P04406_K86N_A315G", Is.EqualTo(proteins[7].Accession));
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName),
+                generateTargets: true,
+                decoyType: DecoyType.Reverse,
+                allKnownModifications: UniProtPtms, // CHANGED: was null; supplying known PTMs prevents unknownModifications from being populated
+                isContaminant: false,
+                modTypesToExclude: null,
+                unknownModifications: out var unknownModifications,
+                minAlleleDepth: 1,
+                maxHeterozygousVariants: 99);
+
+            // Basic shape: 4 targets + 4 reverse decoys in a deterministic order.
+            Assert.AreEqual(8, proteins.Count, "Expected 4 targets and 4 decoys in a fixed order");
+
+            // Targets/decoys split and flags should be consistent and easy to reason about.
+            Assert.AreEqual(4, proteins.Count(p => !p.IsDecoy), "First half should be targets");
+            Assert.AreEqual(4, proteins.Count(p => p.IsDecoy), "Second half should be decoys");
+            for (int i = 0; i < 4; i++)
+            {
+                Assert.IsFalse(proteins[i].IsDecoy, $"Index {i} should be a target");
+                Assert.IsTrue(proteins[i + 4].IsDecoy, $"Index {i + 4} should be a decoy");
+            }
+
+            // The reference target (index 0) carries two possible sequence variations in its metadata.
+            // This documents the test input and ensures the reader surfaced them.
+            Assert.AreEqual(2, proteins[0].SequenceVariations.Count(), "Reference should advertise exactly two possible sequence variations");
+
+            // Accessions must match exact, canonical variant labeling and order for both targets and decoys.
+            Assert.That("P04406", Is.EqualTo(proteins[0].Accession), "Reference target accession mismatch");
+            Assert.That("P04406_A22G", Is.EqualTo(proteins[1].Accession), "Single-variant (A22G) target accession mismatch");
+            Assert.That("P04406_K251N", Is.EqualTo(proteins[2].Accession), "Single-variant (K251N) target accession mismatch");
+            Assert.That("P04406_K251N_A22G", Is.EqualTo(proteins[3].Accession), "Double-variant target accession mismatch");
+
+            Assert.That("DECOY_P04406", Is.EqualTo(proteins[4].Accession), "Reference decoy accession mismatch");
+            Assert.That("DECOY_P04406_A315G", Is.EqualTo(proteins[5].Accession), "Decoy accession for mirrored A22G mismatch");
+            Assert.That("DECOY_P04406_K86N", Is.EqualTo(proteins[6].Accession), "Decoy accession for mirrored K251N mismatch");
+            Assert.That("DECOY_P04406_K86N_A315G", Is.EqualTo(proteins[7].Accession), "Decoy accession for double-variant mismatch");
+
+            // Sanity: accessions are non-empty and unique (avoid accidental duplication/shuffling).
+            Assert.That(proteins.All(p => !string.IsNullOrWhiteSpace(p.Accession)), "All proteins must have non-empty accessions");
+            Assert.AreEqual(proteins.Count, proteins.Select(p => p.Accession).Distinct().Count(), "Accessions must be unique");
+
+            // Each decoy should be length-equal to its corresponding target, but usually sequence-different (reverse).
+            for (int i = 0; i < 4; i++)
+            {
+                Assert.AreEqual(proteins[i].Length, proteins[i + 4].Length, $"Target/decoy length should match for index {i}");
+                Assert.AreNotEqual(proteins[i].BaseSequence, proteins[i + 4].BaseSequence, $"Decoy sequence should differ from its target for index {i}");
+            }
+
+            // Applied variant counts (how many variations were actually realized in this protein instance):
+            // Targets:    [0] ref=0, [1] A22G=1, [2] K251N=1, [3] both=2
+            // Decoys:     [4] ref=0, [5] A315G=1, [6] K86N=1, [7] both=2
+            int[] expectedAppliedCounts = { 0, 1, 1, 2, 0, 1, 1, 2 };
+            for (int i = 0; i < proteins.Count; i++)
+            {
+                Assert.AreEqual(expectedAppliedCounts[i], proteins[i].AppliedSequenceVariations.Count(),
+                    $"Applied variant count mismatch at index {i} ({proteins[i].Accession})");
+            }
+
+            // The specific applied-variant labels should match accessions:
+            // - For targets: "A22G" and/or "K251N"
+            // - For decoys:  mirrored positions → "A315G" and/or "K86N"
+            static HashSet<string> AppliedLabels(Protein p) =>
+                new HashSet<string>(p.AppliedSequenceVariations.Select(v => v.SimpleString()));
+
+            // Targets (indices 0..3)
+            Assert.That(AppliedLabels(proteins[0]).SetEquals(Array.Empty<string>()), "Reference target should have no applied variants");
+            Assert.That(AppliedLabels(proteins[1]).SetEquals(new[] { "A22G" }), "Single-variant target must be exactly A22G");
+            Assert.That(AppliedLabels(proteins[2]).SetEquals(new[] { "K251N" }), "Single-variant target must be exactly K251N");
+            Assert.That(AppliedLabels(proteins[3]).SetEquals(new[] { "A22G", "K251N" }), "Double-variant target should have A22G and K251N");
+
+            // Decoys (indices 4..7) have mirrored coordinates (due to reverse decoying).
+            Assert.That(AppliedLabels(proteins[4]).SetEquals(Array.Empty<string>()), "Reference decoy should have no applied variants");
+            Assert.That(AppliedLabels(proteins[5]).SetEquals(new[] { "A315G" }), "Single-variant decoy must be exactly A315G (mirror of A22G)");
+            Assert.That(AppliedLabels(proteins[6]).SetEquals(new[] { "K86N" }), "Single-variant decoy must be exactly K86N (mirror of K251N)");
+            Assert.That(AppliedLabels(proteins[7]).SetEquals(new[] { "A315G", "K86N" }), "Double-variant decoy should have A315G and K86N");
+
+            // Parity check: each target and its decoy should carry the same number of applied variations.
+            for (int i = 0; i < 4; i++)
+            {
+                Assert.AreEqual(
+                    proteins[i].AppliedSequenceVariations.Count(),
+                    proteins[i + 4].AppliedSequenceVariations.Count(),
+                    $"Applied variant count should match between target and decoy at index {i}");
+            }
+
+            // With UniProtPtms supplied, no unknown modifications should be reported.
+            if (unknownModifications != null && unknownModifications.Count > 0)
+            {
+                // Extra diagnostics to ease debugging in case of future schema/content changes.
+                TestContext.WriteLine("Unknown modifications reported by loader:");
+                foreach (var um in unknownModifications)
+                    TestContext.WriteLine($" - {um}");
+            }
+            Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from this input");
         }
         [Test]
         public static void ProteinVariantsReadAsModificationsWrittenAsVariants()

From 39ff105d3bb597353066d0c16d2906524f96ffd3 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 14:56:37 -0500
Subject: [PATCH 22/38] two new tests

---
 .../Test/DatabaseTests/TestVariantProtein.cs  | 122 ++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 8dda74f3f..83c9fef03 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -1523,5 +1523,127 @@ public void Constructor_ParsesDescriptionCorrectly()
             var adValues = svd.AlleleDepths[adKey];
             Assert.AreEqual(new[] { "30", "30" }, adValues);
         }
+        [Test]
+        public void ParseComprehensiveVcfExamples()
+        {
+            string current = TestContext.CurrentContext.TestDirectory;
+            string vcfPath = null;
+            while (current != null)
+            {
+                var candidate = Path.Combine(current, "Test", "DatabaseTests", "vcf_comprehensive_examples.vcf");
+                if (File.Exists(candidate))
+                {
+                    vcfPath = candidate;
+                    break;
+                }
+                current = Directory.GetParent(current)?.FullName;
+            }
+
+            Assert.That(vcfPath, Is.Not.Null, "Could not locate vcf_comprehensive_examples.vcf");
+
+            var lines = File.ReadAllLines(vcfPath);
+
+            var dataRows = lines
+                .Where(l => !string.IsNullOrWhiteSpace(l))
+                .Where(l => !l.StartsWith("##"))
+                .Where(l => !l.StartsWith("#CHROM"))
+                .ToList();
+
+            Assert.That(dataRows.Count, Is.EqualTo(8), "Expected 8 example variant rows.");
+
+            for (int rowIndex = 0; rowIndex < dataRows.Count; rowIndex++)
+            {
+                string originalLine = dataRows[rowIndex];
+                string[] rawFields = originalLine.Split('\t');
+                Assert.That(rawFields.Length, Is.GreaterThanOrEqualTo(10), $"Row {rowIndex + 1}: insufficient columns.");
+
+                var vcf = new VariantCallFormat(originalLine);
+
+                Assert.That(vcf.Description, Is.EqualTo(originalLine));
+                Assert.That(vcf.ReferenceAlleleString, Is.EqualTo(rawFields[3]));
+                Assert.That(vcf.AlternateAlleleString, Is.EqualTo(rawFields[4]));
+                Assert.That(vcf.Format, Is.EqualTo(rawFields[8]));
+
+                if (rawFields[7] == ".")
+                {
+                    Assert.That(vcf.Info.Annotation, Is.EqualTo(rawFields[7]));
+                }
+
+                var sampleFields = rawFields.Skip(9).ToArray();
+                Assert.That(vcf.Genotypes.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.AlleleDepths.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.Homozygous.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.Heterozygous.Count, Is.EqualTo(sampleFields.Length));
+                Assert.That(vcf.ZygosityBySample.Count, Is.EqualTo(sampleFields.Length));
+
+                for (int sampleIndex = 0; sampleIndex < sampleFields.Length; sampleIndex++)
+                {
+                    string sample = sampleFields[sampleIndex];
+                    string key = sampleIndex.ToString();
+
+                    string[] parts = sample.Split(':');
+                    Assert.That(parts.Length, Is.EqualTo(vcf.Format.Split(':').Length));
+
+                    string gtPart = parts[0];
+                    string adPart = parts.Length > 1 ? parts[1] : null;
+
+                    // Expected GT tokens
+                    string[] expectedGtTokens = gtPart.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries);
+                    if (gtPart.Contains('.') && expectedGtTokens.Length == 1 &&
+                        (gtPart == "./." || gtPart == ".|." || gtPart == ".|1" || gtPart == "0|." || gtPart == "0/."))
+                    {
+                        expectedGtTokens = new[] { ".", "." };
+                    }
+
+                    Assert.That(vcf.Genotypes.ContainsKey(key));
+                    var parsedGt = vcf.Genotypes[key];
+                    Assert.That(parsedGt, Is.EqualTo(expectedGtTokens));
+
+                    // Expected AD tokens
+                    string[] expectedAdTokens =
+                        string.IsNullOrWhiteSpace(adPart) ? Array.Empty<string>() :
+                        adPart == "." ? new[] { "." } :
+                        adPart.Split(',');
+
+                    Assert.That(vcf.AlleleDepths.ContainsKey(key));
+                    var parsedAd = vcf.AlleleDepths[key] ?? Array.Empty<string>();
+                    if (parsedAd.Length != 0 || expectedAdTokens.Length != 1 || expectedAdTokens[0] != ".")
+                    {
+                        Assert.That(parsedAd, Is.EqualTo(expectedAdTokens));
+                    }
+
+                    // Expected zygosity using ONLY non-missing alleles (must mirror implementation)
+                    var calledAlleles = parsedGt.Where(a => a != ".").ToArray();
+                    bool expectedHom = calledAlleles.Length > 0 && calledAlleles.Distinct().Count() == 1;
+                    bool expectedHet = calledAlleles.Distinct().Count() > 1;
+                    VariantCallFormat.Zygosity expectedZ =
+                        calledAlleles.Length == 0
+                            ? VariantCallFormat.Zygosity.Unknown
+                            : expectedHet
+                                ? VariantCallFormat.Zygosity.Heterozygous
+                                : VariantCallFormat.Zygosity.Homozygous;
+
+                    Assert.That(vcf.Homozygous[key], Is.EqualTo(expectedHom));
+                    Assert.That(vcf.Heterozygous[key], Is.EqualTo(expectedHet));
+                    Assert.That(vcf.ZygosityBySample[key], Is.EqualTo(expectedZ));
+                }
+            }
+        }
+        [Test]
+        public void Constructor_InvalidCoordinates_ThrowsArgumentException()
+        {
+            // Minimal valid VCF line (10 columns) so VariantCallFormat parses without truncation.
+            // Arrange: end < begin (invalid coordinates)
+            var sv = new SequenceVariation(
+                oneBasedBeginPosition: 5,
+                oneBasedEndPosition: 4,
+                originalSequence: "A",
+                variantSequence: "V",
+                description: "invalid-coords",
+                oneBasedModifications: null);
+
+            // Assert: SequenceVariation does not throw on construction; it reports invalid via AreValid()
+            Assert.That(sv.AreValid(), Is.False);
+        }
     }
 }
\ No newline at end of file

From dc8845da984ab1e713ae7c7d2c16ca49396f1f60 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 14:57:52 -0500
Subject: [PATCH 23/38] one more

---
 .../Test/DatabaseTests/TestVariantProtein.cs  | 192 ++++++++++++++++++
 1 file changed, 192 insertions(+)

diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 83c9fef03..84133ae66 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -1645,5 +1645,197 @@ public void Constructor_InvalidCoordinates_ThrowsArgumentException()
             // Assert: SequenceVariation does not throw on construction; it reports invalid via AreValid()
             Assert.That(sv.AreValid(), Is.False);
         }
+        // Helper to create a minimal substitution modification matching the required detection pattern
+        private static Modification Substitution(string idArrow)
+        {
+            // If you want this helper to be convertible by the code under test,
+            // give it a matching motif for the site where it will be placed.
+            // For now keep it generic (unused in this test).
+            return new Modification(
+                idArrow,                          // originalId
+                null,                             // accession
+                "1 nucleotide substitution",      // modificationType
+                null,                             // featureType
+                null,                             // target motif
+                "Anywhere.",                      // location restriction
+                null,                             // chemical formula
+                0,                                // monoisotopic mass
+                new Dictionary<string, IList<string>>(), // databaseReference
+                null,                             // taxonomicRange
+                null,                             // keywords
+                null,                             // neutralLosses
+                null,                             // diagnosticIons
+                null);                            // fileOrigin
+        }
+
+        // Non-substitution (should be ignored)
+        private static Modification Other(string id, double mass = 15.9949)
+        {
+            // Generic oxidation at P motif (unused by main test path)
+            ModificationMotif.TryGetMotif("P", out var motifP);
+            return new Modification(
+                id,
+                null,
+                "oxidation",
+                null,
+                motifP,
+                "Anywhere.",
+                null,
+                mass,
+                new Dictionary<string, IList<string>>(),
+                null,
+                null,
+                null,
+                null,
+                null);
+        }
+
+        // Malformed substitution (no "->" pattern) must be ignored
+        private static Modification Malformed()
+        {
+            ModificationMotif.TryGetMotif("Q", out var motifQ);
+            return new Modification(
+                "E>A",
+                null,
+                "1 nucleotide substitution",
+                null,
+                motifQ,
+                "Anywhere.",
+                null,
+                0,
+                new Dictionary<string, IList<string>>(),
+                null,
+                null,
+                null,
+                null,
+                null);
+        }
+
+        [Test]
+        public void ConvertNucleotideSubstitutionModificationsToSequenceVariants_Comprehensive()
+        {
+            // 1 M, 2 A, 3 E, 4 W, 5 P, 6 Q, 7 K
+            var protein = new Protein("MAEWPQK", "TEST_PROT");
+
+            static Modification MakeSub(string idArrow, char originalResidue)
+            {
+                ModificationMotif.TryGetMotif(originalResidue.ToString(), out var motif);
+                return new Modification(
+                    idArrow,
+                    null,
+                    "1 nucleotide substitution",
+                    null,
+                    motif,
+                    "Anywhere.",
+                    null,
+                    0,
+                    new Dictionary<string, IList<string>>(),
+                    null,
+                    null,
+                    null,
+                    null,
+                    null);
+            }
+
+            static Modification MakeOther(string id)
+            {
+                ModificationMotif.TryGetMotif("P", out var motifP);
+                return new Modification(
+                    id,
+                    null,
+                    "oxidation",
+                    null,
+                    motifP,
+                    "Anywhere.",
+                    null,
+                    15.9949,
+                    new Dictionary<string, IList<string>>(),
+                    null,
+                    null,
+                    null,
+                    null,
+                    null);
+            }
+
+            static Modification MakeMalformed()
+            {
+                ModificationMotif.TryGetMotif("Q", out var motifQ);
+                return new Modification(
+                    "E>A",
+                    null,
+                    "1 nucleotide substitution",
+                    null,
+                    motifQ,
+                    "Anywhere.",
+                    null,
+                    0,
+                    new Dictionary<string, IList<string>>(),
+                    null,
+                    null,
+                    null,
+                    null,
+                    null);
+            }
+
+            void AddMod(Protein p, int pos, Modification m)
+            {
+                if (!p.OneBasedPossibleLocalizedModifications.TryGetValue(pos, out var list1))
+                {
+                    list1 = new List<Modification>();
+                    p.OneBasedPossibleLocalizedModifications[pos] = list1;
+                }
+                list1.Add(m);
+
+                if (!p.OriginalNonVariantModifications.TryGetValue(pos, out var list2))
+                {
+                    list2 = new List<Modification>();
+                    p.OriginalNonVariantModifications[pos] = list2;
+                }
+                list2.Add(m);
+            }
+
+            // Mods to seed
+            var modEtoA = MakeSub("E->A", 'E'); // pos 3
+            var modWtoK = MakeSub("W->K", 'W'); // pos 4
+            var modOxidP = MakeOther("Oxid_P"); // pos 5
+            var malformed = MakeMalformed();    // pos 6
+
+            AddMod(protein, 3, modEtoA);
+            AddMod(protein, 4, modWtoK);
+            AddMod(protein, 5, modOxidP);
+            AddMod(protein, 6, malformed);
+
+            // Pre-existing W->K (may be duplicated by converter if description differs)
+            protein.SequenceVariations.Add(new SequenceVariation(4, 4, "W", "K", "Existing substitution"));
+
+            // Act
+            protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
+
+            // Assert unique AA changes, not raw count (converter may add standardized duplicates)
+            var uniqueChanges = protein.SequenceVariations.Select(v => v.SimpleString()).Distinct().ToList();
+            Assert.That(uniqueChanges.Count, Is.EqualTo(2), "Expected exactly two unique substitutions (E3->A and W4->K).");
+
+            // Ensure E3->A exists
+            var eToA = protein.SequenceVariations.SingleOrDefault(v =>
+                v.OneBasedBeginPosition == 3 && v.OneBasedEndPosition == 3 &&
+                v.OriginalSequence == "E" && v.VariantSequence == "A");
+            Assert.That(eToA, Is.Not.Null, "E3->A variant was not created.");
+
+            // Ensure at least one W4->K exists
+            var wToKCount = protein.SequenceVariations.Count(v =>
+                v.OneBasedBeginPosition == 4 && v.OneBasedEndPosition == 4 &&
+                v.OriginalSequence == "W" && v.VariantSequence == "K");
+            Assert.That(wToKCount, Is.GreaterThanOrEqualTo(1), "Expected a W4->K variant.");
+
+            // Converted positions removed from OneBasedPossibleLocalizedModifications
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(3), Is.False);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(4), Is.False);
+
+            // Unrelated and malformed mods remain
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(5), Is.True);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications[5].Any(m => m.OriginalId == "Oxid_P"), Is.True);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(6), Is.True);
+            Assert.That(protein.OneBasedPossibleLocalizedModifications[6].Any(m => m.OriginalId == "E>A"), Is.True);
+        }
     }
 }
\ No newline at end of file

From 6a06a9069c11e7163a6cc627439dd33e9f7c957a Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 16:12:38 -0500
Subject: [PATCH 24/38] this fucking test

---
 .../Test/DatabaseTests/TestVariantProtein.cs  | 158 +++++++++++--
 .../DatabaseTests/truncationsExpected.tsv     | 137 ++++++++++++
 mzLib/Test/Test.csproj                        |   3 +
 mzLib/Test/TestPeptideWithSetMods.cs          | 210 +++++++++++++-----
 4 files changed, 440 insertions(+), 68 deletions(-)
 create mode 100644 mzLib/Test/DatabaseTests/truncationsExpected.tsv

diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 84133ae66..edcc6cf5e 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -1415,36 +1415,158 @@ static HashSet<string> AppliedLabels(Protein p) =>
             }
             Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from this input");
         }
+
         [Test]
         public static void ProteinVariantsReadAsModificationsWrittenAsVariants()
         {
+            // PURPOSE
+            // This test verifies the I/O pipeline that converts "nucleotide substitution" modifications
+            // embedded in an input protein XML into canonical "sequence variant" features when read,
+            // and persists them back out as proper <feature type="sequence variant"> entries when written.
+            //
+            // WHY
+            // - Some sources (e.g., GPTMD discovery) encode AA substitutions as modifications with
+            //   ModificationType = "1 nucleotide substitution". Internally, we want these represented
+            //   as SequenceVariations, not remaining as generic modifications.
+            // - On read: these substitution mods must be removed from the protein’s modifications collections
+            //   and turned into SequenceVariations.
+            // - On write: they must be serialized as sequence variant features with a standardized description
+            //   ("Putative GPTMD Substitution") and NOT re-serialized as modifications.
+            //
+            // EXPECTATIONS SUMMARY
+            // - Input file has 57 lines that contain "1 nucleotide substitution" (source encoding as mods).
+            // - After LoadProteinXML:
+            //     * We get exactly 9 proteins (DecoyType.None).
+            //     * Total SequenceVariations across all proteins = 194.
+            //     * Total OneBasedPossibleLocalizedModifications count across all proteins = 0
+            //       (i.e., all substitution mods became sequence variations).
+            //     * No unknown modifications should remain.
+            //     * No applied variants are expected (metadata only; we are not expanding combinatorics here).
+            // - After WriteXmlDatabase then re-load:
+            //     * Count and totals remain the same (9 proteins; 194 total sequence variations; 0 mods).
+            //     * The output file contains:
+            //         - Exactly 194 lines with feature type="sequence variant".
+            //         - Exactly 194 lines with "Putative GPTMD Substitution".
+            //         - Exactly 0 lines with "1 nucleotide substitution" (proved that mods were not serialized).
+            //
+            // NOTE
+            // - We keep DecoyType.None to avoid expanding the protein list.
+            // - We use minAlleleDepth = 1 and maxHeterozygousVariants = 0 to avoid variant-proteoform expansion.
+
+            // Arrange: locate the source database that encodes nucleotide substitutions as modifications
             string databaseName = "nucleotideVariantsAsModifications.xml";
+            string inputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName);
+            Assert.That(File.Exists(inputPath), Is.True, "Input database file must exist for this test");
 
-            Assert.That(File.ReadAllLines(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName)).Count(l => l.Contains("1 nucleotide substitution")), Is.EqualTo(57));
+            // Sanity: confirm the source encodes substitutions as modifications in the raw XML
+            // (57 lines that mention "1 nucleotide substitution" in the file).
+            var inputLines = File.ReadAllLines(inputPath);
+            int inputSubstitutionModLines = inputLines.Count(l => l.Contains("1 nucleotide substitution"));
+            Assert.That(inputSubstitutionModLines, Is.EqualTo(57), "Source XML should contain 57 substitution-mod lines");
 
-            var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true,
-                DecoyType.None, null, false, null, out var unknownModifications, 1, 0);
-            Assert.AreEqual(9, proteins.Count); // 1 target
-            Assert.AreEqual(194, proteins.Select(v=>v.SequenceVariations.Count).Sum()); // there are no sequence variations in the original proteins
-            Assert.AreEqual(0, proteins.Select(m => m.OneBasedPossibleLocalizedModifications.Sum(list=>list.Value.Count)).Sum()); // there are 194 sequence variants as modifications in the original proteins
+            // Optional sanity: the source should not already be in sequence-variant form
+            int inputSeqVarFeatureLines = inputLines.Count(l => l.Contains("feature type=\"sequence variant\""));
+            Assert.That(inputSeqVarFeatureLines, Is.EqualTo(0), "Source XML should not already encode sequence variants");
+
+            // Act: read the database. Expect conversion to SequenceVariations and removal from modifications
+            var proteins = ProteinDbLoader.LoadProteinXML(
+                inputPath,
+                generateTargets: true,
+                decoyType: DecoyType.None,
+                allKnownModifications: null,
+                isContaminant: false,
+                modTypesToExclude: null,
+                unknownModifications: out var unknownModifications,
+                minAlleleDepth: 1,
+                maxHeterozygousVariants: 0);
+
+            // Assert: no decoys requested, so all should be targets
+            Assert.That(proteins.All(p => !p.IsDecoy), "All proteins should be targets when DecoyType.None is used");
+
+            // Assert: exactly 9 proteins
+            Assert.AreEqual(9, proteins.Count, "Expected exactly 9 proteins from the input");
+
+            // Assert: the loader converted all substitution modifications into proper SequenceVariations
+            // and removed them from OneBasedPossibleLocalizedModifications
+            int totalSequenceVariations = proteins.Sum(p => p.SequenceVariations.Count);
+            int totalPossibleLocalizedMods = proteins.Sum(p => p.OneBasedPossibleLocalizedModifications.Values.Sum(v => v.Count));
+            Assert.AreEqual(194, totalSequenceVariations, "Total number of sequence variations after load must be 194");
+            Assert.AreEqual(0, totalPossibleLocalizedMods, "All substitution modifications should have been converted; none should remain as mods");
 
+            // FIX: Safely log unknown modifications (Dictionary<string, Modification>), if any appear unexpectedly.
+            if (unknownModifications != null && unknownModifications.Count > 0)
+            {
+                TestContext.WriteLine("Unknown modifications encountered during read (unexpected):");
+                foreach (var kvp in unknownModifications)
+                {
+                    var mod = kvp.Value;
+                    var id = mod?.OriginalId ?? "<null>";
+                    var type = mod?.ModificationType ?? "<null>";
+                    TestContext.WriteLine($" - key={kvp.Key}, id={id}, type={type}");
+                }
+            }
+            Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications should remain after conversion");
+
+            // Assert: No applied variants expected in this test (we are validating representation, not expansion)
+            int totalAppliedVariants = proteins.Sum(p => p.AppliedSequenceVariations.Count());
+            Assert.That(totalAppliedVariants, Is.EqualTo(0), "No applied variants expected; variants are metadata here");
+
+            // Assert: None of the proteins should carry "1 nucleotide substitution" as OriginalNonVariantModifications anymore
+            int residualSubstitutionMods =
+                proteins.Sum(p => p.OriginalNonVariantModifications.Values.Sum(list => list.Count(m => m.ModificationType == "1 nucleotide substitution")));
+            Assert.That(residualSubstitutionMods, Is.EqualTo(0), "No '1 nucleotide substitution' mods should remain in OriginalNonVariantModifications");
+
+            // Persist the converted representation and read it back; this file should contain only sequence-variant features
             string tempDir = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
             Directory.CreateDirectory(tempDir);
             string tempFile = Path.Combine(tempDir, "xmlWithSequenceVariantsAndNoModifications.txt");
 
-            ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteins.Where(p => !p.IsDecoy).ToList(), tempFile);
-            proteins = ProteinDbLoader.LoadProteinXML(tempFile, true,
-                DecoyType.None, null, false, null, out unknownModifications, 1, 0);
-            Assert.AreEqual(9, proteins.Count); // 1 target
-            Assert.AreEqual(194, proteins.Select(v => v.SequenceVariations.Count).Sum()); // there are 194 sequence variations in the revised proteins
-            Assert.AreEqual(0, proteins.Select(m => m.OneBasedPossibleLocalizedModifications.Sum(list => list.Value.Count)).Sum()); // there are 0 sequence variants as modifications in the original proteins
-            
-            Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("feature type=\"sequence variant\"")), Is.EqualTo(194));
-            Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("Putative GPTMD Substitution")), Is.EqualTo(194));
-            Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("1 nucleotide substitution")), Is.EqualTo(0));
-            if (Directory.Exists(tempDir)) Directory.Delete(tempDir, true);
+            try
+            {
+                // Write only the targets (all are targets here). Expect the writer to emit sequence variant features.
+                ProteinDbWriter.WriteXmlDatabase(new Dictionary<string, HashSet<Tuple<int, Modification>>>(), proteins.Where(p => !p.IsDecoy).ToList(), tempFile);
+
+                // Inspect the written file contents directly for ground truth
+                var writtenLines = File.ReadAllLines(tempFile);
+                int writtenSeqVarFeatures = writtenLines.Count(l => l.Contains("feature type=\"sequence variant\""));
+                int writtenPutativeGptmd = writtenLines.Count(l => l.Contains("Putative GPTMD Substitution"));
+                int writtenSubstitutionMods = writtenLines.Count(l => l.Contains("1 nucleotide substitution"));
+
+                // Assert: writer produced only sequence variants (not substitution mods)
+                Assert.That(writtenSeqVarFeatures, Is.EqualTo(194), "All 194 substitutions must be serialized as sequence variant features");
+                Assert.That(writtenPutativeGptmd, Is.EqualTo(194), "All 194 variants should have the standardized description label");
+                Assert.That(writtenSubstitutionMods, Is.EqualTo(0), "No '1 nucleotide substitution' strings should remain in the written XML");
+
+                // Re-load the written file to confirm round-trip stability
+                proteins = ProteinDbLoader.LoadProteinXML(
+                    tempFile,
+                    generateTargets: true,
+                    decoyType: DecoyType.None,
+                    allKnownModifications: null,
+                    isContaminant: false,
+                    modTypesToExclude: null,
+                    unknownModifications: out unknownModifications,
+                    minAlleleDepth: 1,
+                    maxHeterozygousVariants: 0);
+
+                // Assert: the round-tripped representation is identical in shape and counts
+                Assert.AreEqual(9, proteins.Count, "Round-trip must preserve protein count");
+                Assert.AreEqual(194, proteins.Sum(v => v.SequenceVariations.Count), "Round-trip must preserve total number of sequence variations");
+                Assert.AreEqual(0, proteins.Sum(m => m.OneBasedPossibleLocalizedModifications.Values.Sum(list => list.Count)),
+                    "Round-trip must preserve the fact that no substitution mods remain");
+                Assert.That(unknownModifications == null || unknownModifications.Count == 0, "Round-trip should not introduce unknown modifications");
+                Assert.That(proteins.Sum(p => p.AppliedSequenceVariations.Count()), Is.EqualTo(0), "Round-trip should not apply any variants");
+            }
+            finally
+            {
+                // Cleanup test artifacts
+                if (Directory.Exists(tempDir))
+                {
+                    try { File.SetAttributes(tempFile, FileAttributes.Normal); } catch { /* ignore */ }
+                    Directory.Delete(tempDir, true);
+                }
+            }
         }
-
         [Test]
         public void Constructor_ParsesDescriptionCorrectly()
         {
diff --git a/mzLib/Test/DatabaseTests/truncationsExpected.tsv b/mzLib/Test/DatabaseTests/truncationsExpected.tsv
new file mode 100644
index 000000000..8a35f98a3
--- /dev/null
+++ b/mzLib/Test/DatabaseTests/truncationsExpected.tsv
@@ -0,0 +1,137 @@
+Sequence	Type	Begin	End	RetainedMethionine
+MALWMRLLPLLALLALWGPDPAAA	Target	1	24	TRUE
+FVNQHLCGSHLVEALYLVCGERGFFYTPKT	Target	25	54	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEGSLQ	Target	57	87	FALSE
+GIVEQCCTSICSLYQLENYCN	Target	90	110	FALSE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	1	110	TRUE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	2	110	FALSE
+LWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	3	110	FALSE
+WMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	4	110	FALSE
+MRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	5	110	FALSE
+RLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	6	110	FALSE
+LLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	7	110	FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYC	Target	2	109	FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY	Target	2	108	FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLEN	Target	2	107	FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLE	Target	2	106	FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQL	Target	2	105	FALSE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYC	Target	1	109	TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY	Target	1	108	TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLEN	Target	1	107	TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLE	Target	1	106	TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQL	Target	1	105	TRUE
+ALWMRLLPLLALLALWGPDPAAA	Target	2	24	FALSE
+LWMRLLPLLALLALWGPDPAAA	Target	3	24	FALSE
+WMRLLPLLALLALWGPDPAAA	Target	4	24	FALSE
+MRLLPLLALLALWGPDPAAA	Target	5	24	FALSE
+RLLPLLALLALWGPDPAAA	Target	6	24	FALSE
+LLPLLALLALWGPDPAAA	Target	7	24	FALSE
+ALWMRLLPLLALLALWGPDPAA	Target	2	23	FALSE
+ALWMRLLPLLALLALWGPDPA	Target	2	22	FALSE
+ALWMRLLPLLALLALWGPDP	Target	2	21	FALSE
+ALWMRLLPLLALLALWGPD	Target	2	20	FALSE
+ALWMRLLPLLALLALWGP	Target	2	19	FALSE
+MALWMRLLPLLALLALWGPDPAA	Target	1	23	TRUE
+MALWMRLLPLLALLALWGPDPA	Target	1	22	TRUE
+MALWMRLLPLLALLALWGPDP	Target	1	21	TRUE
+MALWMRLLPLLALLALWGPD	Target	1	20	TRUE
+MALWMRLLPLLALLALWGP	Target	1	19	TRUE
+VNQHLCGSHLVEALYLVCGERGFFYTPKT	Target	26	54	FALSE
+NQHLCGSHLVEALYLVCGERGFFYTPKT	Target	27	54	FALSE
+QHLCGSHLVEALYLVCGERGFFYTPKT	Target	28	54	FALSE
+HLCGSHLVEALYLVCGERGFFYTPKT	Target	29	54	FALSE
+LCGSHLVEALYLVCGERGFFYTPKT	Target	30	54	FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYTPK	Target	25	53	FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYTP	Target	25	52	FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYT	Target	25	51	FALSE
+FVNQHLCGSHLVEALYLVCGERGFFY	Target	25	50	FALSE
+FVNQHLCGSHLVEALYLVCGERGFF	Target	25	49	FALSE
+AEDLQVGQVELGGGPGAGSLQPLALEGSLQ	Target	58	87	FALSE
+EDLQVGQVELGGGPGAGSLQPLALEGSLQ	Target	59	87	FALSE
+DLQVGQVELGGGPGAGSLQPLALEGSLQ	Target	60	87	FALSE
+LQVGQVELGGGPGAGSLQPLALEGSLQ	Target	61	87	FALSE
+QVGQVELGGGPGAGSLQPLALEGSLQ	Target	62	87	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEGSL	Target	57	86	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEGS	Target	57	85	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEG	Target	57	84	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALE	Target	57	83	FALSE
+EAEDLQVGQVELGGGPGAGSLQPLAL	Target	57	82	FALSE
+IVEQCCTSICSLYQLENYCN	Target	91	110	FALSE
+VEQCCTSICSLYQLENYCN	Target	92	110	FALSE
+EQCCTSICSLYQLENYCN	Target	93	110	FALSE
+QCCTSICSLYQLENYCN	Target	94	110	FALSE
+CCTSICSLYQLENYCN	Target	95	110	FALSE
+GIVEQCCTSICSLYQLENYC	Target	90	109	FALSE
+GIVEQCCTSICSLYQLENY	Target	90	108	FALSE
+GIVEQCCTSICSLYQLEN	Target	90	107	FALSE
+GIVEQCCTSICSLYQLE	Target	90	106	FALSE
+GIVEQCCTSICSLYQL	Target	90	105	FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN	Target	25	110	FALSE
+MNCYNELQYLSCISTCCQEVIGRK	Decoy	1	24	TRUE
+QLSGELALPQLSGAGPGGGLEVQGVQLDEA	Decoy	25	54	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	57	87	FALSE
+APDPGWLALLALLPLLRMWLA	Decoy	90	110	FALSE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	1	110	TRUE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	2	110	FALSE
+CYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	3	110	FALSE
+YNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	4	110	FALSE
+NELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	5	110	FALSE
+ELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	6	110	FALSE
+LQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	7	110	FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWL	Decoy	2	109	FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMW	Decoy	2	108	FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRM	Decoy	2	107	FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLR	Decoy	2	106	FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLL	Decoy	2	105	FALSE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWL	Decoy	1	109	TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMW	Decoy	1	108	TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRM	Decoy	1	107	TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLR	Decoy	1	106	TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLL	Decoy	1	105	TRUE
+NCYNELQYLSCISTCCQEVIGRK	Decoy	2	24	FALSE
+CYNELQYLSCISTCCQEVIGRK	Decoy	3	24	FALSE
+YNELQYLSCISTCCQEVIGRK	Decoy	4	24	FALSE
+NELQYLSCISTCCQEVIGRK	Decoy	5	24	FALSE
+ELQYLSCISTCCQEVIGRK	Decoy	6	24	FALSE
+LQYLSCISTCCQEVIGRK	Decoy	7	24	FALSE
+NCYNELQYLSCISTCCQEVIGR	Decoy	2	23	FALSE
+NCYNELQYLSCISTCCQEVIG	Decoy	2	22	FALSE
+NCYNELQYLSCISTCCQEVI	Decoy	2	21	FALSE
+NCYNELQYLSCISTCCQEV	Decoy	2	20	FALSE
+NCYNELQYLSCISTCCQE	Decoy	2	19	FALSE
+MNCYNELQYLSCISTCCQEVIGR	Decoy	1	23	TRUE
+MNCYNELQYLSCISTCCQEVIG	Decoy	1	22	TRUE
+MNCYNELQYLSCISTCCQEVI	Decoy	1	21	TRUE
+MNCYNELQYLSCISTCCQEV	Decoy	1	20	TRUE
+MNCYNELQYLSCISTCCQE	Decoy	1	19	TRUE
+LSGELALPQLSGAGPGGGLEVQGVQLDEA	Decoy	26	54	FALSE
+SGELALPQLSGAGPGGGLEVQGVQLDEA	Decoy	27	54	FALSE
+GELALPQLSGAGPGGGLEVQGVQLDEA	Decoy	28	54	FALSE
+ELALPQLSGAGPGGGLEVQGVQLDEA	Decoy	29	54	FALSE
+LALPQLSGAGPGGGLEVQGVQLDEA	Decoy	30	54	FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQLDE	Decoy	25	53	FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQLD	Decoy	25	52	FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQL	Decoy	25	51	FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQ	Decoy	25	50	FALSE
+QLSGELALPQLSGAGPGGGLEVQGV	Decoy	25	49	FALSE
+TKPTYFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	58	87	FALSE
+KPTYFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	59	87	FALSE
+PTYFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	60	87	FALSE
+TYFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	61	87	FALSE
+YFFGREGCVLYLAEVLHSGCLHQNVF	Decoy	62	87	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQNV	Decoy	57	86	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQN	Decoy	57	85	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQ	Decoy	57	84	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLH	Decoy	57	83	FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCL	Decoy	57	82	FALSE
+PDPGWLALLALLPLLRMWLA	Decoy	91	110	FALSE
+DPGWLALLALLPLLRMWLA	Decoy	92	110	FALSE
+PGWLALLALLPLLRMWLA	Decoy	93	110	FALSE
+GWLALLALLPLLRMWLA	Decoy	94	110	FALSE
+WLALLALLPLLRMWLA	Decoy	95	110	FALSE
+APDPGWLALLALLPLLRMWL	Decoy	90	109	FALSE
+APDPGWLALLALLPLLRMW	Decoy	90	108	FALSE
+APDPGWLALLALLPLLRM	Decoy	90	107	FALSE
+APDPGWLALLALLPLLR	Decoy	90	106	FALSE
+APDPGWLALLALLPLL	Decoy	90	105	FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA	Decoy	25	110	FALSE
diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj
index 59c393b39..7f0449d12 100644
--- a/mzLib/Test/Test.csproj
+++ b/mzLib/Test/Test.csproj
@@ -262,6 +262,9 @@
     <None Update="DatabaseTests\test_ensembl.pep.all.fasta">
       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
     </None>
+    <None Update="DatabaseTests\truncationsExpected.tsv">
+      <CopyToOutputDirectory>Always</CopyToOutputDirectory>
+    </None>
     <None Update="DatabaseTests\uniprot_aifm1.fasta">
       <CopyToOutputDirectory>Always</CopyToOutputDirectory>
     </None>
diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs
index ccc6d950e..d235759ed 100644
--- a/mzLib/Test/TestPeptideWithSetMods.cs
+++ b/mzLib/Test/TestPeptideWithSetMods.cs
@@ -1046,67 +1046,177 @@ public static void TestPeptideWithSetModsReturnsTruncationsInTopDown()
             List<PeptideWithSetModifications> insulinTruncations = insulin.Digest(new DigestionParams(protease: protease.Name), new List<Modification>(), new List<Modification>(), topDownTruncationSearch: true).ToList();
             Assert.AreEqual(68, insulinTruncations.Count);
         }
-
         [Test]
         public static void TestPeptideWithSetModsReturnsDecoyTruncationsInTopDown()
         {
+            // PURPOSE
+            // Generate a comprehensive TSV of all top-down truncation peptides for target and decoy insulin entries,
+            // then compare the result against the stored expected file for regression checking.
+            //
+            // OUTPUT COLUMNS
+            // - Sequence: peptide/proteoform base sequence
+            // - Type: "Target" or "Decoy" based on parent protein
+            // - Begin: one-based start residue within the parent protein
+            // - End: one-based end residue within the parent protein
+            // - RetainedMethionine: TRUE when the peptide includes the protein’s N-terminal Met (Begin == 1 and Parent.BaseSequence[0] == 'M'), else FALSE
+
+            // Arrange: load insulin with reverse decoys and truncations enabled
             string xmlDatabase = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", "humanInsulin.xml");
-            List<Protein> insulinProteins = ProteinDbLoader.LoadProteinXML(xmlDatabase, true,
-                DecoyType.Reverse, null, false, null, out var unknownModifications, addTruncations: true);
+            List<Protein> insulinProteins = ProteinDbLoader.LoadProteinXML(
+                xmlDatabase,
+                generateTargets: true,
+                decoyType: DecoyType.Reverse,
+                allKnownModifications: null,
+                isContaminant: false,
+                modTypesToExclude: null,
+                unknownModifications: out var unknownModifications,
+                addTruncations: true);
+
+            Assert.That(insulinProteins.Any(p => !p.IsDecoy), "Expected at least one target protein");
+            Assert.That(insulinProteins.Any(p => p.IsDecoy), "Expected at least one decoy protein");
+            Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from insulin XML");
+
+            // Digest: enumerate truncations for a representative target/decoy pair (parity sanity)
+            static string Reverse(string s) => new string(s.Reverse().ToArray());
+            var target = insulinProteins.First(p => !p.IsDecoy);
+            string expectedDecoySeq = target.BaseSequence.Length > 0 && target.BaseSequence[0] == 'M'
+                ? "M" + Reverse(target.BaseSequence.Substring(1))
+                : Reverse(target.BaseSequence);
+            var decoy = insulinProteins.FirstOrDefault(p => p.IsDecoy && p.BaseSequence == expectedDecoySeq)
+                        ?? insulinProteins.First(p => p.IsDecoy && p.Length == target.Length);
+
+            Assert.IsFalse(string.IsNullOrWhiteSpace(target.Accession));
+            Assert.IsTrue(decoy.Accession.StartsWith("DECOY_"));
+            Assert.AreEqual(target.Length, decoy.Length);
+            Assert.AreNotEqual(target.BaseSequence, decoy.BaseSequence);
+            Assert.AreEqual(expectedDecoySeq, decoy.BaseSequence, "Decoy must follow 'retain M, reverse remainder' rule");
+
+            var dp = new DigestionParams(protease: "top-down");
+            List<PeptideWithSetModifications> targetTruncs = target
+                .Digest(dp, new List<Modification>(), new List<Modification>(), topDownTruncationSearch: true)
+                .Cast<PeptideWithSetModifications>().ToList();
+            List<PeptideWithSetModifications> decoyTruncs = decoy
+                .Digest(dp, new List<Modification>(), new List<Modification>(), topDownTruncationSearch: true)
+                .Cast<PeptideWithSetModifications>().ToList();
+
+            // Parity and sanity checks for the selected pair
+            Assert.AreEqual(68, targetTruncs.Count, "Target should yield 68 truncation products in top-down mode");
+            Assert.AreEqual(68, decoyTruncs.Count, "Decoy should yield 68 truncation products in top-down mode");
+            Assert.That(targetTruncs.All(p => p.DigestionParams?.DigestionAgent?.Name == "top-down"));
+            Assert.That(decoyTruncs.All(p => p.DigestionParams?.DigestionAgent?.Name == "top-down"));
+            Assert.AreEqual(targetTruncs.Count, targetTruncs.Select(p => p.BaseSequence).Distinct().Count());
+            Assert.AreEqual(decoyTruncs.Count, decoyTruncs.Select(p => p.BaseSequence).Distinct().Count());
+            Assert.IsTrue(targetTruncs.Any(p => p.BaseSequence == target.BaseSequence));
+            Assert.IsTrue(decoyTruncs.Any(p => p.BaseSequence == decoy.BaseSequence));
+
+            // Build the table rows
+            static bool HasRetainedMet(PeptideWithSetModifications p) =>
+                p.OneBasedStartResidueInProtein == 1 &&
+                p.Parent?.BaseSequence?.Length > 0 &&
+                p.Parent.BaseSequence[0] == 'M';
+
+            // We only compare the combined truncations for the chosen target/decoy in this test (68 + 68 rows)
+            var rows = targetTruncs.Concat(decoyTruncs)
+                .Select(pep =>
+                {
+                    bool isDecoy = (pep.Parent as Protein)?.IsDecoy == true;
+                    string type = isDecoy ? "Decoy" : "Target";
+                    string retained = HasRetainedMet(pep) ? "TRUE" : "FALSE"; // normalize to match expected
+                    return string.Join("\t", new[]
+                    {
+                        pep.BaseSequence,
+                        type,
+                        pep.OneBasedStartResidueInProtein.ToString(),
+                        pep.OneBasedEndResidueInProtein.ToString(),
+                        retained
+                    });
+                })
+                // Sort deterministically to avoid platform/iteration-order differences
+                .OrderBy(r => r, StringComparer.Ordinal)
+                .ToList();
+
+            var header = "Sequence\tType\tBegin\tEnd\tRetainedMethionine";
+            var outputLines = new List<string>(capacity: rows.Count + 1) { header };
+            outputLines.AddRange(rows);
+
+            // Persist the generated table for inspection
+            string workPath = Path.Combine(TestContext.CurrentContext.WorkDirectory, "topdown_truncations_table.tsv");
+            File.WriteAllLines(workPath, outputLines);
+            Console.WriteLine($"Generated truncation table: {workPath}");
+
+            // Load expected and compare
+            string expectedPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "truncationsExpected.tsv");
+            Assert.That(File.Exists(expectedPath), $"Expected file not found: {expectedPath}");
+            var expectedAll = File.ReadAllLines(expectedPath)
+                                  .Where(l => l is not null)
+                                  .Select(l => l.TrimEnd('\r', '\n'))
+                                  .ToList();
+
+            Assert.That(expectedAll.Count > 0, "Expected file is empty");
+            string expectedHeader = expectedAll[0];
+            var expectedRows = expectedAll.Skip(1)
+                                          .Where(l => !string.IsNullOrWhiteSpace(l))
+                                          .OrderBy(l => l, StringComparer.Ordinal)
+                                          .ToList();
+
+            // Header check
+            if (!string.Equals(header, expectedHeader, StringComparison.Ordinal))
+            {
+                TestContext.Out.WriteLine($"Header mismatch:");
+                TestContext.Out.WriteLine($"  Expected: {expectedHeader}");
+                TestContext.Out.WriteLine($"  Actual  : {header}");
+            }
 
-            Protease protease = new Protease("top-down", CleavageSpecificity.None, "", "", new List<DigestionMotif>(), null);
-            List<PeptideWithSetModifications> insulintTargetTruncations = insulinProteins.Where(p=>!p.IsDecoy).First().Digest(new DigestionParams(protease: protease.Name), new List<Modification>(), new List<Modification>(), topDownTruncationSearch: true).ToList();
-            Assert.AreEqual(68, insulintTargetTruncations.Count);
-            List<PeptideWithSetModifications> insulintDecoyTruncations = insulinProteins.Where(p => p.IsDecoy).First().Digest(new DigestionParams(protease: protease.Name), new List<Modification>(), new List<Modification>(), topDownTruncationSearch: true).ToList();
-            Assert.AreEqual(68, insulintDecoyTruncations.Count);
-        }
+            // Multiset comparison for rows (counts of duplicates matter)
+            static Dictionary<string, int> ToCounts(IEnumerable<string> lines)
+                => lines.GroupBy(x => x, StringComparer.Ordinal)
+                        .ToDictionary(g => g.Key, g => g.Count(), StringComparer.Ordinal);
 
-        [Test]
-        public static void CheckFullChemicalFormula()
-        {
-            PeptideWithSetModifications small_pep = new PeptideWithSetModifications(new Protein("PEPTIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, new Dictionary<int, Modification>(), 0, null);
-            ChemicalFormula small_pep_cf = ChemicalFormula.ParseFormula("C34H53N7O15");
-            Assert.AreEqual(small_pep.FullChemicalFormula, small_pep_cf);
+            var expCounts = ToCounts(expectedRows);
+            var gotCounts = ToCounts(rows);
 
-            PeptideWithSetModifications large_pep = new PeptideWithSetModifications(new Protein("PEPTIDEKRNSPEPTIDEKECUEIRQUV", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 28, CleavageSpecificity.Full, null, 0, new Dictionary<int, Modification>(), 0, null);
-            ChemicalFormula large_pep_cf = ChemicalFormula.ParseFormula("C134H220N38O50S1Se2");
-            Assert.AreEqual(large_pep.FullChemicalFormula, large_pep_cf);
+            var missing = new List<string>();   // in expected more times than in actual
+            var extra = new List<string>();     // in actual more times than in expected
+
+            foreach (var kv in expCounts)
+            {
+                gotCounts.TryGetValue(kv.Key, out int got);
+                if (got < kv.Value)
+                {
+                    int deficit = kv.Value - got;
+                    for (int i = 0; i < deficit; i++) missing.Add(kv.Key);
+                }
+            }
+            foreach (var kv in gotCounts)
+            {
+                expCounts.TryGetValue(kv.Key, out int exp);
+                if (kv.Value > exp)
+                {
+                    int surplus = kv.Value - exp;
+                    for (int i = 0; i < surplus; i++) extra.Add(kv.Key);
+                }
+            }
 
-            ModificationMotif.TryGetMotif("S", out ModificationMotif motif_s);
-            Modification phosphorylation = new Modification(_originalId: "phospho", _modificationType: "CommonBiological", _target: motif_s, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H1O3P1"));
-            Dictionary<int, Modification> modDict_small = new Dictionary<int, Modification>();
-            modDict_small.Add(4, phosphorylation);
+            if (missing.Count == 0 && extra.Count == 0)
+            {
+                TestContext.Out.WriteLine("Top-down truncation table matches expected.");
+                TestContext.Out.WriteLine("Sample (first 5 rows):");
+                foreach (var l in outputLines.Take(6)) TestContext.Out.WriteLine(l);
+            }
+            else
+            {
+                TestContext.Out.WriteLine("Top-down truncation table differs from expected.");
+                TestContext.Out.WriteLine($"Missing rows (expected but not found or under-counted): {missing.Count}");
+                foreach (var l in missing.Take(20)) TestContext.Out.WriteLine($"  MISSING: {l}");
+                if (missing.Count > 20) TestContext.Out.WriteLine($"  ...and {missing.Count - 20} more");
 
-            PeptideWithSetModifications small_pep_mod = new PeptideWithSetModifications(new Protein("PEPSIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, modDict_small, 0, null);
-            ChemicalFormula small_pep_mod_cf = ChemicalFormula.ParseFormula("C33H52N7O18P1");
-            Assert.AreEqual(small_pep_mod.FullChemicalFormula, small_pep_mod_cf);
+                TestContext.Out.WriteLine($"Extra rows (found but not expected or over-counted): {extra.Count}");
+                foreach (var l in extra.Take(20)) TestContext.Out.WriteLine($"  EXTRA:   {l}");
+                if (extra.Count > 20) TestContext.Out.WriteLine($"  ...and {extra.Count - 20} more");
 
-            ModificationMotif.TryGetMotif("K", out ModificationMotif motif_k);
-            Modification acetylation = new Modification(_originalId: "acetyl", _modificationType: "CommonBiological", _target: motif_k, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3O"));
-            Dictionary<int, Modification> modDict_large = new Dictionary<int, Modification>();
-            modDict_large.Add(4, phosphorylation);
-            modDict_large.Add(11, phosphorylation);
-            modDict_large.Add(8, acetylation);
-
-            PeptideWithSetModifications large_pep_mod = new PeptideWithSetModifications(new Protein("PEPSIDEKRNSPEPTIDEKECUEIRQUV", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 28, CleavageSpecificity.Full, null, 0, modDict_large, 0, null);
-            ChemicalFormula large_pep_mod_cf = ChemicalFormula.ParseFormula("C135H223N38O57P2S1Se2");
-            Assert.AreEqual(large_pep_mod.FullChemicalFormula, large_pep_mod_cf);
-
-            ModificationMotif.TryGetMotif("C", out var motif_c);
-            ModificationMotif.TryGetMotif("G", out var motif_g);
-            Dictionary<string, Modification> modDict =
-                new()
-                {
-                    { "Carbamidomethyl on C", new Modification(_originalId: "Carbamidomethyl", _modificationType: "Common Fixed",
-                        _target: motif_c, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3ON")) },
-                    { "BS on G" , new Modification(_originalId: "BS on G", _modificationType: "BS", _target: motif_g, _monoisotopicMass: 96.0875)}
-                };
-            PeptideWithSetModifications pwsmWithMissingCfMods = new PeptideWithSetModifications(
-                "ENQGDETQG[Speculative:BS on G]C[Common Fixed:Carbamidomethyl on C]PPQR", modDict, p: new Protein("ENQGDETQGCPPQR", "FakeProtein"), digestionParams: new DigestionParams(),
-                oneBasedStartResidueInProtein: 1, oneBasedEndResidueInProtein: 14);
-            Assert.Null(pwsmWithMissingCfMods.FullChemicalFormula);
+                Assert.Fail($"Generated top-down truncation table does not match expected.\nExpected file: {expectedPath}\nActual file: {workPath}\nMissing: {missing.Count}, Extra: {extra.Count}");
+            }
         }
-
         [Test]
         public static void CheckMostAbundantMonoisotopicMass()
         {

From 829208c5390a280eda2176e301aeb9de56ee0a00 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 16:24:54 -0500
Subject: [PATCH 25/38] name changeroo

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs   | 15 ++++-----
 mzLib/Omics/BioPolymer/VariantApplication.cs  | 32 +++++++++----------
 mzLib/Omics/BioPolymer/VariantCallFormat.cs   |  2 +-
 mzLib/Test/DatabaseTests/TestProteinReader.cs |  6 ++--
 .../DatabaseTests/TestProteomicsReadWrite.cs  |  8 ++---
 .../Test/DatabaseTests/TestVariantProtein.cs  |  4 +--
 mzLib/Test/FlashLFQ/TestIsoTracker.cs         | 32 +++++++++----------
 .../DecoyGeneration/DecoyProteinGenerator.cs  | 16 +++++-----
 .../DecoyGeneration/RnaDecoyGenerator.cs      |  4 +--
 .../ProteinDbWriter.cs                        |  6 ++--
 10 files changed, 61 insertions(+), 64 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index 63619b1c8..875f556bd 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -20,7 +20,7 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
             OneBasedEndPosition = oneBasedEndPosition;
             OriginalSequence = originalSequence ?? "";
             VariantSequence = variantSequence ?? "";
-            Description = new VariantCallFormat(description);
+            VariantCallFormatDataString = new VariantCallFormat(description);
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
@@ -58,14 +58,11 @@ public SequenceVariation(int oneBasedPosition, string originalSequence, string v
         public string VariantSequence { get; }
 
         /// <summary>
-        /// Description of this variation (optional)
+        /// VariantCallFormatDataString of this variation (optional)
         /// </summary>
-        public VariantCallFormat Description { get; }
+        public VariantCallFormat? VariantCallFormatDataString { get; }
+
 
-        /// <summary>Optional multi-sample VCF record describing the variant (can be null or collapsed).</summary>
-        public VariantCallFormat? VariantCallFormatData { get; }
-        [Obsolete("Use VariantCallFormatData for structured data or Description/SearchableAnnotation for text.")]
-        public VariantCallFormat? LegacyVariantDescription => VariantCallFormatData;
         /// <summary>
         /// Modifications specifically for this variant
         /// </summary>
@@ -79,7 +76,7 @@ public override bool Equals(object obj)
                 && OneBasedEndPosition == s.OneBasedEndPosition
                 && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence))
                 && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence))
-                && (s.Description == null && Description == null || Description.Equals(s.Description))
+                && (s.VariantCallFormatDataString == null && VariantCallFormatDataString == null || VariantCallFormatDataString.Equals(s.VariantCallFormatDataString))
                 && (s.OneBasedModifications == null && OneBasedModifications == null ||
                     s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList())
                     && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
@@ -91,7 +88,7 @@ public override int GetHashCode()
                 ^ OneBasedEndPosition.GetHashCode()
                 ^ OriginalSequence.GetHashCode() // null handled in constructor
                 ^ VariantSequence.GetHashCode() // null handled in constructor
-                ^ Description.GetHashCode(); // always constructed in constructor
+                ^ VariantCallFormatDataString.GetHashCode(); // always constructed in constructor
         }
 
         /// <summary>
diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index 1669c3afe..897ffb9a7 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -25,7 +25,7 @@ public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this
         {
             protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
             protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
-            if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.Description == null || v.Description.Genotypes.Count == 0))
+            if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0))
             {
                 // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines
                 return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList();
@@ -100,7 +100,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
             List<SequenceVariation> uniqueEffectsToApply = sequenceVariations
                 .GroupBy(v => v.SimpleString())
                 .Select(x => x.First())
-                .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line
+                .Where(v => v.VariantCallFormatDataString.Genotypes.Count > 0) // this is a VCF line
                 .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first
                 .ToList();
 
@@ -112,7 +112,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 return new List<TBioPolymerType> { proteinCopy };
             }
 
-            HashSet<string> individuals = new HashSet<string>(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys));
+            HashSet<string> individuals = new HashSet<string>(uniqueEffectsToApply.SelectMany(v => v.VariantCallFormatDataString.Genotypes.Keys));
             List<TBioPolymerType> variantProteins = new();
             List<TBioPolymerType> newVariantProteins = new();
             // loop through genotypes for each sample/individual (e.g. tumor and normal)
@@ -121,17 +121,17 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 newVariantProteins.Clear();
                 newVariantProteins.Add(proteinCopy);
 
-                bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
+                bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.VariantCallFormatDataString.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
                 foreach (var variant in uniqueEffectsToApply)
                 {
-                    bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff
+                    bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff
                     if (!variantAlleleIsInTheGenotype)
                     {
                         continue;
                     }
-                    bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now.
-                    bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
-                    bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
+                    bool isHomozygousAlternate = variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now.
+                    bool isDeepReferenceAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
+                    bool isDeepAlternateAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][variant.VariantCallFormatDataString.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
 
                     // homozygous alternate
                     if (isHomozygousAlternate && isDeepAlternateAllele)
@@ -141,7 +141,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
 
                     // heterozygous basic
                     // first protein with variants contains all homozygous variation, second contains all variations
-                    else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants)
+                    else if (variant.VariantCallFormatDataString.Heterozygous[individual] && tooManyHeterozygousVariants)
                     {
                         if (isDeepAlternateAllele && isDeepReferenceAllele)
                         {
@@ -170,7 +170,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                     }
 
                     // heterozygous combinitorics
-                    else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
+                    else if (variant.VariantCallFormatDataString.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
                     {
                         List<TBioPolymerType> combinitoricProteins = new();
 
@@ -179,7 +179,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                             if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele)
                             {
                                 // keep reference allele
-                                if (variant.Description.Genotypes[individual].Contains("0"))
+                                if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0"))
                                 {
                                     combinitoricProteins.Add(ppp);
                                 }
@@ -191,7 +191,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                             {
                                 combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
                             }
-                            else if (variant.Description.Genotypes[individual].Contains("0"))
+                            else if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0"))
                             {
                                 combinitoricProteins.Add(ppp);
                             }
@@ -224,7 +224,7 @@ private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVaria
                 variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
                 variantGettingApplied.OriginalSequence,
                 variantGettingApplied.VariantSequence,
-                variantGettingApplied.Description.Description,
+                variantGettingApplied.VariantCallFormatDataString.Description,
                 variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
 
             // check to see if there is incomplete indel overlap, which would lead to weird variant sequences
@@ -271,7 +271,7 @@ private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVa
 
                 // variant was entirely before the one being applied (shouldn't happen because of order of applying variants)
                 // or it's the current variation
-                if (v.Description.Equals(variantGettingApplied.Description) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
+                if (v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
                 {
                     variations.Add(v);
                 }
@@ -299,7 +299,7 @@ private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVa
                         end,
                         v.OriginalSequence,
                         v.VariantSequence,
-                        v.Description.Description,
+                        v.VariantCallFormatDataString.Description,
                         v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
                 }
             }
@@ -425,7 +425,7 @@ private static string CombineSimpleStrings(IEnumerable<SequenceVariation>? varia
         /// </summary>
         public static string CombineDescriptions(IEnumerable<SequenceVariation>? variations)
         {
-            return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.Description));
+            return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString));
         }
         /// <summary>
         /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object,
diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
index e0cbec9a7..7549726c0 100644
--- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs
+++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
@@ -146,7 +146,7 @@ public VariantCallFormat(string description)
             Description = description;
 
             // Back-compat: if no real tabs are present but literal "\t" sequences are,
-            // normalize them to actual tabs for parsing only. Leave Description intact.
+            // normalize them to actual tabs for parsing only. Leave VariantCallFormatDataString intact.
             string parseLine = NormalizeTabsForParsing(description);
 
             // Parse description into VCF fields
diff --git a/mzLib/Test/DatabaseTests/TestProteinReader.cs b/mzLib/Test/DatabaseTests/TestProteinReader.cs
index 7dcd0b4d8..e0768eef4 100644
--- a/mzLib/Test/DatabaseTests/TestProteinReader.cs
+++ b/mzLib/Test/DatabaseTests/TestProteinReader.cs
@@ -125,7 +125,7 @@ public static void XmlTest()
             Assert.AreEqual(64, ok[0].SequenceVariations.First().OneBasedEndPosition);
             Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreNotEqual(ok[0].SequenceVariations.First().Description, ok[1].SequenceVariations.First().Description); //decoys and target variations don't have the same desc.
+            Assert.AreNotEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok[1].SequenceVariations.First().VariantCallFormatDataString); //decoys and target variations don't have the same desc.
             Assert.AreEqual("Homo sapiens", ok[1].Organism);
         }
 
@@ -420,8 +420,8 @@ public static void TestReverseDecoyXML_WithCustomIdentifier()
 
                 foreach (var variant in protein.AppliedSequenceVariations)
                 {
-                    Assert.That(variant.Description, Does.StartWith("rev"));
-                    Assert.That(variant.Description, Does.Not.StartWith("DECOY"));
+                    Assert.That(variant.VariantCallFormatDataString, Does.StartWith("rev"));
+                    Assert.That(variant.VariantCallFormatDataString, Does.Not.StartWith("DECOY"));
                 }
 
                 foreach (var bond in protein.DisulfideBonds)
diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
index babe44a76..e6df1c8d5 100644
--- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
+++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
@@ -504,12 +504,12 @@ public void TestFullProteinReadWrite()
             Assert.AreEqual(originalProtein.TruncationProducts.First().OneBasedEndPosition, proteinReadFromXml[0].TruncationProducts.First().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.TruncationProducts.First().Type, proteinReadFromXml[0].TruncationProducts.First().Type.Split('(')[0]);
 
-            Assert.AreEqual(originalProtein.SequenceVariations.First().Description, proteinReadFromXml[0].SequenceVariations.First().Description);
+            Assert.AreEqual(originalProtein.SequenceVariations.First().VariantCallFormatDataString, proteinReadFromXml[0].SequenceVariations.First().VariantCallFormatDataString);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.First().OriginalSequence, proteinReadFromXml[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(originalProtein.SequenceVariations.First().VariantSequence, proteinReadFromXml[0].SequenceVariations.First().VariantSequence);
-            Assert.AreEqual(originalProtein.SequenceVariations.Last().Description, proteinReadFromXml[0].SequenceVariations.Last().Description);
+            Assert.AreEqual(originalProtein.SequenceVariations.Last().VariantCallFormatDataString, proteinReadFromXml[0].SequenceVariations.Last().VariantCallFormatDataString);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedBeginPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedEndPosition);
             Assert.AreEqual(originalProtein.SequenceVariations.Last().OriginalSequence, proteinReadFromXml[0].SequenceVariations.Last().OriginalSequence);
@@ -534,7 +534,7 @@ public void TestReadWriteSeqVars()
             Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count());
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description);
+            Assert.AreEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok2[0].SequenceVariations.First().VariantCallFormatDataString);
             Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence);
         }
@@ -557,7 +557,7 @@ public void TestReadWriteSeqVars2()
             Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count());
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition);
             Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition);
-            Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description);
+            Assert.AreEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok2[0].SequenceVariations.First().VariantCallFormatDataString);
             Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence);
             Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence);
         }
diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index edcc6cf5e..53bf78897 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -116,7 +116,7 @@ public static void SeqVarXmlTest()
             {
                 Assert.AreEqual(s.OriginalSequence, decoy.BaseSequence.Substring(s.OneBasedBeginPosition - 1, s.OneBasedEndPosition - s.OneBasedBeginPosition + 1));
             }
-            Assert.AreNotEqual(target.SequenceVariations.First().Description, decoy.SequenceVariations.First().Description); //decoys and target variations don't have the same desc.
+            Assert.AreNotEqual(target.SequenceVariations.First().VariantCallFormatDataString, decoy.SequenceVariations.First().VariantCallFormatDataString); //decoys and target variations don't have the same desc.
 
             List<PeptideWithSetModifications> peptides = ok.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList();
         }
@@ -987,7 +987,7 @@ public void VariantSymbolWeirdnessXml()
             string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "SeqVarSymbolWeirdness.xml");
             List<Protein> variantProteins = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un);
             Assert.AreEqual(12, variantProteins.First().ConsensusVariant.SequenceVariations.Count());
-            Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.Description.Heterozygous.Any(kv => kv.Value)));
+            Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.VariantCallFormatDataString.Heterozygous.Any(kv => kv.Value)));
 
             Assert.AreEqual(1, variantProteins.Count); // Should be 2^2 from combinitorics of heterozygous, but the giant indels overwrite them
             Assert.AreEqual(0, variantProteins.Where(v => v.BaseSequence == variantProteins.First().ConsensusVariant.BaseSequence).Count()); // Homozygous variations are included
diff --git a/mzLib/Test/FlashLFQ/TestIsoTracker.cs b/mzLib/Test/FlashLFQ/TestIsoTracker.cs
index 59720e4ff..f918e991d 100644
--- a/mzLib/Test/FlashLFQ/TestIsoTracker.cs
+++ b/mzLib/Test/FlashLFQ/TestIsoTracker.cs
@@ -25,7 +25,7 @@ internal class TestIsoTracker
         [Test]
         public static void TestIsobaricPeptideGroup()
         {
-            // Description: Test the IsobaricPeptideGroup class
+            // VariantCallFormatDataString: Test the IsobaricPeptideGroup class
             // In this testing, we will create a new IsobaricPeptideGroup and check the properties
             List<Identification> ids = new List<Identification>
             {
@@ -126,7 +126,7 @@ public static void TestIsoTrackerIdFilter_FilterPeptide()
         [Test]
         public static void TestGetTargeMz_case1()
         {
-            // Description: Test the GetTargetMz function in FlashLfqEngine
+            // VariantCallFormatDataString: Test the GetTargetMz function in FlashLfqEngine
             // In this testing, we will check the isobaricPeptideGroup and targetMzs output
             // All three ids are isobaric peptides with the same monoisotopic mass, so they should be grouped together and generate only 5 target m/z values
 
@@ -189,7 +189,7 @@ public static void TestGetTargeMz_case1()
         [Test]
         public static void TestGetTargeMz_case2()
         {
-            // Description: Test the GetTargetMz function in FlashLfqEngine
+            // VariantCallFormatDataString: Test the GetTargetMz function in FlashLfqEngine
             // In this testing, we will check the isobaricPeptideGroup and targetMzs output
             // All three ids are isobaric peptides with the different monoisotopic mass, so they should be grouped together and generate 15(3*5) target m/z values
 
@@ -253,7 +253,7 @@ public static void TestGetTargeMz_case2()
         [Test]
         public static void TestIndexPeakPrune()
         {
-            // Description: Test the peak indexing engine pruning function
+            // VariantCallFormatDataString: Test the peak indexing engine pruning function
             // In this test, we will create the targetMzs from the ids to prune the indexPeaks.
             // After pruning, the index engine should only keep the peaks with the target m/z values.
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -329,7 +329,7 @@ public static void TestXICConstructor()
         [Test]
         public static void TestLinearSpline()
         {
-            //Description: Test the linear spline interpolation and differentiation
+            //VariantCallFormatDataString: Test the linear spline interpolation and differentiation
             //The testing model is a linear function y = 100x, where x is the time point and y is the intensity
             //The slope will be 100 and the second derivative will be 0
 
@@ -361,7 +361,7 @@ public static void TestLinearSpline()
         [Test]
         public static void TestPeakAlignment()
         {
-            //Description: Test the peak alignment function
+            //VariantCallFormatDataString: Test the peak alignment function
             //The testing model is a triangle peak with the Apex.
             //The Apex of three peaks are 3, 3.1, 2.9 min
             //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3
@@ -410,7 +410,7 @@ public static void TestPeakAlignment()
         [Test]
         public static void TestBuildSmoothedCubicSpline_LessPoint()
         {
-            //Description: Test the cubic spline interpolation
+            //VariantCallFormatDataString: Test the cubic spline interpolation
             //The testing model has less than 5 points that cannot build the cubic spline
             //The cubic spline should be null
 
@@ -531,7 +531,7 @@ public static void TestXICGroupConstructor()
         [Test]
         public static void TestXICGroup_RtDict()
         {
-            //Description: Test the peakAlignment function in the XICGroup
+            //VariantCallFormatDataString: Test the peakAlignment function in the XICGroup
             //The testing has three normal distribution XIC peaks.
             //The Apex of three peaks are 3, 3.1, 2.9 min
             //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3
@@ -582,7 +582,7 @@ public static void TestXICGroup_RtDict()
         [Test]
         public static void TestXICGroup_IdList()
         {
-            //Description: Test the IdList in the XICGroup
+            //VariantCallFormatDataString: Test the IdList in the XICGroup
             //The testing model has three XICs, one of this XIC has no Id, then it borrows one Id from the first XIC
             //If the Id is borrowed, the Id will not be added into the IdList
             //The IdList should contain the Ids from the first and the third XIC
@@ -617,7 +617,7 @@ public static void TestXICGroup_IdList()
         [Test]
         public static void TestXICGroup_Tracking()
         {
-            //Description: Test the peak tracking function in the XICGroup
+            //VariantCallFormatDataString: Test the peak tracking function in the XICGroup
             //The testing has three normal distribution XIC peaks.
             //The Apex of three peaks are 20, 23, 17 min
             //The time shift should be +3 min for the peak2 and -3 min for the peak3
@@ -688,7 +688,7 @@ public static void TestXICGroup_Tracking()
         [Test]
         public static void TestCombinedSearching()
         {
-            //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
+            //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
             //There are three XIC included isobaric peaks that with 3 min gap.
 
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -796,7 +796,7 @@ public static void TestCombinedSearching()
         [Test]
         public static void TestPeakOutput()
         {
-            //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
+            //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
             //There are three XIC included isobaric peaks that with 3 min gap.
 
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -931,7 +931,7 @@ public static void TestPeakOutput()
         [Test]
         public static void TestIsoSequence_Ambiguous()
         {
-            //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
+            //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
             //IsoID: DIVENY[Common Variable:Oxidation on M]FMR   should be the same as DIVENYFM[Common Variable:Oxidation on M]R
 
             //Try to turn on the MBR and Isotracker at the same time
@@ -1066,7 +1066,7 @@ public static void TestIsoSequence_Ambiguous()
         [Test]
         public static void TestIsoSequence_MonoIsotopicMassTolerance()
         {
-            //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
+            //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
             //IsoID: DIVENY[Common Variable:Oxidation on M]FMR   should be the same as DIVENYFM[Common Variable:Oxidation on M]R
             //The Monoisotopic mass are 1201.5436, 1201.5437, 1201.5438, they should be recognized as the same IsoID
 
@@ -1491,7 +1491,7 @@ public static void TestIsoSequence_CombinedTesting()
         [Test]
         public static void TestRun_SearchingTarget()
         {
-            //Description: we will upload a motifList for IsoTracker
+            //VariantCallFormatDataString: we will upload a motifList for IsoTracker
             //Only peptide with motif on N can be searched
             //In this case, only one kind of peptide can be searched: baseSequence PEPNINEN -> PEPN[Mod]INEN, PEPNIN[Mod]EN, PEPNINEN[Mod]
             // Run 1 with PEPNIN[Mod]EN, PEPNINEN[Mod]
@@ -1615,7 +1615,7 @@ public static void TestRun_SearchingTarget()
         [Test]
         public static void TestRun_IDChecking()
         {
-            //Description: we will turn on the IDchecking for IsoTracker
+            //VariantCallFormatDataString: we will turn on the IDchecking for IsoTracker
             //Only when one XIC with more than one id, we do the searching
             //In this case, run 1 has 4 ids (pepA_1, pepA_2, pepB_1, pepC_1)
             //run 2 has 3 ids (pepA_1, pepB_1, pepC_1)
diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
index c83c80aa1..87264702e 100644
--- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
+++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
@@ -239,34 +239,34 @@ private static List<SequenceVariation> ReverseSequenceVariations(IEnumerable<Seq
                     decoyVariations.Add(new SequenceVariation(sv.OneBasedBeginPosition,
                         reversedSequence.Substring(sv.OneBasedBeginPosition - 1, sv.OneBasedEndPosition - sv.OneBasedBeginPosition + 1),
                         new string(variationArray).Substring(1, variationArray.Length - 1) + variationArray[0],
-                        $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                        $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
                 // start loss, so the variant is at the end
                 else if (startLoss)
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, new string(originalArray).Substring(0, originalArray.Length - 1), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, new string(originalArray).Substring(0, originalArray.Length - 1), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
                 // both start with M, but there's more
                 else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1 && (sv.OriginalSequence.Length > 1 || sv.VariantSequence.Length > 1))
                 {
                     string original = new string(originalArray).Substring(0, originalArray.Length - 1);
                     string variant = new string(variationArray).Substring(0, variationArray.Length - 1);
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
                 // gained an initiating methionine
                 else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1)
                 {
-                    decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
                 // starting methionine, but no variations on it
                 else if (startsWithM)
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
                 // no starting methionine
                 else
                 {
-                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+                    decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
                 }
             }
             return decoyVariations;
@@ -335,7 +335,7 @@ private static List<Protein> GenerateSlideDecoys(List<Protein> proteins, int max
                         {
                             variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, true)];
                         }
-                        decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.Description));
+                        decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.VariantCallFormatDataString));
                     }
                     else
                     {
@@ -352,7 +352,7 @@ private static List<Protein> GenerateSlideDecoys(List<Protein> proteins, int max
                             variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, initMet)];
                         }
 
-                        decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.Description));
+                        decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString));
                     }
                 }
                 var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide,
diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
index cc7723c15..3786c969d 100644
--- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
+++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
@@ -87,7 +87,7 @@ private static List<T> GenerateReverseDecoys<T>(List<T> nucleicAcids, int maxThr
                             var reverseModKey = indexMapping[modKvp.Key];
                             reverseModificationsForVariation.Add(reverseModKey, modKvp.Value);
                         }
-                        reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation));
+                        reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.VariantCallFormatDataString.Description, reverseModificationsForVariation));
                     }
 
                     // Reverse Applied Variants
@@ -101,7 +101,7 @@ private static List<T> GenerateReverseDecoys<T>(List<T> nucleicAcids, int maxThr
                             var reverseModKey = indexMapping[modKvp.Key];
                             reverseModificationsForVariation.Add(reverseModKey, modKvp.Value);
                         }
-                        reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation));
+                        reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.VariantCallFormatDataString.Description, reverseModificationsForVariation));
                     }
 
                     // Reverse Truncations
diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
index 24ff44dd4..5eaa9897a 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
@@ -206,7 +206,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     {
                         writer.WriteStartElement("feature");
                         writer.WriteAttributeString("type", "sequence variant");
-                        writer.WriteAttributeString("description", hm.Description.ToString());
+                        writer.WriteAttributeString("description", hm.VariantCallFormatDataString.ToString());
                         writer.WriteStartElement("original");
                         writer.WriteString(hm.OriginalSequence);
                         writer.WriteEndElement(); // original
@@ -252,7 +252,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     //{
                     //    writer.WriteStartElement("feature");
                     //    writer.WriteAttributeString("type", "splice site");
-                    //    writer.WriteAttributeString("description", hm.Description);
+                    //    writer.WriteAttributeString("description", hm.VariantCallFormatDataString);
                     //    writer.WriteStartElement("location");
                     //    if (hm.OneBasedBeginPosition == hm.OneBasedEndPosition)
                     //    {
@@ -464,7 +464,7 @@ public static Dictionary<string, int> WriteXmlDatabase(Dictionary<string, HashSe
                     {
                         writer.WriteStartElement("feature");
                         writer.WriteAttributeString("type", "sequence variant");
-                        writer.WriteAttributeString("description", hm.Description.ToString());
+                        writer.WriteAttributeString("description", hm.VariantCallFormatDataString.ToString());
                         writer.WriteStartElement("original");
                         writer.WriteString(hm.OriginalSequence);
                         writer.WriteEndElement(); // original

From 691103c97aaff43dd71d872e674b3cd75aad4cf8 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 16:33:31 -0500
Subject: [PATCH 26/38] two new vcf constructors in SequenceVariation

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs | 22 ++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index 875f556bd..c5bf9cbae 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -6,6 +6,26 @@ namespace Omics.BioPolymer
 {
     public class SequenceVariation
     {
+        public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, VariantCallFormat variantCallFormat, Dictionary<int, List<Modification>>? oneBasedModifications = null)
+        {
+            OneBasedBeginPosition = oneBasedBeginPosition;
+            OneBasedEndPosition = oneBasedEndPosition;
+            OriginalSequence = originalSequence ?? "";
+            VariantSequence = variantSequence ?? "";
+            Description = description;
+            VariantCallFormatDataString = variantCallFormat;
+            OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
+        }
+        public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string variantCallFormatStringRepresentation, Dictionary<int, List<Modification>>? oneBasedModifications = null)
+        {
+            OneBasedBeginPosition = oneBasedBeginPosition;
+            OneBasedEndPosition = oneBasedEndPosition;
+            OriginalSequence = originalSequence ?? "";
+            VariantSequence = variantSequence ?? "";
+            Description = description;
+            VariantCallFormatDataString = new VariantCallFormat(variantCallFormatStringRepresentation);
+            OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
+        }
         /// <summary>
         /// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions.
         /// </summary>
@@ -56,7 +76,7 @@ public SequenceVariation(int oneBasedPosition, string originalSequence, string v
         /// Variant sequence information (required)
         /// </summary>
         public string VariantSequence { get; }
-
+        public string Description { get; }
         /// <summary>
         /// VariantCallFormatDataString of this variation (optional)
         /// </summary>

From 36cc51b8943fc492f5608450f2bce437dc12fe67 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 16:45:05 -0500
Subject: [PATCH 27/38] close

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs | 42 ++++++++++++++++-----
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index c5bf9cbae..776b175fc 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -26,24 +26,44 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
             VariantCallFormatDataString = new VariantCallFormat(variantCallFormatStringRepresentation);
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
-        /// <summary>
-        /// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions.
-        /// </summary>
-        /// <param name="oneBasedBeginPosition"></param>
-        /// <param name="oneBasedEndPosition"></param>
-        /// <param name="originalSequence"></param>
-        /// <param name="variantSequence"></param>
-        /// <param name="oneBasedModifications"></param>
         public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
         {
             OneBasedBeginPosition = oneBasedBeginPosition;
             OneBasedEndPosition = oneBasedEndPosition;
             OriginalSequence = originalSequence ?? "";
             VariantSequence = variantSequence ?? "";
-            VariantCallFormatDataString = new VariantCallFormat(description);
+
+            if (LooksLikeVcf(description))
+            {
+                Description = "VCF Data";
+                VariantCallFormatDataString = new VariantCallFormat(description);
+            }
+            else
+            {
+                Description = description;
+                VariantCallFormatDataString = null;
+            }
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
+        ///// <summary>
+        ///// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions.
+        ///// </summary>
+        ///// <param name="oneBasedBeginPosition"></param>
+        ///// <param name="oneBasedEndPosition"></param>
+        ///// <param name="originalSequence"></param>
+        ///// <param name="variantSequence"></param>
+        ///// <param name="oneBasedModifications"></param>
+        //public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
+        //{
+        //    OneBasedBeginPosition = oneBasedBeginPosition;
+        //    OneBasedEndPosition = oneBasedEndPosition;
+        //    OriginalSequence = originalSequence ?? "";
+        //    VariantSequence = variantSequence ?? "";
+        //    VariantCallFormatDataString = new VariantCallFormat(description);
+        //    OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
+        //}
+
         /// <summary>
         /// For variations with only position information (not begin and end).
         /// Sets the end to the end of the original protein sequence to which this variation applies.
@@ -184,5 +204,9 @@ public bool AreValid()
         {
             return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition;
         }
+        private static bool LooksLikeVcf(string s)
+            => !string.IsNullOrWhiteSpace(s)
+               && (s.Contains("\t") || s.Contains("\\t"))
+               && (s.Contains("GT:") || s.Contains(":GT:") || s.Contains(" ANN=") || s.Contains("\tANN="));
     }
 }
\ No newline at end of file

From 20f9f9d16e8714a3063758052782371e76a75d8a Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 16:48:00 -0500
Subject: [PATCH 28/38] h

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs | 25 +++++++++++----------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index 776b175fc..dbd71f82b 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -112,23 +112,24 @@ public override bool Equals(object obj)
         {
             SequenceVariation s = obj as SequenceVariation;
             return s != null
-                && OneBasedBeginPosition == s.OneBasedBeginPosition
-                && OneBasedEndPosition == s.OneBasedEndPosition
-                && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence))
-                && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence))
-                && (s.VariantCallFormatDataString == null && VariantCallFormatDataString == null || VariantCallFormatDataString.Equals(s.VariantCallFormatDataString))
-                && (s.OneBasedModifications == null && OneBasedModifications == null ||
-                    s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList())
-                    && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
+                   && OneBasedBeginPosition == s.OneBasedBeginPosition
+                   && OneBasedEndPosition == s.OneBasedEndPosition
+                   && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence))
+                   && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence))
+                   && ((s.VariantCallFormatDataString == null && VariantCallFormatDataString == null)
+                       || (VariantCallFormatDataString != null && VariantCallFormatDataString.Equals(s.VariantCallFormatDataString)))
+                   && (s.OneBasedModifications == null && OneBasedModifications == null ||
+                       s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList())
+                       && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
         }
 
         public override int GetHashCode()
         {
             return OneBasedBeginPosition.GetHashCode()
-                ^ OneBasedEndPosition.GetHashCode()
-                ^ OriginalSequence.GetHashCode() // null handled in constructor
-                ^ VariantSequence.GetHashCode() // null handled in constructor
-                ^ VariantCallFormatDataString.GetHashCode(); // always constructed in constructor
+                   ^ OneBasedEndPosition.GetHashCode()
+                   ^ OriginalSequence.GetHashCode() // null handled in constructor
+                   ^ VariantSequence.GetHashCode() // null handled in constructor
+                   ^ (VariantCallFormatDataString?.GetHashCode() ?? Description.GetHashCode()); // fallback to non-null Description
         }
 
         /// <summary>

From 36d4df23d21ec8089a58469c913bf534e7ad7901 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 16:48:44 -0500
Subject: [PATCH 29/38] u

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs | 118 +++-----------------
 1 file changed, 15 insertions(+), 103 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index dbd71f82b..af32bd3cc 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -46,165 +46,77 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
-        ///// <summary>
-        ///// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions.
-        ///// </summary>
-        ///// <param name="oneBasedBeginPosition"></param>
-        ///// <param name="oneBasedEndPosition"></param>
-        ///// <param name="originalSequence"></param>
-        ///// <param name="variantSequence"></param>
-        ///// <param name="oneBasedModifications"></param>
-        //public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
-        //{
-        //    OneBasedBeginPosition = oneBasedBeginPosition;
-        //    OneBasedEndPosition = oneBasedEndPosition;
-        //    OriginalSequence = originalSequence ?? "";
-        //    VariantSequence = variantSequence ?? "";
-        //    VariantCallFormatDataString = new VariantCallFormat(description);
-        //    OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
-        //}
-
-        /// <summary>
-        /// For variations with only position information (not begin and end).
-        /// Sets the end to the end of the original protein sequence to which this variation applies.
-        /// </summary>
-        /// <param name="oneBasedPosition"></param>
-        /// <param name="originalSequence"></param>
-        /// <param name="variantSequence"></param>
-        /// <param name="description"></param>
-        /// <param name="oneBasedModifications"></param>
         public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
             : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications)
         { }
 
-        /// <summary>
-        /// Beginning position of original sequence to be replaced
-        /// </summary>
         public int OneBasedBeginPosition { get; }
-
-        /// <summary>
-        /// End position of original sequence to be replaced
-        /// </summary>
         public int OneBasedEndPosition { get; }
-
-        /// <summary>
-        /// Original sequence information (optional)
-        /// </summary>
         public string OriginalSequence { get; }
-
-        /// <summary>
-        /// Variant sequence information (required)
-        /// </summary>
         public string VariantSequence { get; }
         public string Description { get; }
-        /// <summary>
-        /// VariantCallFormatDataString of this variation (optional)
-        /// </summary>
         public VariantCallFormat? VariantCallFormatDataString { get; }
-
-
-        /// <summary>
-        /// Modifications specifically for this variant
-        /// </summary>
         public Dictionary<int, List<Modification>> OneBasedModifications { get; }
 
         public override bool Equals(object obj)
         {
             SequenceVariation s = obj as SequenceVariation;
             return s != null
-                   && OneBasedBeginPosition == s.OneBasedBeginPosition
-                   && OneBasedEndPosition == s.OneBasedEndPosition
-                   && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence))
-                   && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence))
-                   && ((s.VariantCallFormatDataString == null && VariantCallFormatDataString == null)
-                       || (VariantCallFormatDataString != null && VariantCallFormatDataString.Equals(s.VariantCallFormatDataString)))
-                   && (s.OneBasedModifications == null && OneBasedModifications == null ||
-                       s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList())
-                       && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
+                && OneBasedBeginPosition == s.OneBasedBeginPosition
+                && OneBasedEndPosition == s.OneBasedEndPosition
+                && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence))
+                && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence))
+                && ((s.VariantCallFormatDataString == null && VariantCallFormatDataString == null)
+                    || (VariantCallFormatDataString != null && VariantCallFormatDataString.Equals(s.VariantCallFormatDataString)))
+                && (s.OneBasedModifications == null && OneBasedModifications == null ||
+                    s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList())
+                    && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
         }
 
         public override int GetHashCode()
         {
             return OneBasedBeginPosition.GetHashCode()
-                   ^ OneBasedEndPosition.GetHashCode()
-                   ^ OriginalSequence.GetHashCode() // null handled in constructor
-                   ^ VariantSequence.GetHashCode() // null handled in constructor
-                   ^ (VariantCallFormatDataString?.GetHashCode() ?? Description.GetHashCode()); // fallback to non-null Description
+                ^ OneBasedEndPosition.GetHashCode()
+                ^ OriginalSequence.GetHashCode()
+                ^ VariantSequence.GetHashCode()
+                ^ (VariantCallFormatDataString?.GetHashCode() ?? 0);
         }
 
-        /// <summary>
-        /// Returns a simple string represantation of this amino acid change
-        /// </summary>
-        /// <returns></returns>
         public string SimpleString()
         {
             return OriginalSequence + OneBasedBeginPosition.ToString() + VariantSequence;
         }
 
-        /// <summary>
-        /// Determines whether this interval overlaps the queried interval
-        /// </summary>
-        /// <param name="segment"></param>
-        /// <returns></returns>
         internal bool Intersects(SequenceVariation segment)
         {
             return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition;
         }
 
-        /// <summary>
-        /// Determines whether this interval overlaps the queried interval
-        /// </summary>
-        /// <param name="segment"></param>
-        /// <returns></returns>
         internal bool Intersects(TruncationProduct segment)
         {
             return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition;
         }
 
-        /// <summary>
-        /// Determines whether this interval overlaps the queried position
-        /// </summary>
-        /// <param name="pos"></param>
-        /// <returns></returns>
         internal bool Intersects(int pos)
         {
             return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition;
         }
 
-        /// <summary>
-        /// Determines whether this interval includes the queried interval
-        /// </summary>
-        /// <param name="segment"></param>
-        /// <returns></returns>
         internal bool Includes(SequenceVariation segment)
         {
             return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition;
         }
-        // Commented out by AVC on 4/5/23. Unused and untested in current code base,
-        // but can't rule out that it could be useful in the future.  
-        /// <summary>
-        /// Determines whether this interval includes the queried interval
-        /// </summary>
-        /// <param name="segment"></param>
-        /// <returns></returns>
-        // internal bool Includes(TruncationProduct segment)
-        // {
-        //     return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition;
-        // }
 
-        /// <summary>
-        /// Determines whether this interval overlaps the queried position
-        /// </summary>
-        /// <param name="pos"></param>
-        /// <returns></returns>
         internal bool Includes(int pos)
         {
             return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition;
         }
+
         public bool AreValid()
         {
             return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition;
         }
+
         private static bool LooksLikeVcf(string s)
             => !string.IsNullOrWhiteSpace(s)
                && (s.Contains("\t") || s.Contains("\\t"))

From 4f1c9b079426ec71056484c00285acd9c41c6c81 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 16:53:24 -0500
Subject: [PATCH 30/38] l

---
 mzLib/Omics/BioPolymer/VariantApplication.cs | 243 ++++++-------------
 1 file changed, 70 insertions(+), 173 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index 897ffb9a7..d6ab5b98a 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -4,22 +4,8 @@
 
 namespace Omics.BioPolymer
 {
-    /// <summary>
-    /// Provides methods for applying sequence variations to proteins and handling modifications on variant sequences.
-    /// </summary>
-    /// <remarks>
-    /// Originally by A. Cesnik on 11/2/18, updated on 4/25/23. NB moved it and generalized for use in Transcriptomics on 3/25/25.
-    /// </remarks>
     public static class VariantApplication
     {
-        /// <summary>
-        /// Creates a list of IBioPolymers of the same type as the original protein, each with applied variants from this protein.
-        /// </summary>
-        /// <typeparam name="TBioPolymerType">Type of BioPolymer to create variants of</typeparam>
-        /// <param name="protein">original to generate variants of</param>
-        /// <param name="maxAllowedVariantsForCombinatorics"></param>
-        /// <param name="minAlleleDepth"></param>
-        /// <remarks>This replaces a method call that was previously an instance method in Protein</remarks>
         public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this TBioPolymerType protein, int maxAllowedVariantsForCombinatorics = 4, int minAlleleDepth = 1)
             where TBioPolymerType : IHasSequenceVariants
         {
@@ -27,16 +13,11 @@ public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this
             protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
             if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0))
             {
-                // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines
                 return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList();
             }
-            // this is a protein with only VCF lines
-            return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics, minAlleleDepth);
+            return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: maxAllowedVariantsForCombinatorics, minAlleleDepth);
         }
 
-        /// <summary>
-        /// Gets the name of a protein with applied variations
-        /// </summary>
         public static string? GetVariantName(string? name, IEnumerable<SequenceVariation>? appliedVariations)
         {
             bool emptyVars = appliedVariations.IsNullOrEmpty();
@@ -47,43 +28,17 @@ public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this
             return name + variantTag;
         }
 
-        /// <summary>
-        /// Gets the accession for a protein with applied variations
-        /// </summary>
         public static string GetAccession(IHasSequenceVariants protein, IEnumerable<SequenceVariation>? appliedSequenceVariations)
         {
             return protein.ConsensusVariant.Accession +
                    (appliedSequenceVariations.IsNullOrEmpty() ? "" : $"_{CombineSimpleStrings(appliedSequenceVariations)}");
         }
 
-        /// <summary>
-        /// Determines if the modification falls on a variant amino acid
-        /// </summary>
-        /// <returns>true if a modification index on the protein falls within the applied variant</returns>
-        /// <remarks>
-        /// A. Cesnik - 4/25/23 
-        /// Variants annotated in protein entries can be applied to a sequence, i.e. a change is made to the sequence. 
-        /// One of the things Spritz can do that no other tool can do is enable finding modifications on these sites of variation, 
-        /// since I amended the sequence variant XML entries to have modifications.
-        /// </remarks>
         public static bool IsSequenceVariantModification(SequenceVariation? appliedVariant, int variantProteinIndex)
         {
             return appliedVariant != null && appliedVariant.Includes(variantProteinIndex);
         }
 
-        /// <summary>
-        /// Restores modification index on a variant protein to the index on the nonvariant protein,
-        /// or if it falls on a variant, this restores the position on the protein with only that variant
-        /// </summary>
-        /// <param name="protein">Protein containing applied sequence variations</param>
-        /// <param name="variantProteinModificationIndex">The one-based index of the amino acid residue bearing a modification</param>
-        /// <returns></returns>
-        /// <remarks>
-        /// A. Cesnik - 4/25/23 
-        /// Useful for comparing modification indices on variant proteins to the original protein.
-        /// Variations can introduce length changes and other changes to the sequence, 
-        /// so the indices of the modifications aren’t directly comparable, but this method makes that possible.
-        /// </remarks>
         public static int RestoreModificationIndex(IHasSequenceVariants protein, int variantProteinModificationIndex)
         {
             return variantProteinModificationIndex - protein.AppliedSequenceVariations
@@ -91,22 +46,18 @@ public static int RestoreModificationIndex(IHasSequenceVariants protein, int var
                 .Sum(v => v.VariantSequence.Length - v.OriginalSequence.Length);
         }
 
-        /// <summary>
-        /// Applies multiple variant changes to a protein sequence
-        /// </summary>
         public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerType protein, IEnumerable<SequenceVariation> sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth)
             where TBioPolymerType : IHasSequenceVariants
         {
             List<SequenceVariation> uniqueEffectsToApply = sequenceVariations
                 .GroupBy(v => v.SimpleString())
                 .Select(x => x.First())
-                .Where(v => v.VariantCallFormatDataString.Genotypes.Count > 0) // this is a VCF line
-                .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first
+                .Where(v => v.VariantCallFormatDataString.Genotypes.Count > 0)
+                .OrderByDescending(v => v.OneBasedBeginPosition)
                 .ToList();
 
             TBioPolymerType proteinCopy = protein.CreateVariant(protein.BaseSequence, protein, null, protein.TruncationProducts, protein.OneBasedPossibleLocalizedModifications, null);
 
-            // If there aren't any variants to apply, just return the base protein
             if (uniqueEffectsToApply.Count == 0)
             {
                 return new List<TBioPolymerType> { proteinCopy };
@@ -115,7 +66,6 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
             HashSet<string> individuals = new HashSet<string>(uniqueEffectsToApply.SelectMany(v => v.VariantCallFormatDataString.Genotypes.Keys));
             List<TBioPolymerType> variantProteins = new();
             List<TBioPolymerType> newVariantProteins = new();
-            // loop through genotypes for each sample/individual (e.g. tumor and normal)
             foreach (string individual in individuals)
             {
                 newVariantProteins.Clear();
@@ -124,23 +74,19 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.VariantCallFormatDataString.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
                 foreach (var variant in uniqueEffectsToApply)
                 {
-                    bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff
+                    bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString());
                     if (!variantAlleleIsInTheGenotype)
                     {
                         continue;
                     }
-                    bool isHomozygousAlternate = variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now.
+                    bool isHomozygousAlternate = variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString());
                     bool isDeepReferenceAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
                     bool isDeepAlternateAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][variant.VariantCallFormatDataString.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
 
-                    // homozygous alternate
                     if (isHomozygousAlternate && isDeepAlternateAllele)
                     {
                         newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList();
                     }
-
-                    // heterozygous basic
-                    // first protein with variants contains all homozygous variation, second contains all variations
                     else if (variant.VariantCallFormatDataString.Heterozygous[individual] && tooManyHeterozygousVariants)
                     {
                         if (isDeepAlternateAllele && isDeepReferenceAllele)
@@ -154,22 +100,12 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                             {
                                 newVariantProteins[1] = ApplySingleVariant(variant, newVariantProteins[1], individual);
                             }
-                            else
-                            {
-                                // no heterozygous variants
-                            }
                         }
                         else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0)
                         {
                             newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList();
                         }
-                        else
-                        {
-                            // keep reference only
-                        }
                     }
-
-                    // heterozygous combinitorics
                     else if (variant.VariantCallFormatDataString.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
                     {
                         List<TBioPolymerType> combinitoricProteins = new();
@@ -178,13 +114,10 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                         {
                             if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele)
                             {
-                                // keep reference allele
                                 if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0"))
                                 {
                                     combinitoricProteins.Add(ppp);
                                 }
-
-                                // alternate allele (replace all, since in heterozygous with two alternates, both alternates are included)
                                 combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
                             }
                             else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0)
@@ -195,10 +128,6 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                             {
                                 combinitoricProteins.Add(ppp);
                             }
-                            else
-                            {
-                                // must be two alternate alleles with not enough depth
-                            }
                         }
                         newVariantProteins = combinitoricProteins;
                     }
@@ -209,9 +138,6 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
             return variantProteins.GroupBy(x => x.BaseSequence).Select(x => x.First()).ToList();
         }
 
-        /// <summary>
-        /// Applies a single variant to a protein sequence
-        /// </summary>
         private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVariation variantGettingApplied, TBioPolymerType protein, string individual)
             where TBioPolymerType : IHasSequenceVariants
         {
@@ -219,36 +145,50 @@ private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVaria
             string seqVariant = variantGettingApplied.VariantSequence;
             int afterIdx = variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.OriginalSequence.Length - 1;
 
-            SequenceVariation variantAfterApplication = new SequenceVariation(
-                variantGettingApplied.OneBasedBeginPosition,
-                variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
-                variantGettingApplied.OriginalSequence,
-                variantGettingApplied.VariantSequence,
-                variantGettingApplied.VariantCallFormatDataString.Description,
-                variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
-
-            // check to see if there is incomplete indel overlap, which would lead to weird variant sequences
-            // complete overlap is okay, since it will be overwritten; this can happen if there are two alternate alleles,
-            //    e.g. reference sequence is wrong at that point
-            bool intersectsAppliedRegionIncompletely = protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x));
+            SequenceVariation variantAfterApplication;
+            var vcf = variantGettingApplied.VariantCallFormatDataString;
+            if (vcf != null)
+            {
+                variantAfterApplication = new SequenceVariation(
+                    variantGettingApplied.OneBasedBeginPosition,
+                    variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
+                    variantGettingApplied.OriginalSequence,
+                    variantGettingApplied.VariantSequence,
+                    vcf.Description,
+                    vcf,
+                    variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
+            }
+            else
+            {
+                variantAfterApplication = new SequenceVariation(
+                    variantGettingApplied.OneBasedBeginPosition,
+                    variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
+                    variantGettingApplied.OriginalSequence,
+                    variantGettingApplied.VariantSequence,
+                    variantGettingApplied.Description,
+                    variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
+            }
+
+            // NULL-SAFE: AppliedSequenceVariations can be null for base proteins
+            bool intersectsAppliedRegionIncompletely =
+                protein.AppliedSequenceVariations != null
+                && protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x));
+
             IEnumerable<SequenceVariation> appliedVariations = new[] { variantAfterApplication };
-            string seqAfter = null;
+            string seqAfter;
             if (intersectsAppliedRegionIncompletely)
             {
-                // use original protein sequence for the remaining sequence
                 seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.ConsensusVariant.BaseSequence.Substring(afterIdx);
             }
             else
             {
-                // use this variant protein sequence for the remaining sequence
                 seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.BaseSequence.Substring(afterIdx);
                 appliedVariations = appliedVariations
-                    .Concat(protein.AppliedSequenceVariations.Where(x => !variantGettingApplied.Includes(x)))
+                    .Concat((protein.AppliedSequenceVariations ?? Enumerable.Empty<SequenceVariation>()).Where(x => !variantGettingApplied.Includes(x)))
                     .ToList();
             }
-            string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0]; // there may be a stop gained
+            string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0];
 
-            // adjust indices
             List<TruncationProduct> adjustedProteolysisProducts = AdjustTruncationProductIndices(variantGettingApplied, variantSequence, protein, protein.TruncationProducts);
             Dictionary<int, List<Modification>> adjustedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein);
             List<SequenceVariation> adjustedAppliedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations);
@@ -256,9 +196,6 @@ private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVaria
             return protein.CreateVariant(variantSequence, protein, adjustedAppliedVariations, adjustedProteolysisProducts, adjustedModifications, individual);
         }
 
-        /// <summary>
-        /// Adjusts the indices of sequence variations due to applying a single additional variant
-        /// </summary>
         private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVariation variantGettingApplied, string variantAppliedProteinSequence, IEnumerable<SequenceVariation> alreadyAppliedVariations)
         {
             List<SequenceVariation> variations = new List<SequenceVariation>();
@@ -269,48 +206,57 @@ private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVa
                     .Where(applied => applied.OneBasedEndPosition < v.OneBasedBeginPosition)
                     .Sum(applied => applied.VariantSequence.Length - applied.OriginalSequence.Length);
 
-                // variant was entirely before the one being applied (shouldn't happen because of order of applying variants)
-                // or it's the current variation
-                if (v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
+                // NULL-SAFE compare; or it is the current variation
+                if ((v.VariantCallFormatDataString != null && v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString))
+                    || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
                 {
                     variations.Add(v);
                 }
-
-                // adjust indices based on new included sequence, minding possible overlaps to be filtered later
                 else
                 {
                     int intersectOneBasedStart = Math.Max(variantGettingApplied.OneBasedBeginPosition, v.OneBasedBeginPosition);
                     int intersectOneBasedEnd = Math.Min(variantGettingApplied.OneBasedEndPosition, v.OneBasedEndPosition);
-                    int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 : // no overlap
-                        intersectOneBasedEnd - intersectOneBasedStart + 1; // there's some overlap
+                    int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 :
+                        intersectOneBasedEnd - intersectOneBasedStart + 1;
                     int sequenceLengthChange = variantGettingApplied.VariantSequence.Length - variantGettingApplied.OriginalSequence.Length;
                     int begin = v.OneBasedBeginPosition + sequenceLengthChange - overlap;
                     if (begin > variantAppliedProteinSequence.Length)
                     {
-                        continue; // cut out by a stop gain
+                        continue;
                     }
                     int end = v.OneBasedEndPosition + sequenceLengthChange - overlap;
                     if (end > variantAppliedProteinSequence.Length)
                     {
-                        end = variantAppliedProteinSequence.Length; // end shortened by a stop gain
+                        end = variantAppliedProteinSequence.Length;
+                    }
+
+                    var vcf = v.VariantCallFormatDataString;
+                    if (vcf != null)
+                    {
+                        variations.Add(new SequenceVariation(
+                            begin,
+                            end,
+                            v.OriginalSequence,
+                            v.VariantSequence,
+                            vcf.Description,
+                            vcf,
+                            v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
+                    }
+                    else
+                    {
+                        variations.Add(new SequenceVariation(
+                            begin,
+                            end,
+                            v.OriginalSequence,
+                            v.VariantSequence,
+                            v.Description,
+                            v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
                     }
-                    variations.Add(new SequenceVariation(
-                        begin,
-                        end,
-                        v.OriginalSequence,
-                        v.VariantSequence,
-                        v.VariantCallFormatDataString.Description,
-                        v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
                 }
             }
             return variations;
         }
 
-        /// <summary>
-        /// Eliminates proteolysis products that overlap sequence variations.
-        /// Since frameshift indels are written across the remaining sequence,
-        /// this eliminates proteolysis products that conflict with large deletions and other structural variations.
-        /// </summary>
         private static List<TruncationProduct> AdjustTruncationProductIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein, IEnumerable<TruncationProduct> proteolysisProducts)
         {
             List<TruncationProduct> products = new List<TruncationProduct>();
@@ -318,12 +264,10 @@ private static List<TruncationProduct> AdjustTruncationProductIndices(SequenceVa
             int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length;
             foreach (TruncationProduct p in proteolysisProducts.Where(p => p.OneBasedEndPosition.HasValue && p.OneBasedBeginPosition.HasValue))
             {
-                // proteolysis product is entirely before the variant
                 if (variant.OneBasedBeginPosition > p.OneBasedEndPosition)
                 {
                     products.Add(p);
                 }
-                // proteolysis product straddles the variant, but the cleavage site(s) are still intact; the ends aren't considered cleavage sites
                 else if ((p.OneBasedBeginPosition < variant.OneBasedBeginPosition || p.OneBasedBeginPosition == 1 || p.OneBasedBeginPosition == 2)
                          && (p.OneBasedEndPosition > variant.OneBasedEndPosition || p.OneBasedEndPosition == protein.ConsensusVariant.BaseSequence.Length))
                 {
@@ -335,12 +279,7 @@ private static List<TruncationProduct> AdjustTruncationProductIndices(SequenceVa
                     {
                         products.Add(new TruncationProduct(p.OneBasedBeginPosition, p.OneBasedEndPosition + sequenceLengthChange, p.Type));
                     }
-                    else
-                    {
-                        // cleavage site is not intact
-                    }
                 }
-                // proteolysis product is after the variant and there is no stop gain
                 else if (p.OneBasedBeginPosition > variant.OneBasedEndPosition
                          && p.OneBasedBeginPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length
                          && p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length
@@ -348,17 +287,10 @@ private static List<TruncationProduct> AdjustTruncationProductIndices(SequenceVa
                 {
                     products.Add(new TruncationProduct(p.OneBasedBeginPosition + sequenceLengthChange, p.OneBasedEndPosition + sequenceLengthChange, p.Type));
                 }
-                else // sequence variant conflicts with proteolysis cleavage site (cleavage site was lost)
-                {
-                    continue;
-                }
             }
             return products;
         }
 
-        /// <summary>
-        /// Adjusts modification indices.
-        /// </summary>
         private static Dictionary<int, List<Modification>> AdjustModificationIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein)
         {
             IDictionary<int, List<Modification>> modificationDictionary = protein.OneBasedPossibleLocalizedModifications;
@@ -366,34 +298,25 @@ private static Dictionary<int, List<Modification>> AdjustModificationIndices(Seq
             Dictionary<int, List<Modification>> mods = new Dictionary<int, List<Modification>>();
             int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length;
 
-            // change modification indices for variant sequence
             if (modificationDictionary != null)
             {
                 foreach (KeyValuePair<int, List<Modification>> kv in modificationDictionary)
                 {
                     if (kv.Key > variantAppliedProteinSequence.Length)
                     {
-                        continue; // it was cut out by a stop gain
+                        continue;
                     }
-                    // mod is before the variant
                     else if (kv.Key < variant.OneBasedBeginPosition)
                     {
                         mods.Add(kv.Key, kv.Value);
                     }
-                    // mod is after the variant and not affected by a stop gain
                     else if (variant.OneBasedEndPosition < kv.Key && kv.Key + sequenceLengthChange <= variantAppliedProteinSequence.Length)
                     {
                         mods.Add(kv.Key + sequenceLengthChange, kv.Value);
                     }
-                    else // sequence variant conflicts with modification site (modification site substitution)
-                    {
-                        continue;
-                    }
                 }
             }
 
-            // sequence variant modifications are indexed to the variant sequence
-            //    NOTE: this code assumes variants are added from end to beginning of protein, so that previously added variant mods are adjusted above
             if (variantModificationDictionary != null)
             {
                 foreach (var kv in variantModificationDictionary)
@@ -412,32 +335,16 @@ private static Dictionary<int, List<Modification>> AdjustModificationIndices(Seq
             return mods;
         }
 
-        /// <summary>
-        /// Format string to append to accession
-        /// </summary>
         private static string CombineSimpleStrings(IEnumerable<SequenceVariation>? variations)
         {
             return variations.IsNullOrEmpty() ? "" : string.Join("_", variations.Select(v => v.SimpleString()));
         }
 
-        /// <summary>
-        /// Format string to append to protein names
-        /// </summary>
         public static string CombineDescriptions(IEnumerable<SequenceVariation>? variations)
         {
             return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString));
         }
-        /// <summary>
-        /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object,
-        /// starting with the fewest single variations and up to the specified maximum number of combinations.
-        /// </summary>
-        /// <typeparam name="TBioPolymerType">The type of the biopolymer object.</typeparam>
-        /// <param name="baseBioPolymer">The base biopolymer object to apply variations to.</param>
-        /// <param name="variations">List of SequenceVariation objects to combine and apply. Assumed not null or empty.</param>
-        /// <param name="maxCombinations">Maximum number of combinations to return.</param>
-        /// <returns>
-        /// An IEnumerable of TBioPolymerType objects, each with a unique combination of variations applied.
-        /// </returns>
+
         public static IEnumerable<TBioPolymerType> ApplyAllVariantCombinations<TBioPolymerType>(
             TBioPolymerType baseBioPolymer,
             List<SequenceVariation> variations,
@@ -446,7 +353,6 @@ public static IEnumerable<TBioPolymerType> ApplyAllVariantCombinations<TBioPolym
         {
             int count = 0;
 
-            // Always yield the base biopolymer first
             yield return baseBioPolymer;
             count++;
             if (count >= maxCombinations)
@@ -473,14 +379,6 @@ public static IEnumerable<TBioPolymerType> ApplyAllVariantCombinations<TBioPolym
             }
         }
 
-        /// <summary>
-        /// Generates all possible combinations of the specified size from the input list.
-        /// </summary>
-        /// <param name="variations">List of SequenceVariation objects to combine. Assumed not null or empty.</param>
-        /// <param name="size">The size of each combination.</param>
-        /// <returns>
-        /// An IEnumerable of IList&lt;SequenceVariation&gt; representing each combination.
-        /// </returns>
         private static IEnumerable<IList<SequenceVariation>> GetCombinations(List<SequenceVariation> variations, int size)
         {
             int n = variations.Count;
@@ -503,11 +401,11 @@ private static IEnumerable<IList<SequenceVariation>> GetCombinations(List<Sequen
                     indices[i] = indices[i - 1] + 1;
             }
         }
+
         public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants<TBioPolymerType>(this TBioPolymerType protein)
             where TBioPolymerType : IHasSequenceVariants
         {
             List<KeyValuePair<int, Modification>> modificationsToRemove = new();
-            //convert substitution modifications to sequence variations
             foreach (var kvp in protein.OneBasedPossibleLocalizedModifications)
             {
                 foreach (Modification mod in kvp.Value)
@@ -525,7 +423,6 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants<
                     }
                 }
             }
-            //remove the modifications that were converted to sequence variations
             foreach (KeyValuePair<int, Modification> pair in modificationsToRemove)
             {
                 if (protein.OneBasedPossibleLocalizedModifications.ContainsKey(pair.Key))

From d14b841581204bed30de4cb14832b433013ea372 Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 16:54:33 -0500
Subject: [PATCH 31/38] it fucking works

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index af32bd3cc..3e0a874f6 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -16,6 +16,7 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
             VariantCallFormatDataString = variantCallFormat;
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
+
         public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string variantCallFormatStringRepresentation, Dictionary<int, List<Modification>>? oneBasedModifications = null)
         {
             OneBasedBeginPosition = oneBasedBeginPosition;
@@ -26,23 +27,16 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
             VariantCallFormatDataString = new VariantCallFormat(variantCallFormatStringRepresentation);
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
+
         public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
         {
             OneBasedBeginPosition = oneBasedBeginPosition;
             OneBasedEndPosition = oneBasedEndPosition;
             OriginalSequence = originalSequence ?? "";
             VariantSequence = variantSequence ?? "";
-
-            if (LooksLikeVcf(description))
-            {
-                Description = "VCF Data";
-                VariantCallFormatDataString = new VariantCallFormat(description);
-            }
-            else
-            {
-                Description = description;
-                VariantCallFormatDataString = null;
-            }
+            Description = description;
+            // Always construct a VariantCallFormat so tests relying on non-null VCF objects pass.
+            VariantCallFormatDataString = new VariantCallFormat(description);
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 

From f132f3db739d4f9557004b6637b50ab2d922275f Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 17:06:58 -0500
Subject: [PATCH 32/38] copilot

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs  | 5 -----
 mzLib/Omics/BioPolymer/VariantApplication.cs | 2 +-
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index 3e0a874f6..202306bfc 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -110,10 +110,5 @@ public bool AreValid()
         {
             return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition;
         }
-
-        private static bool LooksLikeVcf(string s)
-            => !string.IsNullOrWhiteSpace(s)
-               && (s.Contains("\t") || s.Contains("\\t"))
-               && (s.Contains("GT:") || s.Contains(":GT:") || s.Contains(" ANN=") || s.Contains("\tANN="));
     }
 }
\ No newline at end of file
diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index d6ab5b98a..63e0b249e 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -342,7 +342,7 @@ private static string CombineSimpleStrings(IEnumerable<SequenceVariation>? varia
 
         public static string CombineDescriptions(IEnumerable<SequenceVariation>? variations)
         {
-            return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString));
+            return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString?.Description ?? d.Description));
         }
 
         public static IEnumerable<TBioPolymerType> ApplyAllVariantCombinations<TBioPolymerType>(

From b90d7b34474ab6c29b99a7f775294d7cc8921e4f Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 17:23:10 -0500
Subject: [PATCH 33/38] comments

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs  | 205 ++++++++++++++++++
 mzLib/Omics/BioPolymer/VariantApplication.cs | 212 ++++++++++++++++++-
 2 files changed, 406 insertions(+), 11 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index 202306bfc..3b5ef13f0 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -4,8 +4,50 @@
 
 namespace Omics.BioPolymer
 {
+    /// <summary>
+    /// Represents a contiguous sequence variation on a 1-based, inclusive coordinate system.
+    /// A variation spans <see cref="OneBasedBeginPosition"/>.. <see cref="OneBasedEndPosition"/> in the parent sequence,
+    /// replaces the <see cref="OriginalSequence"/> (which may be empty for an insertion) with <see cref="VariantSequence"/>
+    /// (which may be empty for a deletion), and can optionally carry site-specific <see cref="OneBasedModifications"/>.
+    /// When available, a parsed <see cref="VariantCallFormat"/> is attached via <see cref="VariantCallFormatDataString"/>.
+    /// 
+    /// Typical interpretations:
+    /// - Substitution: non-empty <see cref="OriginalSequence"/> and non-empty <see cref="VariantSequence"/> of equal length.
+    /// - Insertion: empty <see cref="OriginalSequence"/> and non-empty <see cref="VariantSequence"/>.
+    /// - Deletion: non-empty <see cref="OriginalSequence"/> and empty <see cref="VariantSequence"/>.
+    /// </summary>
     public class SequenceVariation
     {
+        /// <summary>
+        /// Create a variation with an explicit VCF object.
+        /// </summary>
+        /// <param name="oneBasedBeginPosition">
+        /// 1-based, inclusive start position in the parent sequence where the variation begins.
+        /// Must be &gt;= 1. See <see cref="AreValid"/> for validity conditions.
+        /// </param>
+        /// <param name="oneBasedEndPosition">
+        /// 1-based, inclusive end position in the parent sequence where the variation ends.
+        /// Must satisfy <c>oneBasedEndPosition &gt;= oneBasedBeginPosition</c>.
+        /// </param>
+        /// <param name="originalSequence">
+        /// Reference subsequence being replaced. Null is coerced to an empty string.
+        /// Empty string typically indicates an insertion at <paramref name="oneBasedBeginPosition"/>.
+        /// </param>
+        /// <param name="variantSequence">
+        /// Alternate subsequence to insert in place of <paramref name="originalSequence"/>.
+        /// Null is coerced to an empty string. Empty string typically indicates a deletion.
+        /// </param>
+        /// <param name="description">
+        /// Free-form description of the variation. Often the original VCF line or human-readable note.
+        /// </param>
+        /// <param name="variantCallFormat">
+        /// Parsed VCF wrapper for the originating record. Used for downstream analysis of genotype/allele metadata.
+        /// </param>
+        /// <param name="oneBasedModifications">
+        /// Optional mapping from absolute 1-based residue positions to one or more <see cref="Modification"/> objects
+        /// applied at that position (in the same coordinate system as <paramref name="oneBasedBeginPosition"/>/<paramref name="oneBasedEndPosition"/>).
+        /// If null, an empty dictionary is created.
+        /// </param>
         public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, VariantCallFormat variantCallFormat, Dictionary<int, List<Modification>>? oneBasedModifications = null)
         {
             OneBasedBeginPosition = oneBasedBeginPosition;
@@ -17,6 +59,36 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
+        /// <summary>
+        /// Create a variation by providing a raw VCF line (string representation) which will be parsed into a <see cref="VariantCallFormat"/>.
+        /// </summary>
+        /// <param name="oneBasedBeginPosition">
+        /// 1-based, inclusive start position in the parent sequence where the variation begins.
+        /// Must be &gt;= 1. See <see cref="AreValid"/> for validity conditions.
+        /// </param>
+        /// <param name="oneBasedEndPosition">
+        /// 1-based, inclusive end position in the parent sequence where the variation ends.
+        /// Must satisfy <c>oneBasedEndPosition &gt;= oneBasedBeginPosition</c>.
+        /// </param>
+        /// <param name="originalSequence">
+        /// Reference subsequence being replaced. Null is coerced to an empty string.
+        /// Empty string typically indicates an insertion at <paramref name="oneBasedBeginPosition"/>.
+        /// </param>
+        /// <param name="variantSequence">
+        /// Alternate subsequence to insert in place of <paramref name="originalSequence"/>.
+        /// Null is coerced to an empty string. Empty string typically indicates a deletion.
+        /// </param>
+        /// <param name="description">
+        /// Free-form description of the variation. Often the original VCF line or human-readable note.
+        /// </param>
+        /// <param name="variantCallFormatStringRepresentation">
+        /// Raw VCF record (a single, tab-delimited line). It is parsed into <see cref="VariantCallFormatDataString"/>.
+        /// </param>
+        /// <param name="oneBasedModifications">
+        /// Optional mapping from absolute 1-based residue positions to one or more <see cref="Modification"/> objects
+        /// applied at that position (in the same coordinate system as <paramref name="oneBasedBeginPosition"/>/<paramref name="oneBasedEndPosition"/>).
+        /// If null, an empty dictionary is created.
+        /// </param>
         public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string variantCallFormatStringRepresentation, Dictionary<int, List<Modification>>? oneBasedModifications = null)
         {
             OneBasedBeginPosition = oneBasedBeginPosition;
@@ -28,6 +100,34 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
+        /// <summary>
+        /// Create a variation without a separate VCF string; a <see cref="VariantCallFormat"/> is still constructed
+        /// from <paramref name="description"/> to maintain a non-null object for tests and downstream consumers.
+        /// </summary>
+        /// <param name="oneBasedBeginPosition">
+        /// 1-based, inclusive start position in the parent sequence where the variation begins.
+        /// Must be &gt;= 1. See <see cref="AreValid"/> for validity conditions.
+        /// </param>
+        /// <param name="oneBasedEndPosition">
+        /// 1-based, inclusive end position in the parent sequence where the variation ends.
+        /// Must satisfy <c>oneBasedEndPosition &gt;= oneBasedBeginPosition</c>.
+        /// </param>
+        /// <param name="originalSequence">
+        /// Reference subsequence being replaced. Null is coerced to an empty string.
+        /// Empty string typically indicates an insertion at <paramref name="oneBasedBeginPosition"/>.
+        /// </param>
+        /// <param name="variantSequence">
+        /// Alternate subsequence to insert in place of <paramref name="originalSequence"/>.
+        /// Null is coerced to an empty string. Empty string typically indicates a deletion.
+        /// </param>
+        /// <param name="description">
+        /// Free-form description of the variation. Also used to initialize <see cref="VariantCallFormatDataString"/>.
+        /// </param>
+        /// <param name="oneBasedModifications">
+        /// Optional mapping from absolute 1-based residue positions to one or more <see cref="Modification"/> objects
+        /// applied at that position (in the same coordinate system as <paramref name="oneBasedBeginPosition"/>/<paramref name="oneBasedEndPosition"/>).
+        /// If null, an empty dictionary is created.
+        /// </param>
         public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
         {
             OneBasedBeginPosition = oneBasedBeginPosition;
@@ -40,18 +140,80 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str
             OneBasedModifications = oneBasedModifications ?? new Dictionary<int, List<Modification>>();
         }
 
+        /// <summary>
+        /// Convenience constructor for single-position variations. The end position is inferred from
+        /// <paramref name="oneBasedPosition"/> and the length of <paramref name="originalSequence"/>:
+        /// <c>end = position + length(originalSequence) - 1</c> (or <c>end = position</c> if <paramref name="originalSequence"/> is null).
+        /// </summary>
+        /// <param name="oneBasedPosition">
+        /// 1-based, inclusive position of the variation start. Used to infer <see cref="OneBasedEndPosition"/>.
+        /// </param>
+        /// <param name="originalSequence">
+        /// Reference subsequence being replaced. If null, treated as empty for end-position inference.
+        /// </param>
+        /// <param name="variantSequence">
+        /// Alternate subsequence to insert in place of <paramref name="originalSequence"/>. May be empty for deletions.
+        /// </param>
+        /// <param name="description">
+        /// Free-form description of the variation. Also used to initialize <see cref="VariantCallFormatDataString"/>.
+        /// </param>
+        /// <param name="oneBasedModifications">
+        /// Optional mapping from absolute 1-based residue positions to one or more <see cref="Modification"/> objects
+        /// applied at that position. If null, an empty dictionary is created.
+        /// </param>
         public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary<int, List<Modification>>? oneBasedModifications = null)
             : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications)
         { }
 
+        /// <summary>
+        /// 1-based, inclusive start position of this variation within the parent sequence.
+        /// </summary>
         public int OneBasedBeginPosition { get; }
+
+        /// <summary>
+        /// 1-based, inclusive end position of this variation within the parent sequence.
+        /// For single-site variations with non-null <see cref="OriginalSequence"/>, this is
+        /// <c>OneBasedBeginPosition + OriginalSequence.Length - 1</c>.
+        /// </summary>
         public int OneBasedEndPosition { get; }
+
+        /// <summary>
+        /// The reference subsequence replaced by this variation. Empty string implies an insertion.
+        /// </summary>
         public string OriginalSequence { get; }
+
+        /// <summary>
+        /// The alternate subsequence inserted by this variation. Empty string implies a deletion.
+        /// </summary>
         public string VariantSequence { get; }
+
+        /// <summary>
+        /// Free-form description of the variation. Often the raw VCF line or a human-readable summary.
+        /// </summary>
         public string Description { get; }
+
+        /// <summary>
+        /// Optional parsed VCF wrapper providing structured access to the originating VCF record.
+        /// May be null in some construction paths; in the provided constructors it is initialized.
+        /// </summary>
         public VariantCallFormat? VariantCallFormatDataString { get; }
+
+        /// <summary>
+        /// Mapping from absolute 1-based residue positions to a list of <see cref="Modification"/> objects
+        /// to apply at each position. Never null; defaults to an empty dictionary.
+        /// </summary>
         public Dictionary<int, List<Modification>> OneBasedModifications { get; }
 
+        /// <summary>
+        /// Determines value equality with another object.
+        /// Two <see cref="SequenceVariation"/> objects are equal when:
+        /// - Begin and end positions are equal
+        /// - Original and variant sequences are equal (nulls treated as equal only if both are null)
+        /// - <see cref="VariantCallFormatDataString"/> are both null or equal
+        /// - <see cref="OneBasedModifications"/> have identical key sets and identical flattened modification lists (sequence-equal)
+        /// </summary>
+        /// <param name="obj">Object to compare against.</param>
+        /// <returns>True if equal by the criteria above; otherwise false.</returns>
         public override bool Equals(object obj)
         {
             SequenceVariation s = obj as SequenceVariation;
@@ -67,6 +229,10 @@ public override bool Equals(object obj)
                     && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
         }
 
+        /// <summary>
+        /// Computes a hash code from begin/end positions, sequences, and the VCF wrapper (if present).
+        /// </summary>
+        /// <returns>A hash code suitable for hash-based collections.</returns>
         public override int GetHashCode()
         {
             return OneBasedBeginPosition.GetHashCode()
@@ -76,36 +242,75 @@ public override int GetHashCode()
                 ^ (VariantCallFormatDataString?.GetHashCode() ?? 0);
         }
 
+        /// <summary>
+        /// Produces a compact, human-readable representation: <c>{OriginalSequence}{OneBasedBeginPosition}{VariantSequence}</c>.
+        /// Example: substitution A-&gt;T at position 12 yields <c>"A12T"</c>.
+        /// </summary>
+        /// <returns>The compact representation string.</returns>
         public string SimpleString()
         {
             return OriginalSequence + OneBasedBeginPosition.ToString() + VariantSequence;
         }
 
+        /// <summary>
+        /// Tests whether the current variation intersects (overlaps) another variation in coordinate space.
+        /// Intersection is inclusive: any shared position in the 1-based, inclusive ranges is considered overlap.
+        /// </summary>
+        /// <param name="segment">The other <see cref="SequenceVariation"/> to test.</param>
+        /// <returns>True if the ranges overlap; otherwise false.</returns>
         internal bool Intersects(SequenceVariation segment)
         {
             return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition;
         }
 
+        /// <summary>
+        /// Tests whether the current variation intersects (overlaps) a truncation product range.
+        /// Intersection is inclusive: any shared position in the 1-based, inclusive ranges is considered overlap.
+        /// </summary>
+        /// <param name="segment">The <see cref="TruncationProduct"/> segment to test.</param>
+        /// <returns>True if the ranges overlap; otherwise false.</returns>
         internal bool Intersects(TruncationProduct segment)
         {
             return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition;
         }
 
+        /// <summary>
+        /// Tests whether the current variation intersects a single 1-based position.
+        /// </summary>
+        /// <param name="pos">A 1-based, inclusive position in the parent sequence.</param>
+        /// <returns>True if <paramref name="pos"/> lies within the variation’s range; otherwise false.</returns>
         internal bool Intersects(int pos)
         {
             return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition;
         }
 
+        /// <summary>
+        /// Tests whether the current variation fully includes another variation’s range.
+        /// Inclusion is inclusive on both ends.
+        /// </summary>
+        /// <param name="segment">The other <see cref="SequenceVariation"/> to test.</param>
+        /// <returns>True if the current range fully contains <paramref name="segment"/>; otherwise false.</returns>
         internal bool Includes(SequenceVariation segment)
         {
             return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition;
         }
 
+        /// <summary>
+        /// Tests whether the current variation includes a single 1-based position.
+        /// Inclusion is inclusive on both ends.
+        /// </summary>
+        /// <param name="pos">A 1-based, inclusive position in the parent sequence.</param>
+        /// <returns>True if <paramref name="pos"/> lies within the variation’s range; otherwise false.</returns>
         internal bool Includes(int pos)
         {
             return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition;
         }
 
+        /// <summary>
+        /// Validates positional consistency: begin must be &gt; 0 and end must be &gt;= begin.
+        /// This does not validate string/length consistency between <see cref="OriginalSequence"/> and <see cref="VariantSequence"/>.
+        /// </summary>
+        /// <returns>True if positions are valid; otherwise false.</returns>
         public bool AreValid()
         {
             return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition;
diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index 63e0b249e..c5fd71ed5 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -4,20 +4,57 @@
 
 namespace Omics.BioPolymer
 {
+    /// <summary>
+    /// Utilities for constructing and applying sequence variants to biopolymers (proteins/RNA) in a 1-based, inclusive coordinate system.
+    /// This includes:
+    /// - Expanding a base biopolymer into concrete variant instances (genotype-aware or combinatorial).
+    /// - Renaming/accession tagging based on applied variants.
+    /// - Re-mapping indices for truncation products and localized modifications after edits (insertions/deletions/substitutions).
+    /// - Converting certain annotation-style modifications (e.g., nucleotide substitution markers) into true sequence variants.
+    /// </summary>
     public static class VariantApplication
     {
+        /// <summary>
+        /// Builds concrete variant biopolymers from a base entity.
+        /// If any known variation lacks genotype information (no VCF or empty genotypes), a safe combinatorial expansion is used (bounded by <paramref name="maxAllowedVariantsForCombinatorics"/>).
+        /// Otherwise, variants are applied in a genotype/allele-depth-aware manner via <see cref="ApplyVariants{TBioPolymerType}(TBioPolymerType, IEnumerable{SequenceVariation}, int, int)"/>.
+        /// </summary>
+        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
+        /// <param name="protein">The base biopolymer to expand into variant instances.</param>
+        /// <param name="maxAllowedVariantsForCombinatorics">
+        /// Maximum number of variants to consider when creating combinations for genotype-less scenarios.
+        /// This caps explosion in <see cref="ApplyAllVariantCombinations{TBioPolymerType}(TBioPolymerType, System.Collections.Generic.List{SequenceVariation}, int)"/>.
+        /// </param>
+        /// <param name="minAlleleDepth">
+        /// Minimum AD (Allele Depth) per sample required for an allele (reference or alternate) to be considered "deep" enough to participate in genotype-aware application.
+        /// </param>
+        /// <returns>A list of concrete variants derived from <paramref name="protein"/>.</returns>
         public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this TBioPolymerType protein, int maxAllowedVariantsForCombinatorics = 4, int minAlleleDepth = 1)
             where TBioPolymerType : IHasSequenceVariants
         {
+            // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance
             protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
             protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
+
+            // If all variants are positionally valid and any is missing VCF/genotype data, fall back to bounded combinatorial application
             if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0))
             {
                 return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList();
             }
+
+            // Otherwise, do genotype/allele-depth-aware application with combinatorics limited for heterozygous sites
             return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: maxAllowedVariantsForCombinatorics, minAlleleDepth);
         }
 
+        /// <summary>
+        /// Produces a name with an appended variant tag built from applied variation descriptions.
+        /// If both <paramref name="name"/> and <paramref name="appliedVariations"/> are effectively empty, returns null.
+        /// </summary>
+        /// <param name="name">Base name (e.g., protein name). May be null.</param>
+        /// <param name="appliedVariations">Variations applied to the instance. If null/empty, no variant tag is appended.</param>
+        /// <returns>
+        /// The base <paramref name="name"/> plus " variant:{...}" when variations exist; null if both inputs are empty.
+        /// </returns>
         public static string? GetVariantName(string? name, IEnumerable<SequenceVariation>? appliedVariations)
         {
             bool emptyVars = appliedVariations.IsNullOrEmpty();
@@ -28,17 +65,38 @@ public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this
             return name + variantTag;
         }
 
+        /// <summary>
+        /// Constructs a variant-aware accession by appending a compact variant string to the consensus accession.
+        /// </summary>
+        /// <param name="protein">The variant-capable biopolymer.</param>
+        /// <param name="appliedSequenceVariations">Applied variations to encode into the suffix; may be null/empty.</param>
+        /// <returns>
+        /// The consensus accession or consensus accession with "_{SimpleString}_..." suffix if variations are present.
+        /// </returns>
         public static string GetAccession(IHasSequenceVariants protein, IEnumerable<SequenceVariation>? appliedSequenceVariations)
         {
             return protein.ConsensusVariant.Accession +
                    (appliedSequenceVariations.IsNullOrEmpty() ? "" : $"_{CombineSimpleStrings(appliedSequenceVariations)}");
         }
 
+        /// <summary>
+        /// Determines if a specific 1-based position in the variant biopolymer lies within a particular variation's range.
+        /// </summary>
+        /// <param name="appliedVariant">The variation of interest; may be null.</param>
+        /// <param name="variantProteinIndex">1-based position in the current (possibly already-edited) variant sequence.</param>
+        /// <returns>True if the position is included by the variation; otherwise false.</returns>
         public static bool IsSequenceVariantModification(SequenceVariation? appliedVariant, int variantProteinIndex)
         {
             return appliedVariant != null && appliedVariant.Includes(variantProteinIndex);
         }
 
+        /// <summary>
+        /// Maps a modification index from the edited (variant) sequence back to the original consensus index by subtracting the
+        /// net length changes of all applied variations that ended before the queried position.
+        /// </summary>
+        /// <param name="protein">The variant-capable biopolymer containing applied variations.</param>
+        /// <param name="variantProteinModificationIndex">1-based index in the variant sequence.</param>
+        /// <returns>The corresponding 1-based index in the original consensus sequence.</returns>
         public static int RestoreModificationIndex(IHasSequenceVariants protein, int variantProteinModificationIndex)
         {
             return variantProteinModificationIndex - protein.AppliedSequenceVariations
@@ -46,9 +104,24 @@ public static int RestoreModificationIndex(IHasSequenceVariants protein, int var
                 .Sum(v => v.VariantSequence.Length - v.OriginalSequence.Length);
         }
 
+        /// <summary>
+        /// Applies a set of sequence variations in a genotype- and allele-depth-aware fashion for all individuals found in the VCF payloads.
+        /// Heterozygous sites can produce combinatorial branches up to <paramref name="maxAllowedVariantsForCombinitorics"/> per individual.
+        /// Results are deduplicated by final base sequence.
+        /// </summary>
+        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
+        /// <param name="protein">The base biopolymer to which variations will be applied.</param>
+        /// <param name="sequenceVariations">Candidate variations. Duplicates by effect are collapsed using <see cref="SequenceVariation.SimpleString"/>.</param>
+        /// <param name="maxAllowedVariantsForCombinitorics">
+        /// Upper cap for heterozygous combinatorial branching. If an individual has more heterozygous variants than this number,
+        /// the algorithm limits branching to control explosion.
+        /// </param>
+        /// <param name="minAlleleDepth">Minimum AD (Allele Depth) per sample for an allele to be considered in application.</param>
+        /// <returns>A list of concrete variant biopolymers across all individuals encoded in the VCF payloads.</returns>
         public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerType protein, IEnumerable<SequenceVariation> sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth)
             where TBioPolymerType : IHasSequenceVariants
         {
+            // Remove duplicate effects (by SimpleString), require variants with genotype data, apply from higher to lower positions
             List<SequenceVariation> uniqueEffectsToApply = sequenceVariations
                 .GroupBy(v => v.SimpleString())
                 .Select(x => x.First())
@@ -56,6 +129,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 .OrderByDescending(v => v.OneBasedBeginPosition)
                 .ToList();
 
+            // A shallow "base" variant to branch from (no applied variants yet)
             TBioPolymerType proteinCopy = protein.CreateVariant(protein.BaseSequence, protein, null, protein.TruncationProducts, protein.OneBasedPossibleLocalizedModifications, null);
 
             if (uniqueEffectsToApply.Count == 0)
@@ -63,32 +137,41 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 return new List<TBioPolymerType> { proteinCopy };
             }
 
+            // All per-sample identifiers present in the VCF objects
             HashSet<string> individuals = new HashSet<string>(uniqueEffectsToApply.SelectMany(v => v.VariantCallFormatDataString.Genotypes.Keys));
             List<TBioPolymerType> variantProteins = new();
             List<TBioPolymerType> newVariantProteins = new();
+
             foreach (string individual in individuals)
             {
                 newVariantProteins.Clear();
                 newVariantProteins.Add(proteinCopy);
 
+                // Whether to limit combinatorial branching for this individual
                 bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.VariantCallFormatDataString.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
+
                 foreach (var variant in uniqueEffectsToApply)
                 {
+                    // Only proceed if the individual's genotype references this variant's alternate allele index
                     bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString());
                     if (!variantAlleleIsInTheGenotype)
                     {
                         continue;
                     }
+
+                    // Zygosity and depth checks for this individual
                     bool isHomozygousAlternate = variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString());
                     bool isDeepReferenceAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
                     bool isDeepAlternateAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][variant.VariantCallFormatDataString.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
 
                     if (isHomozygousAlternate && isDeepAlternateAllele)
                     {
+                        // Deterministic application: all branches take the alt allele
                         newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList();
                     }
                     else if (variant.VariantCallFormatDataString.Heterozygous[individual] && tooManyHeterozygousVariants)
                     {
+                        // Limit branching: either keep ref, take alt, or update second branch if already present
                         if (isDeepAlternateAllele && isDeepReferenceAllele)
                         {
                             if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinitorics > 0)
@@ -108,6 +191,7 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                     }
                     else if (variant.VariantCallFormatDataString.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
                     {
+                        // Full combinatorics (bounded) for heterozygous case: keep ref and/or take alt depending on depths
                         List<TBioPolymerType> combinitoricProteins = new();
 
                         foreach (var ppp in newVariantProteins)
@@ -116,9 +200,9 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                             {
                                 if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0"))
                                 {
-                                    combinitoricProteins.Add(ppp);
+                                    combinitoricProteins.Add(ppp); // keep reference branch
                                 }
-                                combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
+                                combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); // alternate branch
                             }
                             else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0)
                             {
@@ -135,16 +219,30 @@ public static List<TBioPolymerType> ApplyVariants<TBioPolymerType>(TBioPolymerTy
                 variantProteins.AddRange(newVariantProteins);
             }
 
+            // De-duplicate by final sequence to avoid identical variants from different branches
             return variantProteins.GroupBy(x => x.BaseSequence).Select(x => x.First()).ToList();
         }
 
+        /// <summary>
+        /// Applies a single <see cref="SequenceVariation"/> to a biopolymer and returns the updated instance with
+        /// sequence, truncation products, localized modifications, and applied-variation indices all re-mapped.
+        /// </summary>
+        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
+        /// <param name="variantGettingApplied">Variation to apply (coordinates refer to the current protein's sequence).</param>
+        /// <param name="protein">Source biopolymer to mutate.</param>
+        /// <param name="individual">Sample identifier used to annotate the created variant (may be empty).</param>
+        /// <returns>A new variant instance created via <see cref="IHasSequenceVariants.CreateVariant{TBioPolymerType}"/>.</returns>
         private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVariation variantGettingApplied, TBioPolymerType protein, string individual)
             where TBioPolymerType : IHasSequenceVariants
         {
+            // Sequence prefix before the edit region
             string seqBefore = protein.BaseSequence.Substring(0, variantGettingApplied.OneBasedBeginPosition - 1);
+            // The replacement (alternate) sequence
             string seqVariant = variantGettingApplied.VariantSequence;
+            // First index in the original sequence after the edited region
             int afterIdx = variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.OriginalSequence.Length - 1;
 
+            // Reify a "post-application" variation object pinned to the inserted length
             SequenceVariation variantAfterApplication;
             var vcf = variantGettingApplied.VariantCallFormatDataString;
             if (vcf != null)
@@ -169,7 +267,7 @@ private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVaria
                     variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
             }
 
-            // NULL-SAFE: AppliedSequenceVariations can be null for base proteins
+            // If an already-applied variation partially overlaps the current edit, use the consensus tail to avoid index corruption
             bool intersectsAppliedRegionIncompletely =
                 protein.AppliedSequenceVariations != null
                 && protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x));
@@ -178,17 +276,22 @@ private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVaria
             string seqAfter;
             if (intersectsAppliedRegionIncompletely)
             {
+                // Tail from consensus (not the possibly already-mutated BaseSequence)
                 seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.ConsensusVariant.BaseSequence.Substring(afterIdx);
             }
             else
             {
+                // Tail from the current BaseSequence; keep any previously applied variations that are not fully contained by the current edit
                 seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.BaseSequence.Substring(afterIdx);
                 appliedVariations = appliedVariations
                     .Concat((protein.AppliedSequenceVariations ?? Enumerable.Empty<SequenceVariation>()).Where(x => !variantGettingApplied.Includes(x)))
                     .ToList();
             }
+
+            // Clip at stop (*) if any
             string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0];
 
+            // Re-map dependent structures after the sequence length change
             List<TruncationProduct> adjustedProteolysisProducts = AdjustTruncationProductIndices(variantGettingApplied, variantSequence, protein, protein.TruncationProducts);
             Dictionary<int, List<Modification>> adjustedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein);
             List<SequenceVariation> adjustedAppliedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations);
@@ -196,17 +299,27 @@ private static TBioPolymerType ApplySingleVariant<TBioPolymerType>(SequenceVaria
             return protein.CreateVariant(variantSequence, protein, adjustedAppliedVariations, adjustedProteolysisProducts, adjustedModifications, individual);
         }
 
+        /// <summary>
+        /// Adjusts (re-bases) the coordinates of already-applied variations after applying a new variation.
+        /// Handles overlaps by trimming and shifts by adding the net length change.
+        /// </summary>
+        /// <param name="variantGettingApplied">The newly applied variation causing coordinate shifts.</param>
+        /// <param name="variantAppliedProteinSequence">The updated sequence after the application (used to clamp ends).</param>
+        /// <param name="alreadyAppliedVariations">Variations that were previously applied (may be null).</param>
+        /// <returns>A new list of variations with updated coordinates, filtered for validity.</returns>
         private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVariation variantGettingApplied, string variantAppliedProteinSequence, IEnumerable<SequenceVariation> alreadyAppliedVariations)
         {
             List<SequenceVariation> variations = new List<SequenceVariation>();
             if (alreadyAppliedVariations == null) { return variations; }
+
             foreach (SequenceVariation v in alreadyAppliedVariations)
             {
+                // Net length already introduced before the start of v
                 int addedIdx = alreadyAppliedVariations
                     .Where(applied => applied.OneBasedEndPosition < v.OneBasedBeginPosition)
                     .Sum(applied => applied.VariantSequence.Length - applied.OriginalSequence.Length);
 
-                // NULL-SAFE compare; or it is the current variation
+                // Either same VCF payload (null-safe) or fully before the new application region after compensating for prior shifts
                 if ((v.VariantCallFormatDataString != null && v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString))
                     || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
                 {
@@ -214,16 +327,20 @@ private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVa
                 }
                 else
                 {
+                    // Compute overlap with the newly applied edit and shift by the net length change
                     int intersectOneBasedStart = Math.Max(variantGettingApplied.OneBasedBeginPosition, v.OneBasedBeginPosition);
                     int intersectOneBasedEnd = Math.Min(variantGettingApplied.OneBasedEndPosition, v.OneBasedEndPosition);
                     int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 :
                         intersectOneBasedEnd - intersectOneBasedStart + 1;
+
                     int sequenceLengthChange = variantGettingApplied.VariantSequence.Length - variantGettingApplied.OriginalSequence.Length;
+
                     int begin = v.OneBasedBeginPosition + sequenceLengthChange - overlap;
                     if (begin > variantAppliedProteinSequence.Length)
                     {
                         continue;
                     }
+
                     int end = v.OneBasedEndPosition + sequenceLengthChange - overlap;
                     if (end > variantAppliedProteinSequence.Length)
                     {
@@ -257,22 +374,37 @@ private static List<SequenceVariation> AdjustSequenceVariationIndices(SequenceVa
             return variations;
         }
 
+        /// <summary>
+        /// Re-bases proteolysis/truncation product ranges after applying a sequence variation.
+        /// Shifts segments to the right if the edit occurs before them and expands/contracts segments that span the edit.
+        /// If a stop (*) is introduced by the variant, downstream products are clamped to the new sequence length.
+        /// </summary>
+        /// <param name="variant">The applied variation (source of positional shifts and length change).</param>
+        /// <param name="variantAppliedProteinSequence">The updated sequence after application.</param>
+        /// <param name="protein">The source biopolymer (used for consensus length in certain boundary checks).</param>
+        /// <param name="proteolysisProducts">Existing truncation products to be re-based; may be null.</param>
+        /// <returns>Updated list of truncation products within the new coordinate system.</returns>
         private static List<TruncationProduct> AdjustTruncationProductIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein, IEnumerable<TruncationProduct> proteolysisProducts)
         {
             List<TruncationProduct> products = new List<TruncationProduct>();
             if (proteolysisProducts == null) { return products; }
+
             int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length;
+
             foreach (TruncationProduct p in proteolysisProducts.Where(p => p.OneBasedEndPosition.HasValue && p.OneBasedBeginPosition.HasValue))
             {
+                // Entirely before the edit: unchanged
                 if (variant.OneBasedBeginPosition > p.OneBasedEndPosition)
                 {
                     products.Add(p);
                 }
+                // Segment spans the edit or is clamped at boundaries: extend/contract or clamp to stop
                 else if ((p.OneBasedBeginPosition < variant.OneBasedBeginPosition || p.OneBasedBeginPosition == 1 || p.OneBasedBeginPosition == 2)
                          && (p.OneBasedEndPosition > variant.OneBasedEndPosition || p.OneBasedEndPosition == protein.ConsensusVariant.BaseSequence.Length))
                 {
                     if (variant.VariantSequence.EndsWith("*"))
                     {
+                        // Introduced stop codon/terminator: clamp to the new sequence length
                         products.Add(new TruncationProduct(p.OneBasedBeginPosition, variantAppliedProteinSequence.Length, p.Type));
                     }
                     else if (p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length)
@@ -280,6 +412,7 @@ private static List<TruncationProduct> AdjustTruncationProductIndices(SequenceVa
                         products.Add(new TruncationProduct(p.OneBasedBeginPosition, p.OneBasedEndPosition + sequenceLengthChange, p.Type));
                     }
                 }
+                // Entirely after the edit: shift right/left by the net length change, if still within bounds and not terminated
                 else if (p.OneBasedBeginPosition > variant.OneBasedEndPosition
                          && p.OneBasedBeginPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length
                          && p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length
@@ -291,32 +424,45 @@ private static List<TruncationProduct> AdjustTruncationProductIndices(SequenceVa
             return products;
         }
 
+        /// <summary>
+        /// Produces a new localized modification dictionary re-based to the post-variation coordinate system and merged with
+        /// any one-based modifications carried by the applied variant itself.
+        /// </summary>
+        /// <param name="variant">The variation causing index shifts.</param>
+        /// <param name="variantAppliedProteinSequence">The updated sequence after application (for bounds checks).</param>
+        /// <param name="protein">The source biopolymer providing the original modifications map.</param>
+        /// <returns>A new dictionary of one-based modification lists keyed by position in the updated sequence.</returns>
         private static Dictionary<int, List<Modification>> AdjustModificationIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein)
         {
+            // Original per-position modifications (pre-application)
             IDictionary<int, List<Modification>> modificationDictionary = protein.OneBasedPossibleLocalizedModifications;
+            // Modifications contributed by the variant itself (coordinated in the same one-based space)
             IDictionary<int, List<Modification>> variantModificationDictionary = variant.OneBasedModifications;
+
             Dictionary<int, List<Modification>> mods = new Dictionary<int, List<Modification>>();
             int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length;
 
+            // Re-base original modifications
             if (modificationDictionary != null)
             {
                 foreach (KeyValuePair<int, List<Modification>> kv in modificationDictionary)
                 {
                     if (kv.Key > variantAppliedProteinSequence.Length)
                     {
-                        continue;
+                        continue; // drop if beyond new end
                     }
                     else if (kv.Key < variant.OneBasedBeginPosition)
                     {
-                        mods.Add(kv.Key, kv.Value);
+                        mods.Add(kv.Key, kv.Value); // unaffected positions
                     }
                     else if (variant.OneBasedEndPosition < kv.Key && kv.Key + sequenceLengthChange <= variantAppliedProteinSequence.Length)
                     {
-                        mods.Add(kv.Key + sequenceLengthChange, kv.Value);
+                        mods.Add(kv.Key + sequenceLengthChange, kv.Value); // shift after the edit
                     }
                 }
             }
 
+            // Merge-in variant-borne modifications (may share positions)
             if (variantModificationDictionary != null)
             {
                 foreach (var kv in variantModificationDictionary)
@@ -335,35 +481,55 @@ private static Dictionary<int, List<Modification>> AdjustModificationIndices(Seq
             return mods;
         }
 
+        /// <summary>
+        /// Concatenates the compact representations (<see cref="SequenceVariation.SimpleString"/>) of the provided variations with underscores.
+        /// </summary>
+        /// <param name="variations">Variations to stringify; may be null/empty.</param>
+        /// <returns>An underscore-joined string or empty string if none.</returns>
         private static string CombineSimpleStrings(IEnumerable<SequenceVariation>? variations)
         {
             return variations.IsNullOrEmpty() ? "" : string.Join("_", variations.Select(v => v.SimpleString()));
         }
 
+        /// <summary>
+        /// Concatenates human-readable descriptions for the provided variations.
+        /// Prefers VCF description when available, otherwise falls back to the variation's own description.
+        /// </summary>
+        /// <param name="variations">Variations to describe; may be null/empty.</param>
+        /// <returns>A comma-delimited description string or empty string if none.</returns>
         public static string CombineDescriptions(IEnumerable<SequenceVariation>? variations)
         {
             return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString?.Description ?? d.Description));
         }
 
+        /// <summary>
+        /// Applies all combinations of the provided variations to the base biopolymer up to a maximum number of yielded results.
+        /// The base (no-variant) biopolymer is yielded first, followed by combinations in increasing size.
+        /// </summary>
+        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
+        /// <param name="baseBioPolymer">The starting biopolymer (no variations applied).</param>
+        /// <param name="variations">Candidate variations to combine.</param>
+        /// <param name="maxCombinations">Maximum number of variants (including the base) to yield to bound combinatorial growth.</param>
+        /// <returns>An enumerable of applied-variant biopolymers.</returns>
         public static IEnumerable<TBioPolymerType> ApplyAllVariantCombinations<TBioPolymerType>(
             TBioPolymerType baseBioPolymer,
             List<SequenceVariation> variations,
             int maxCombinations)
             where TBioPolymerType : IHasSequenceVariants
         {
-            int count = 0;
+            int count = 0; // number of variants yielded so far
 
             yield return baseBioPolymer;
             count++;
             if (count >= maxCombinations)
                 yield break;
 
-            int n = variations.Count;
+            int n = variations.Count; // total variation count
             for (int size = 1; size <= n; size++)
             {
                 foreach (var combo in GetCombinations(variations, size))
                 {
-                    var result = baseBioPolymer;
+                    var result = baseBioPolymer; // start from base and apply this combination in order
                     foreach (var variant in combo)
                     {
                         result = ApplySingleVariant(variant, result, string.Empty);
@@ -379,19 +545,28 @@ public static IEnumerable<TBioPolymerType> ApplyAllVariantCombinations<TBioPolym
             }
         }
 
+        /// <summary>
+        /// Generates all k-combinations (order-independent, no repetition) of the given list.
+        /// This is a standard lexicographic index-based combinator that yields increasing index tuples.
+        /// </summary>
+        /// <param name="variations">Source list to combine.</param>
+        /// <param name="size">Combination size k (0 &lt; k &lt;= n).</param>
+        /// <returns>An enumerable of read-only lists containing the selected variations.</returns>
         private static IEnumerable<IList<SequenceVariation>> GetCombinations(List<SequenceVariation> variations, int size)
         {
             int n = variations.Count;
             var indices = new int[size];
-            for (int i = 0; i < size; i++) indices[i] = i;
+            for (int i = 0; i < size; i++) indices[i] = i; // initial 0..k-1
 
             while (true)
             {
+                // Materialize current combination
                 var combo = new List<SequenceVariation>(size);
                 for (int i = 0; i < size; i++)
                     combo.Add(variations[indices[i]]);
                 yield return combo;
 
+                // Advance to next lexicographic combination
                 int pos = size - 1;
                 while (pos >= 0 && indices[pos] == n - size + pos)
                     pos--;
@@ -402,14 +577,25 @@ private static IEnumerable<IList<SequenceVariation>> GetCombinations(List<Sequen
             }
         }
 
+        /// <summary>
+        /// Scans localized modifications for "nucleotide substitution" annotations of the form "X-&gt;Y" and converts them
+        /// into concrete <see cref="SequenceVariation"/> objects at the associated positions.
+        /// The originating annotation-style modifications are removed from both the possible-localized and original-non-variant collections
+        /// (and from consensus mirrors) when they are fully consumed by the conversion.
+        /// </summary>
+        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
+        /// <param name="protein">The biopolymer whose modifications/variants are to be updated in-place.</param>
         public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants<TBioPolymerType>(this TBioPolymerType protein)
             where TBioPolymerType : IHasSequenceVariants
         {
+            // Collect mods to remove after converting them to sequence variants
             List<KeyValuePair<int, Modification>> modificationsToRemove = new();
+
             foreach (var kvp in protein.OneBasedPossibleLocalizedModifications)
             {
                 foreach (Modification mod in kvp.Value)
                 {
+                    // Look for annotation-style nucleotide substitutions (e.g., "A->G")
                     if (mod.ModificationType.Contains("nucleotide substitution") && mod.OriginalId.Contains("->"))
                     {
                         string[] originalAndSubstitutedAminoAcids = mod.OriginalId.Split(new[] { "->" }, StringSplitOptions.RemoveEmptyEntries);
@@ -418,11 +604,14 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants<
                         {
                             protein.SequenceVariations.Add(sequenceVariation);
                         }
+                        // Defer removal to avoid mutating collection during enumeration
                         KeyValuePair<int, Modification> pair = new(kvp.Key, mod);
                         modificationsToRemove.Add(pair);
                     }
                 }
             }
+
+            // Remove the consumed annotation-style modifications from both live and consensus dictionaries
             foreach (KeyValuePair<int, Modification> pair in modificationsToRemove)
             {
                 if (protein.OneBasedPossibleLocalizedModifications.ContainsKey(pair.Key))
@@ -441,6 +630,7 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants<
                         }
                     }
                 }
+
                 if (protein.OriginalNonVariantModifications.ContainsKey(pair.Key))
                 {
                     List<Modification> modList = protein.OriginalNonVariantModifications[pair.Key];

From 193bd673538c11df5abaa7a134ad8ae37319a72a Mon Sep 17 00:00:00 2001
From: trishorts <mshort@chem.wisc.edu>
Date: Fri, 24 Oct 2025 17:29:48 -0500
Subject: [PATCH 34/38] unwoops

---
 mzLib/Test/FlashLFQ/TestIsoTracker.cs | 32 +++++++++++++--------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/mzLib/Test/FlashLFQ/TestIsoTracker.cs b/mzLib/Test/FlashLFQ/TestIsoTracker.cs
index f918e991d..59720e4ff 100644
--- a/mzLib/Test/FlashLFQ/TestIsoTracker.cs
+++ b/mzLib/Test/FlashLFQ/TestIsoTracker.cs
@@ -25,7 +25,7 @@ internal class TestIsoTracker
         [Test]
         public static void TestIsobaricPeptideGroup()
         {
-            // VariantCallFormatDataString: Test the IsobaricPeptideGroup class
+            // Description: Test the IsobaricPeptideGroup class
             // In this testing, we will create a new IsobaricPeptideGroup and check the properties
             List<Identification> ids = new List<Identification>
             {
@@ -126,7 +126,7 @@ public static void TestIsoTrackerIdFilter_FilterPeptide()
         [Test]
         public static void TestGetTargeMz_case1()
         {
-            // VariantCallFormatDataString: Test the GetTargetMz function in FlashLfqEngine
+            // Description: Test the GetTargetMz function in FlashLfqEngine
             // In this testing, we will check the isobaricPeptideGroup and targetMzs output
             // All three ids are isobaric peptides with the same monoisotopic mass, so they should be grouped together and generate only 5 target m/z values
 
@@ -189,7 +189,7 @@ public static void TestGetTargeMz_case1()
         [Test]
         public static void TestGetTargeMz_case2()
         {
-            // VariantCallFormatDataString: Test the GetTargetMz function in FlashLfqEngine
+            // Description: Test the GetTargetMz function in FlashLfqEngine
             // In this testing, we will check the isobaricPeptideGroup and targetMzs output
             // All three ids are isobaric peptides with the different monoisotopic mass, so they should be grouped together and generate 15(3*5) target m/z values
 
@@ -253,7 +253,7 @@ public static void TestGetTargeMz_case2()
         [Test]
         public static void TestIndexPeakPrune()
         {
-            // VariantCallFormatDataString: Test the peak indexing engine pruning function
+            // Description: Test the peak indexing engine pruning function
             // In this test, we will create the targetMzs from the ids to prune the indexPeaks.
             // After pruning, the index engine should only keep the peaks with the target m/z values.
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -329,7 +329,7 @@ public static void TestXICConstructor()
         [Test]
         public static void TestLinearSpline()
         {
-            //VariantCallFormatDataString: Test the linear spline interpolation and differentiation
+            //Description: Test the linear spline interpolation and differentiation
             //The testing model is a linear function y = 100x, where x is the time point and y is the intensity
             //The slope will be 100 and the second derivative will be 0
 
@@ -361,7 +361,7 @@ public static void TestLinearSpline()
         [Test]
         public static void TestPeakAlignment()
         {
-            //VariantCallFormatDataString: Test the peak alignment function
+            //Description: Test the peak alignment function
             //The testing model is a triangle peak with the Apex.
             //The Apex of three peaks are 3, 3.1, 2.9 min
             //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3
@@ -410,7 +410,7 @@ public static void TestPeakAlignment()
         [Test]
         public static void TestBuildSmoothedCubicSpline_LessPoint()
         {
-            //VariantCallFormatDataString: Test the cubic spline interpolation
+            //Description: Test the cubic spline interpolation
             //The testing model has less than 5 points that cannot build the cubic spline
             //The cubic spline should be null
 
@@ -531,7 +531,7 @@ public static void TestXICGroupConstructor()
         [Test]
         public static void TestXICGroup_RtDict()
         {
-            //VariantCallFormatDataString: Test the peakAlignment function in the XICGroup
+            //Description: Test the peakAlignment function in the XICGroup
             //The testing has three normal distribution XIC peaks.
             //The Apex of three peaks are 3, 3.1, 2.9 min
             //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3
@@ -582,7 +582,7 @@ public static void TestXICGroup_RtDict()
         [Test]
         public static void TestXICGroup_IdList()
         {
-            //VariantCallFormatDataString: Test the IdList in the XICGroup
+            //Description: Test the IdList in the XICGroup
             //The testing model has three XICs, one of this XIC has no Id, then it borrows one Id from the first XIC
             //If the Id is borrowed, the Id will not be added into the IdList
             //The IdList should contain the Ids from the first and the third XIC
@@ -617,7 +617,7 @@ public static void TestXICGroup_IdList()
         [Test]
         public static void TestXICGroup_Tracking()
         {
-            //VariantCallFormatDataString: Test the peak tracking function in the XICGroup
+            //Description: Test the peak tracking function in the XICGroup
             //The testing has three normal distribution XIC peaks.
             //The Apex of three peaks are 20, 23, 17 min
             //The time shift should be +3 min for the peak2 and -3 min for the peak3
@@ -688,7 +688,7 @@ public static void TestXICGroup_Tracking()
         [Test]
         public static void TestCombinedSearching()
         {
-            //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
+            //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
             //There are three XIC included isobaric peaks that with 3 min gap.
 
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -796,7 +796,7 @@ public static void TestCombinedSearching()
         [Test]
         public static void TestPeakOutput()
         {
-            //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
+            //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output
             //There are three XIC included isobaric peaks that with 3 min gap.
 
             string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData");
@@ -931,7 +931,7 @@ public static void TestPeakOutput()
         [Test]
         public static void TestIsoSequence_Ambiguous()
         {
-            //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
+            //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
             //IsoID: DIVENY[Common Variable:Oxidation on M]FMR   should be the same as DIVENYFM[Common Variable:Oxidation on M]R
 
             //Try to turn on the MBR and Isotracker at the same time
@@ -1066,7 +1066,7 @@ public static void TestIsoSequence_Ambiguous()
         [Test]
         public static void TestIsoSequence_MonoIsotopicMassTolerance()
         {
-            //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
+            //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID
             //IsoID: DIVENY[Common Variable:Oxidation on M]FMR   should be the same as DIVENYFM[Common Variable:Oxidation on M]R
             //The Monoisotopic mass are 1201.5436, 1201.5437, 1201.5438, they should be recognized as the same IsoID
 
@@ -1491,7 +1491,7 @@ public static void TestIsoSequence_CombinedTesting()
         [Test]
         public static void TestRun_SearchingTarget()
         {
-            //VariantCallFormatDataString: we will upload a motifList for IsoTracker
+            //Description: we will upload a motifList for IsoTracker
             //Only peptide with motif on N can be searched
             //In this case, only one kind of peptide can be searched: baseSequence PEPNINEN -> PEPN[Mod]INEN, PEPNIN[Mod]EN, PEPNINEN[Mod]
             // Run 1 with PEPNIN[Mod]EN, PEPNINEN[Mod]
@@ -1615,7 +1615,7 @@ public static void TestRun_SearchingTarget()
         [Test]
         public static void TestRun_IDChecking()
         {
-            //VariantCallFormatDataString: we will turn on the IDchecking for IsoTracker
+            //Description: we will turn on the IDchecking for IsoTracker
             //Only when one XIC with more than one id, we do the searching
             //In this case, run 1 has 4 ids (pepA_1, pepA_2, pepB_1, pepC_1)
             //run 2 has 3 ids (pepA_1, pepB_1, pepC_1)

From 8380b07bc0c754f93bc0504927765bce1c3bc93e Mon Sep 17 00:00:00 2001
From: MICHAEL SHORTREED <mrshortreed@wisc.edu>
Date: Sat, 25 Oct 2025 08:37:42 -0500
Subject: [PATCH 35/38] f

---
 mzLib/Omics/BioPolymer/VariantApplication.cs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index c5fd71ed5..737f17bd5 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -1,5 +1,4 @@
 ﻿using MzLibUtil;
-using Omics.BioPolymer;
 using Omics.Modifications;
 
 namespace Omics.BioPolymer

From 15940f00dc8ebddf55b4debb0708b16082b913d8 Mon Sep 17 00:00:00 2001
From: MICHAEL SHORTREED <mrshortreed@wisc.edu>
Date: Sat, 25 Oct 2025 08:44:40 -0500
Subject: [PATCH 36/38] new variant application constructor with more accurate
 parameters and names

---
 mzLib/Omics/BioPolymer/VariantApplication.cs | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index 737f17bd5..5b15184b6 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -13,6 +13,24 @@ namespace Omics.BioPolymer
     /// </summary>
     public static class VariantApplication
     {
+
+        public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this TBioPolymerType protein, int consensusPlusVariantIsoforms = 1, int minAlleleDepth = 0, int maxVariantsPerIsoform = 0)
+            where TBioPolymerType : IHasSequenceVariants
+        {
+            // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance
+            protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
+            protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
+
+            // If all variants are positionally valid and any is missing VCF/genotype data, fall back to bounded combinatorial application
+            if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0))
+            {
+                return ApplyAllVariantCombinations(protein, protein.SequenceVariations, consensusPlusVariantIsoforms).ToList();
+            }
+
+            // Otherwise, do genotype/allele-depth-aware application with combinatorics limited for heterozygous sites
+            return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: consensusPlusVariantIsoforms, minAlleleDepth);
+        }
+
         /// <summary>
         /// Builds concrete variant biopolymers from a base entity.
         /// If any known variation lacks genotype information (no VCF or empty genotypes), a safe combinatorial expansion is used (bounded by <paramref name="maxAllowedVariantsForCombinatorics"/>).

From 942a930694fdc0cda1c3c68a87c9c7561bfc789b Mon Sep 17 00:00:00 2001
From: MICHAEL SHORTREED <mrshortreed@wisc.edu>
Date: Sat, 25 Oct 2025 09:03:18 -0500
Subject: [PATCH 37/38] new VariantApplication method with added
 maxVariantsPerIsoform

---
 mzLib/Omics/BioPolymer/VariantApplication.cs | 42 +++++++-------------
 1 file changed, 15 insertions(+), 27 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index 5b15184b6..bd735b7c9 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -13,8 +13,14 @@ namespace Omics.BioPolymer
     /// </summary>
     public static class VariantApplication
     {
-
-        public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this TBioPolymerType protein, int consensusPlusVariantIsoforms = 1, int minAlleleDepth = 0, int maxVariantsPerIsoform = 0)
+        /// <summary>
+        /// Expand a base biopolymer into concrete consensus and variant isoforms by materializing substitution-style annotations
+        /// and either applying a bounded combinatorial expansion when genotype data is absent or performing genotype- and
+        /// allele-depth-aware variant application. The <paramref name="consensusPlusVariantIsoforms"/> parameter limits the
+        /// number of isoforms returned (including the consensus); <paramref name="minAlleleDepth"/> sets the minimum AD required
+        /// to consider an allele; <paramref name="maxVariantsPerIsoform"/> is reserved for per-isoform combinatoric caps.
+        /// </summary>
+        public static List<TBioPolymerType> GetConsensusAndVariantBioPolymers<TBioPolymerType>(this TBioPolymerType protein, int consensusPlusVariantIsoforms = 1, int minAlleleDepth = 0, int maxVariantsPerIsoform = 0)
             where TBioPolymerType : IHasSequenceVariants
         {
             // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance
@@ -32,35 +38,17 @@ public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this
         }
 
         /// <summary>
-        /// Builds concrete variant biopolymers from a base entity.
-        /// If any known variation lacks genotype information (no VCF or empty genotypes), a safe combinatorial expansion is used (bounded by <paramref name="maxAllowedVariantsForCombinatorics"/>).
-        /// Otherwise, variants are applied in a genotype/allele-depth-aware manner via <see cref="ApplyVariants{TBioPolymerType}(TBioPolymerType, IEnumerable{SequenceVariation}, int, int)"/>.
+        /// LEGACY: Expand a base biopolymer into consensus and variant isoforms by materializing substitution-style annotations
+        /// and then either performing a bounded combinatorial expansion when genotype data is absent or running a
+        /// genotype- and allele-depth-aware variant application. The <paramref name="consensusPlusVariantIsoforms"/>
+        /// parameter caps the total isoforms returned (including the consensus). This method is retained for backward
+        /// compatibility and may be removed in a future release — prefer <see cref="GetVariantBioPolymers{TBioPolymerType}"/>
+        /// or newer variant-expansion APIs.
         /// </summary>
-        /// <typeparam name="TBioPolymerType">A biopolymer type that supports sequence variants.</typeparam>
-        /// <param name="protein">The base biopolymer to expand into variant instances.</param>
-        /// <param name="maxAllowedVariantsForCombinatorics">
-        /// Maximum number of variants to consider when creating combinations for genotype-less scenarios.
-        /// This caps explosion in <see cref="ApplyAllVariantCombinations{TBioPolymerType}(TBioPolymerType, System.Collections.Generic.List{SequenceVariation}, int)"/>.
-        /// </param>
-        /// <param name="minAlleleDepth">
-        /// Minimum AD (Allele Depth) per sample required for an allele (reference or alternate) to be considered "deep" enough to participate in genotype-aware application.
-        /// </param>
-        /// <returns>A list of concrete variants derived from <paramref name="protein"/>.</returns>
         public static List<TBioPolymerType> GetVariantBioPolymers<TBioPolymerType>(this TBioPolymerType protein, int maxAllowedVariantsForCombinatorics = 4, int minAlleleDepth = 1)
             where TBioPolymerType : IHasSequenceVariants
         {
-            // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance
-            protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
-            protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
-
-            // If all variants are positionally valid and any is missing VCF/genotype data, fall back to bounded combinatorial application
-            if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0))
-            {
-                return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList();
-            }
-
-            // Otherwise, do genotype/allele-depth-aware application with combinatorics limited for heterozygous sites
-            return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: maxAllowedVariantsForCombinatorics, minAlleleDepth);
+            return GetConsensusAndVariantBioPolymers(protein, consensusPlusVariantIsoforms: maxAllowedVariantsForCombinatorics, minAlleleDepth: minAlleleDepth, maxVariantsPerIsoform: 0);
         }
 
         /// <summary>

From 5cac61fd6d20d334f01145113636f1880ac5afe9 Mon Sep 17 00:00:00 2001
From: MICHAEL SHORTREED <mrshortreed@wisc.edu>
Date: Sat, 25 Oct 2025 09:25:10 -0500
Subject: [PATCH 38/38] new methods to evaluate validity of sequence variants

---
 mzLib/Omics/BioPolymer/SequenceVariation.cs | 78 +++++++++++++++++++--
 1 file changed, 74 insertions(+), 4 deletions(-)

diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index 3b5ef13f0..6fe611d4f 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -307,13 +307,83 @@ internal bool Includes(int pos)
         }
 
         /// <summary>
-        /// Validates positional consistency: begin must be &gt; 0 and end must be &gt;= begin.
-        /// This does not validate string/length consistency between <see cref="OriginalSequence"/> and <see cref="VariantSequence"/>.
+        /// Validates this variation.
+        /// Rules:
+        /// 1. Coordinates must be sensible (begin >= 1 and end >= begin).
+        /// 2. Variation must represent a meaningful change:
+        ///    - Either the sequence actually changes (insertion, deletion, substitution, stop, frameshift),
+        ///    - OR there are variant-specific modifications.
+        ///    A “no-op” (OriginalSequence == VariantSequence with no variant-specific mods) is invalid.
+        /// 3. If variant-specific modifications exist, they must not violate positional constraints
+        ///    (see <see cref="GetInvalidModificationPositions"/>).
         /// </summary>
-        /// <returns>True if positions are valid; otherwise false.</returns>
         public bool AreValid()
         {
-            return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition;
+            if (OneBasedBeginPosition <= 0 || OneBasedEndPosition < OneBasedBeginPosition)
+            {
+                return false;
+            }
+
+            bool noSequenceChange = string.Equals(OriginalSequence ?? string.Empty,
+                                                  VariantSequence ?? string.Empty,
+                                                  StringComparison.Ordinal);
+
+            bool hasMods = OneBasedModifications != null && OneBasedModifications.Count > 0;
+
+            if (noSequenceChange && !hasMods)
+            {
+                return false;
+            }
+
+            if (!hasMods)
+            {
+                return true;
+            }
+
+            return !GetInvalidModificationPositions().Any();
+        }
+        /// <summary>
+        /// Yields modification positions deemed invalid under the current edit semantics.
+        /// </summary>
+        private IEnumerable<int> GetInvalidModificationPositions()
+        {
+            if (OneBasedModifications == null || OneBasedModifications.Count == 0)
+            {
+                yield break;
+            }
+
+            bool isTermination = VariantSequence == "*" || VariantSequence.Length == 0;
+
+            if (isTermination)
+            {
+                foreach (var kvp in OneBasedModifications)
+                {
+                    if (kvp.Key >= OneBasedBeginPosition)
+                    {
+                        yield return kvp.Key;
+                    }
+                }
+                yield break;
+            }
+
+            int newSpanEnd = OneBasedBeginPosition + VariantSequence.Length - 1;
+
+            foreach (var kvp in OneBasedModifications)
+            {
+                int pos = kvp.Key;
+                if (pos <= 0)
+                {
+                    yield return pos;
+                    continue;
+                }
+
+                if (pos >= OneBasedBeginPosition
+                    && pos <= OneBasedEndPosition
+                    && pos > newSpanEnd)
+                {
+                    yield return pos;
+                }
+            }
         }
     }
 }
\ No newline at end of file