From d09576c62f0308971f8af4f4724564470d5bf143 Mon Sep 17 00:00:00 2001 From: trishorts Date: Wed, 22 Oct 2025 15:13:05 -0500 Subject: [PATCH 01/38] rename sequence variant description as variant call format --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 10 +++--- mzLib/Omics/BioPolymer/VariantApplication.cs | 32 +++++++++---------- ...antDescription.cs => VariantCallFormat.cs} | 6 ++-- mzLib/Omics/IBioPolymerWithSetMods.cs | 2 +- mzLib/Proteomics/Protein/DisulfideBond.cs | 2 +- .../MsFraggerPeptide.cs | 2 +- .../MsFraggerProtein.cs | 2 +- .../IndividualResultRecords/MsFraggerPsm.cs | 2 +- .../SpectrumMatchFromTsvHeader.cs | 4 +-- mzLib/Readers/Thermo/ThermoRawFileReader.cs | 2 +- mzLib/Test/DatabaseTests/TestProteinReader.cs | 6 ++-- .../DatabaseTests/TestProteomicsReadWrite.cs | 8 ++--- .../Test/DatabaseTests/TestVariantProtein.cs | 6 ++-- mzLib/Test/FlashLFQ/TestIsoTracker.cs | 32 +++++++++---------- mzLib/Test/Transcriptomics/TestDbLoader.cs | 2 +- .../DecoyGeneration/DecoyProteinGenerator.cs | 16 +++++----- .../DecoyGeneration/RnaDecoyGenerator.cs | 4 +-- .../ProteinDbWriter.cs | 6 ++-- .../Transcriptomics/RnaDbLoader.cs | 2 +- 19 files changed, 73 insertions(+), 73 deletions(-) rename mzLib/Omics/BioPolymer/{SequenceVariantDescription.cs => VariantCallFormat.cs} (95%) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index 22f0347b4..d48df14b0 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -20,7 +20,7 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str OneBasedEndPosition = oneBasedEndPosition; OriginalSequence = originalSequence ?? ""; VariantSequence = variantSequence ?? ""; - Description = new SequenceVariantDescription(description); + CallFormat = new VariantCallFormat(description); OneBasedModifications = oneBasedModifications ?? 
new Dictionary>(); } @@ -58,9 +58,9 @@ public SequenceVariation(int oneBasedPosition, string originalSequence, string v public string VariantSequence { get; } /// - /// Description of this variation (optional) + /// CallFormat of this variation (optional) /// - public SequenceVariantDescription Description { get; } + public VariantCallFormat CallFormat { get; } /// /// Modifications specifically for this variant @@ -75,7 +75,7 @@ public override bool Equals(object obj) && OneBasedEndPosition == s.OneBasedEndPosition && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence)) && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence)) - && (s.Description == null && Description == null || Description.Equals(s.Description)) + && (s.CallFormat == null && CallFormat == null || CallFormat.Equals(s.CallFormat)) && (s.OneBasedModifications == null && OneBasedModifications == null || s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList()) && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList())); @@ -87,7 +87,7 @@ public override int GetHashCode() ^ OneBasedEndPosition.GetHashCode() ^ OriginalSequence.GetHashCode() // null handled in constructor ^ VariantSequence.GetHashCode() // null handled in constructor - ^ Description.GetHashCode(); // always constructed in constructor + ^ CallFormat.GetHashCode(); // always constructed in constructor } /// diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 1669c3afe..a9ee258f5 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -25,7 +25,7 @@ public static List GetVariantBioPolymers(this { protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); 
protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); - if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.Description == null || v.Description.Genotypes.Count == 0)) + if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.CallFormat == null || v.CallFormat.Genotypes.Count == 0)) { // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList(); @@ -100,7 +100,7 @@ public static List ApplyVariants(TBioPolymerTy List uniqueEffectsToApply = sequenceVariations .GroupBy(v => v.SimpleString()) .Select(x => x.First()) - .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line + .Where(v => v.CallFormat.Genotypes.Count > 0) // this is a VCF line .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first .ToList(); @@ -112,7 +112,7 @@ public static List ApplyVariants(TBioPolymerTy return new List { proteinCopy }; } - HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys)); + HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.CallFormat.Genotypes.Keys)); List variantProteins = new(); List newVariantProteins = new(); // loop through genotypes for each sample/individual (e.g. 
tumor and normal) @@ -121,17 +121,17 @@ public static List ApplyVariants(TBioPolymerTy newVariantProteins.Clear(); newVariantProteins.Add(proteinCopy); - bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; + bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.CallFormat.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; foreach (var variant in uniqueEffectsToApply) { - bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff + bool variantAlleleIsInTheGenotype = variant.CallFormat.Genotypes[individual].Contains(variant.CallFormat.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff if (!variantAlleleIsInTheGenotype) { continue; } - bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now. - bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; - bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; + bool isHomozygousAlternate = variant.CallFormat.Homozygous[individual] && variant.CallFormat.Genotypes[individual].All(d => d == variant.CallFormat.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. 
But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now. + bool isDeepReferenceAllele = int.TryParse(variant.CallFormat.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; + bool isDeepAlternateAllele = int.TryParse(variant.CallFormat.AlleleDepths[individual][variant.CallFormat.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; // homozygous alternate if (isHomozygousAlternate && isDeepAlternateAllele) @@ -141,7 +141,7 @@ public static List ApplyVariants(TBioPolymerTy // heterozygous basic // first protein with variants contains all homozygous variation, second contains all variations - else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants) + else if (variant.CallFormat.Heterozygous[individual] && tooManyHeterozygousVariants) { if (isDeepAlternateAllele && isDeepReferenceAllele) { @@ -170,7 +170,7 @@ public static List ApplyVariants(TBioPolymerTy } // heterozygous combinitorics - else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) + else if (variant.CallFormat.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) { List combinitoricProteins = new(); @@ -179,7 +179,7 @@ public static List ApplyVariants(TBioPolymerTy if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele) { // keep reference allele - if (variant.Description.Genotypes[individual].Contains("0")) + if (variant.CallFormat.Genotypes[individual].Contains("0")) { combinitoricProteins.Add(ppp); } @@ -191,7 +191,7 @@ public static List ApplyVariants(TBioPolymerTy { combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); } - else if (variant.Description.Genotypes[individual].Contains("0")) + else if (variant.CallFormat.Genotypes[individual].Contains("0")) { combinitoricProteins.Add(ppp); } @@ -224,7 +224,7 @@ private static TBioPolymerType 
ApplySingleVariant(SequenceVaria variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1, variantGettingApplied.OriginalSequence, variantGettingApplied.VariantSequence, - variantGettingApplied.Description.Description, + variantGettingApplied.CallFormat.Description, variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); // check to see if there is incomplete indel overlap, which would lead to weird variant sequences @@ -271,7 +271,7 @@ private static List AdjustSequenceVariationIndices(SequenceVa // variant was entirely before the one being applied (shouldn't happen because of order of applying variants) // or it's the current variation - if (v.Description.Equals(variantGettingApplied.Description) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) + if (v.CallFormat.Equals(variantGettingApplied.CallFormat) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) { variations.Add(v); } @@ -299,7 +299,7 @@ private static List AdjustSequenceVariationIndices(SequenceVa end, v.OriginalSequence, v.VariantSequence, - v.Description.Description, + v.CallFormat.Description, v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value))); } } @@ -425,7 +425,7 @@ private static string CombineSimpleStrings(IEnumerable? varia /// public static string CombineDescriptions(IEnumerable? variations) { - return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.Description)); + return variations.IsNullOrEmpty() ? 
"" : string.Join(", variant:", variations.Select(d => d.CallFormat)); } /// /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object, diff --git a/mzLib/Omics/BioPolymer/SequenceVariantDescription.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs similarity index 95% rename from mzLib/Omics/BioPolymer/SequenceVariantDescription.cs rename to mzLib/Omics/BioPolymer/VariantCallFormat.cs index 4b4b9d81a..386d97705 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariantDescription.cs +++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs @@ -4,9 +4,9 @@ namespace Omics.BioPolymer { - public class SequenceVariantDescription + public class VariantCallFormat { - public SequenceVariantDescription(string description) + public VariantCallFormat(string description) { Description = description; if (description == null) @@ -67,7 +67,7 @@ public override string ToString() public override bool Equals(object obj) { - SequenceVariantDescription s = obj as SequenceVariantDescription; + VariantCallFormat s = obj as VariantCallFormat; return s != null && s.Description == Description; } diff --git a/mzLib/Omics/IBioPolymerWithSetMods.cs b/mzLib/Omics/IBioPolymerWithSetMods.cs index e475a3b5d..f065238a4 100644 --- a/mzLib/Omics/IBioPolymerWithSetMods.cs +++ b/mzLib/Omics/IBioPolymerWithSetMods.cs @@ -25,7 +25,7 @@ public interface IBioPolymerWithSetMods : IHasChemicalFormula, IEquatable - /// Description of where the BioPolymerWithSetMods originated from examples include + /// Description of where the BioPolymerWithSetMods originated from examples include /// Top-down truncation: full-length proteoform C-terminal digestion truncation /// Top-down truncation: DECOY full-length proteoform N-terminal digestion truncation /// Bottom-up search: full diff --git a/mzLib/Proteomics/Protein/DisulfideBond.cs b/mzLib/Proteomics/Protein/DisulfideBond.cs index 6bd4fa1af..a338957e8 100644 --- a/mzLib/Proteomics/Protein/DisulfideBond.cs +++ 
b/mzLib/Proteomics/Protein/DisulfideBond.cs @@ -25,7 +25,7 @@ public DisulfideBond(int OneBasedPosition, string Description) public int OneBasedEndPosition { get; set; } /// - /// Description of this variation (optional) + /// Description of this disulfide bond (optional) /// public string Description { get; set; } diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs index 32def7c09..d7eac04ca 100644 --- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs @@ -79,7 +79,7 @@ public string ProteinName [Name("Gene")] public string Gene { get; set; } - [Name("Protein Description")] + [Name("Protein Description")] public string ProteinDescription { get; set; } [Name("Mapped Genes")] diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs index 37c043590..fea95ac3b 100644 --- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs @@ -38,7 +38,7 @@ public class MsFraggerProtein [Name("Organism")] public string Organism { get; set; } - [Name("Protein Description", "Description")] + [Name("Protein Description", "Description")] public string Description { get; set; } [Name("Protein Existence")] diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs index e5272a779..8ee977e6e 100644 --- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs @@ -124,7 +124,7 @@ public class MsFraggerPsm : IQuantifiableRecord [Name("Gene")] public string Gene { get; set; } - [Name("Protein 
Description")] + [Name("Protein Description")] public string ProteinDescription { get; set; } [Name("Mapped Genes")] diff --git a/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs b/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs index 7ca469da5..d71eac995 100644 --- a/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs +++ b/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs @@ -46,7 +46,7 @@ public class SpectrumMatchFromTsvHeader public const string SpliceSites = "Splice Sites"; public const string Contaminant = "Contaminant"; public const string Decoy = "Decoy"; - public const string Description = "Description"; + public const string Description = "Description"; public const string StartAndEndResiduesInFullSequence = "Start and End Residues In Full Sequence"; public const string PreviousResidue = "Previous Residue"; public const string NextResidue = "Next Residue"; @@ -78,7 +78,7 @@ public class SpectrumMatchFromTsvHeader public const string PeptideMonoMass = "Peptide Monoisotopic Mass"; public const string ProteinAccession = "Protein Accession"; public const string ProteinName = "Protein Name"; - public const string PeptideDescription = "Peptide Description"; + public const string PeptideDescription = "Peptide Description"; public const string StartAndEndResiduesInProtein = "Start and End Residues In Protein"; public const string PreviousAminoAcid = "Previous Amino Acid"; public const string NextAminoAcid = "Next Amino Acid"; diff --git a/mzLib/Readers/Thermo/ThermoRawFileReader.cs b/mzLib/Readers/Thermo/ThermoRawFileReader.cs index 7ed5456ea..7a61a0639 100644 --- a/mzLib/Readers/Thermo/ThermoRawFileReader.cs +++ b/mzLib/Readers/Thermo/ThermoRawFileReader.cs @@ -306,7 +306,7 @@ private static MsDataScan GetOneBasedScan(IRawDataPlus rawFile, IFilteringParams HcdEnergy = values[i]; } - if (labels[i].StartsWith("Scan Description", 
StringComparison.Ordinal)) + if (labels[i].StartsWith("Scan Description", StringComparison.Ordinal)) { scanDescript = values[i].TrimEnd(); } diff --git a/mzLib/Test/DatabaseTests/TestProteinReader.cs b/mzLib/Test/DatabaseTests/TestProteinReader.cs index 7dcd0b4d8..cbfc4c603 100644 --- a/mzLib/Test/DatabaseTests/TestProteinReader.cs +++ b/mzLib/Test/DatabaseTests/TestProteinReader.cs @@ -125,7 +125,7 @@ public static void XmlTest() Assert.AreEqual(64, ok[0].SequenceVariations.First().OneBasedEndPosition); Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedEndPosition); - Assert.AreNotEqual(ok[0].SequenceVariations.First().Description, ok[1].SequenceVariations.First().Description); //decoys and target variations don't have the same desc. + Assert.AreNotEqual(ok[0].SequenceVariations.First().CallFormat, ok[1].SequenceVariations.First().CallFormat); //decoys and target variations don't have the same desc. 
Assert.AreEqual("Homo sapiens", ok[1].Organism); } @@ -420,8 +420,8 @@ public static void TestReverseDecoyXML_WithCustomIdentifier() foreach (var variant in protein.AppliedSequenceVariations) { - Assert.That(variant.Description, Does.StartWith("rev")); - Assert.That(variant.Description, Does.Not.StartWith("DECOY")); + Assert.That(variant.CallFormat, Does.StartWith("rev")); + Assert.That(variant.CallFormat, Does.Not.StartWith("DECOY")); } foreach (var bond in protein.DisulfideBonds) diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs index babe44a76..07acea46b 100644 --- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs +++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs @@ -504,12 +504,12 @@ public void TestFullProteinReadWrite() Assert.AreEqual(originalProtein.TruncationProducts.First().OneBasedEndPosition, proteinReadFromXml[0].TruncationProducts.First().OneBasedEndPosition); Assert.AreEqual(originalProtein.TruncationProducts.First().Type, proteinReadFromXml[0].TruncationProducts.First().Type.Split('(')[0]); - Assert.AreEqual(originalProtein.SequenceVariations.First().Description, proteinReadFromXml[0].SequenceVariations.First().Description); + Assert.AreEqual(originalProtein.SequenceVariations.First().CallFormat, proteinReadFromXml[0].SequenceVariations.First().CallFormat); Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedEndPosition); Assert.AreEqual(originalProtein.SequenceVariations.First().OriginalSequence, proteinReadFromXml[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(originalProtein.SequenceVariations.First().VariantSequence, proteinReadFromXml[0].SequenceVariations.First().VariantSequence); - 
Assert.AreEqual(originalProtein.SequenceVariations.Last().Description, proteinReadFromXml[0].SequenceVariations.Last().Description); + Assert.AreEqual(originalProtein.SequenceVariations.Last().CallFormat, proteinReadFromXml[0].SequenceVariations.Last().CallFormat); Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedBeginPosition); Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedEndPosition); Assert.AreEqual(originalProtein.SequenceVariations.Last().OriginalSequence, proteinReadFromXml[0].SequenceVariations.Last().OriginalSequence); @@ -534,7 +534,7 @@ public void TestReadWriteSeqVars() Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count()); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition); - Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description); + Assert.AreEqual(ok[0].SequenceVariations.First().CallFormat, ok2[0].SequenceVariations.First().CallFormat); Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence); } @@ -557,7 +557,7 @@ public void TestReadWriteSeqVars2() Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count()); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition); - 
Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description); + Assert.AreEqual(ok[0].SequenceVariations.First().CallFormat, ok2[0].SequenceVariations.First().CallFormat); Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence); } diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 5580544ef..27d545cfe 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -115,7 +115,7 @@ public static void SeqVarXmlTest() { Assert.AreEqual(s.OriginalSequence, decoy.BaseSequence.Substring(s.OneBasedBeginPosition - 1, s.OneBasedEndPosition - s.OneBasedBeginPosition + 1)); } - Assert.AreNotEqual(target.SequenceVariations.First().Description, decoy.SequenceVariations.First().Description); //decoys and target variations don't have the same desc. + Assert.AreNotEqual(target.SequenceVariations.First().CallFormat, decoy.SequenceVariations.First().CallFormat); //decoys and target variations don't have the same desc. 
List peptides = ok.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList(); } @@ -490,7 +490,7 @@ public void VariantSymbolWeirdnessXml() string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "SeqVarSymbolWeirdness.xml"); List variantProteins = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un); Assert.AreEqual(12, variantProteins.First().ConsensusVariant.SequenceVariations.Count()); - Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.Description.Heterozygous.Any(kv => kv.Value))); + Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.CallFormat.Heterozygous.Any(kv => kv.Value))); Assert.AreEqual(1, variantProteins.Count); // Should be 2^2 from combinitorics of heterozygous, but the giant indels overwrite them Assert.AreEqual(0, variantProteins.Where(v => v.BaseSequence == variantProteins.First().ConsensusVariant.BaseSequence).Count()); // Homozygous variations are included @@ -749,7 +749,7 @@ public void Constructor_ParsesDescriptionCorrectly() // Act - var svd = new SequenceVariantDescription(description); + var svd = new VariantCallFormat(description); // Assert Assert.AreEqual(description, svd.Description); diff --git a/mzLib/Test/FlashLFQ/TestIsoTracker.cs b/mzLib/Test/FlashLFQ/TestIsoTracker.cs index 59720e4ff..562126ec8 100644 --- a/mzLib/Test/FlashLFQ/TestIsoTracker.cs +++ b/mzLib/Test/FlashLFQ/TestIsoTracker.cs @@ -25,7 +25,7 @@ internal class TestIsoTracker [Test] public static void TestIsobaricPeptideGroup() { - // Description: Test the IsobaricPeptideGroup class + // Description: Test the IsobaricPeptideGroup class // In this testing, we will create a new IsobaricPeptideGroup and check the properties List ids = new List { @@ -126,7 +126,7 @@ public static void TestIsoTrackerIdFilter_FilterPeptide() [Test] public static void TestGetTargeMz_case1() { - // Description: Test the GetTargetMz 
function in FlashLfqEngine + // Description: Test the GetTargetMz function in FlashLfqEngine // In this testing, we will check the isobaricPeptideGroup and targetMzs output // All three ids are isobaric peptides with the same monoisotopic mass, so they should be grouped together and generate only 5 target m/z values @@ -189,7 +189,7 @@ public static void TestGetTargeMz_case1() [Test] public static void TestGetTargeMz_case2() { - // Description: Test the GetTargetMz function in FlashLfqEngine + // Description: Test the GetTargetMz function in FlashLfqEngine // In this testing, we will check the isobaricPeptideGroup and targetMzs output // All three ids are isobaric peptides with the different monoisotopic mass, so they should be grouped together and generate 15(3*5) target m/z values @@ -253,7 +253,7 @@ public static void TestGetTargeMz_case2() [Test] public static void TestIndexPeakPrune() { - // Description: Test the peak indexing engine pruning function + // Description: Test the peak indexing engine pruning function // In this test, we will create the targetMzs from the ids to prune the indexPeaks. // After pruning, the index engine should only keep the peaks with the target m/z values. string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -329,7 +329,7 @@ public static void TestXICConstructor() [Test] public static void TestLinearSpline() { - //Description: Test the linear spline interpolation and differentiation + //Description: Test the linear spline interpolation and differentiation //The testing model is a linear function y = 100x, where x is the time point and y is the intensity //The slope will be 100 and the second derivative will be 0 @@ -361,7 +361,7 @@ public static void TestLinearSpline() [Test] public static void TestPeakAlignment() { - //Description: Test the peak alignment function + //Description: Test the peak alignment function //The testing model is a triangle peak with the Apex. 
//The Apex of three peaks are 3, 3.1, 2.9 min //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3 @@ -410,7 +410,7 @@ public static void TestPeakAlignment() [Test] public static void TestBuildSmoothedCubicSpline_LessPoint() { - //Description: Test the cubic spline interpolation + //Description: Test the cubic spline interpolation //The testing model has less than 5 points that cannot build the cubic spline //The cubic spline should be null @@ -531,7 +531,7 @@ public static void TestXICGroupConstructor() [Test] public static void TestXICGroup_RtDict() { - //Description: Test the peakAlignment function in the XICGroup + //Description: Test the peakAlignment function in the XICGroup //The testing has three normal distribution XIC peaks. //The Apex of three peaks are 3, 3.1, 2.9 min //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3 @@ -582,7 +582,7 @@ public static void TestXICGroup_RtDict() [Test] public static void TestXICGroup_IdList() { - //Description: Test the IdList in the XICGroup + //Description: Test the IdList in the XICGroup //The testing model has three XICs, one of this XIC has no Id, then it borrows one Id from the first XIC //If the Id is borrowed, the Id will not be added into the IdList //The IdList should contain the Ids from the first and the third XIC @@ -617,7 +617,7 @@ public static void TestXICGroup_IdList() [Test] public static void TestXICGroup_Tracking() { - //Description: Test the peak tracking function in the XICGroup + //Description: Test the peak tracking function in the XICGroup //The testing has three normal distribution XIC peaks. 
//The Apex of three peaks are 20, 23, 17 min //The time shift should be +3 min for the peak2 and -3 min for the peak3 @@ -688,7 +688,7 @@ public static void TestXICGroup_Tracking() [Test] public static void TestCombinedSearching() { - //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output + //CallFormat: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output //There are three XIC included isobaric peaks that with 3 min gap. string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -796,7 +796,7 @@ public static void TestCombinedSearching() [Test] public static void TestPeakOutput() { - //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output + //CallFormat: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output //There are three XIC included isobaric peaks that with 3 min gap. 
string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -931,7 +931,7 @@ public static void TestPeakOutput() [Test] public static void TestIsoSequence_Ambiguous() { - //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID + //CallFormat: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID //IsoID: DIVENY[Common Variable:Oxidation on M]FMR should be the same as DIVENYFM[Common Variable:Oxidation on M]R //Try to turn on the MBR and Isotracker at the same time @@ -1066,7 +1066,7 @@ public static void TestIsoSequence_Ambiguous() [Test] public static void TestIsoSequence_MonoIsotopicMassTolerance() { - //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID + //CallFormat: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID //IsoID: DIVENY[Common Variable:Oxidation on M]FMR should be the same as DIVENYFM[Common Variable:Oxidation on M]R //The Monoisotopic mass are 1201.5436, 1201.5437, 1201.5438, they should be recognized as the same IsoID @@ -1491,7 +1491,7 @@ public static void TestIsoSequence_CombinedTesting() [Test] public static void TestRun_SearchingTarget() { - //Description: we will upload a motifList for IsoTracker + //CallFormat: we will upload a motifList for IsoTracker //Only peptide with motif on N can be searched //In this case, only one kind of peptide can be searched: baseSequence PEPNINEN -> PEPN[Mod]INEN, PEPNIN[Mod]EN, PEPNINEN[Mod] // Run 1 with PEPNIN[Mod]EN, PEPNINEN[Mod] @@ -1615,7 +1615,7 @@ public static void TestRun_SearchingTarget() [Test] public static void TestRun_IDChecking() { - //Description: we will turn on the IDchecking for IsoTracker + //CallFormat: we will turn on the IDchecking for IsoTracker //Only when one XIC with more than one id, we do the searching //In this case, run 1 has 4 ids (pepA_1, pepA_2, 
pepB_1, pepC_1) //run 2 has 3 ids (pepA_1, pepB_1, pepC_1) diff --git a/mzLib/Test/Transcriptomics/TestDbLoader.cs b/mzLib/Test/Transcriptomics/TestDbLoader.cs index db5de6417..cea7bb7c7 100644 --- a/mzLib/Test/Transcriptomics/TestDbLoader.cs +++ b/mzLib/Test/Transcriptomics/TestDbLoader.cs @@ -291,7 +291,7 @@ public static void TestEnsemblFastaParsing() Assert.That(fields["GeneBiotype"], Is.EqualTo("snRNA")); Assert.That(fields["TranscriptBiotype"], Is.EqualTo("snRNA")); Assert.That(fields["GeneSymbol"], Is.EqualTo("U6")); - Assert.That(fields["Description"], Does.Contain("U6 spliceosomal RNA")); + Assert.That(fields["CallFormat"], Does.Contain("U6 spliceosomal RNA")); } diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs index c83c80aa1..949cf68a3 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs @@ -239,34 +239,34 @@ private static List ReverseSequenceVariations(IEnumerable 1 || sv.VariantSequence.Length > 1)) { string original = new string(originalArray).Substring(0, originalArray.Length - 1); string variant = new string(variationArray).Substring(0, variationArray.Length - 1); - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications)); } // gained an initiating methionine else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1) { - decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), 
new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications)); } // starting methionine, but no variations on it else if (startsWithM) { - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications)); } // no starting methionine else { - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications)); } } return decoyVariations; @@ -335,7 +335,7 @@ private static List GenerateSlideDecoys(List proteins, int max { variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, true)]; } - decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.Description)); + 
decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.CallFormat)); } else { @@ -352,7 +352,7 @@ private static List GenerateSlideDecoys(List proteins, int max variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, initMet)]; } - decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.Description)); + decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.CallFormat)); } } var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide, diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs index cc7723c15..731f80bc7 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs @@ -87,7 +87,7 @@ private static List GenerateReverseDecoys(List nucleicAcids, int maxThr var reverseModKey = indexMapping[modKvp.Key]; reverseModificationsForVariation.Add(reverseModKey, modKvp.Value); } - reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation)); + reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.CallFormat.Description, reverseModificationsForVariation)); } // Reverse Applied Variants @@ -101,7 +101,7 @@ private static List GenerateReverseDecoys(List nucleicAcids, int 
maxThr var reverseModKey = indexMapping[modKvp.Key]; reverseModificationsForVariation.Add(reverseModKey, modKvp.Value); } - reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation)); + reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.CallFormat.Description, reverseModificationsForVariation)); } // Reverse Truncations diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 24ff44dd4..2cff6c0af 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -206,7 +206,7 @@ public static Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary NcbiAssemblyFieldRegexes = From acae84e47201a80c3255d4662ff859929d7b4db4 Mon Sep 17 00:00:00 2001 From: trishorts Date: Wed, 22 Oct 2025 15:47:15 -0500 Subject: [PATCH 02/38] final unit test --- mzLib/Omics/BioPolymer/VariantCallFormat.cs | 326 ++++++++++++++++++-- 1 file changed, 293 insertions(+), 33 deletions(-) diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs index 386d97705..906d0ada0 100644 --- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs +++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs @@ -4,57 +4,236 @@ namespace Omics.BioPolymer { + /// + /// Plain-language wrapper for a single VCF record (a line in a VCF file) with + /// lightweight parsing of: + /// - Reference and alternate allele strings + /// - INFO (only passed through to for ANN-style annotations) + /// - FORMAT column and per-sample genotype fields + /// - Genotype (GT) tokens and Allelic Depth (AD) values + /// - Simple zygosity classification per sample + /// + /// Design goals: + /// - Fast, minimal allocation parsing for 
downstream proteomics / variant application. + /// - Tolerant of missing data ('.') without throwing. + /// - Avoids full VCF spec complexity (e.g., phased blocks, PL, GQ, allele remapping in multi-allelic normalization). + /// + /// Important assumptions / limitations: + /// 1. The input line MUST be tab-delimited. Literal "\t" sequences will NOT be interpreted as tabs. + /// 2. A valid VCF record is expected to contain at least the first 10 columns. If fewer are found, the constructor + /// returns early and most properties remain null / empty. + /// 3. Only the ANN sub-field of INFO is parsed (via ); all other INFO keys are ignored. + /// 4. FORMAT fields are assumed to be consistent across all samples; mismatched token counts throw. + /// 5. GT parsing: + /// - Splits on '/' or '|' and removes the separators. + /// - Missing alleles '.' are preserved in the parsed array. + /// - Unsupported allele indexes (>3) are still accepted if they appear (so long as they are numeric) – current validation allows 0–3 and '.'. + /// 6. Zygosity rules: + /// - Only non-missing (not ".") allele symbols are considered. + /// - No called alleles ⇒ . + /// - One distinct called allele ⇒ . + /// - More than one distinct called allele ⇒ . + /// 7. Backward compatibility booleans ( / ) are derived from the zygosity classification + /// and should be considered legacy conveniences. Prefer . + /// + /// Common usage pattern: + /// + /// var vcf = new VariantCallFormat(vcLine); + /// foreach (var (sampleId, gt) in vcf.Genotypes) + /// { + /// var z = vcf.ZygosityBySample[sampleId]; + /// var ad = vcf.AlleleDepths[sampleId]; + /// } + /// + /// public class VariantCallFormat { + /// + /// Zygosity classification per sample, derived ONLY from called (non-missing) allele symbols. + /// Missing-only genotype (e.g., "./.") ⇒ Unknown. + /// + public enum Zygosity { Unknown, Homozygous, Heterozygous } + + /// + /// True when the provided line was truncated (< 10 VCF columns). 
In this case: + /// - ReferenceAlleleString / AlternateAlleleString are null + /// - AlleleIndex = -1 + /// - Info is a safe empty annotation (never null) + /// - Format is an empty string + /// - Genotypes / AlleleDepths / zygosity maps are empty + /// + public bool IsTruncated { get; } + /// + /// Original raw VCF line. + /// + public string Description { get; } + + /// + /// REF allele text (may be null if constructor aborted). + /// + public string? ReferenceAlleleString { get; } + + /// + /// ALT allele(s) comma-delimited (may be null if constructor aborted). + /// + public string? AlternateAlleleString { get; } + + /// + /// Parsed snpEff-style annotation (ANN=*). All other INFO keys are ignored. + /// + public SnpEffAnnotation Info { get; } + + /// + /// FORMAT column descriptor (e.g., "GT:AD:DP"). Used to parse sample columns. + /// + public string Format { get; } + + /// + /// Per-sample genotype token arrays (GT split on '/' or '|'). + /// Keys are zero-based sample indices as strings ("0", "1", ...). + /// + public Dictionary Genotypes { get; } = new(); + + /// + /// Per-sample AD (allele depth) string arrays (the raw comma-separated numeric tokens, excluding empty entries). + /// Missing or invalid AD yields an empty array. + /// + public Dictionary AlleleDepths { get; } = new(); + + /// + /// 1-based index of the allele referenced by ANN’s Allele (1..N for ALT, 0 for REF). + /// -1 if the annotation's allele is missing or not found in ALT list. + /// + public int AlleleIndex { get; } + + /// + /// Legacy: per-sample boolean flags indicating homozygosity. + /// Prefer using . + /// + public Dictionary Homozygous { get; } = new(); + + /// + /// Legacy: per-sample boolean flags indicating heterozygosity. + /// Prefer using . + /// + public Dictionary Heterozygous { get; } = new(); + + /// + /// Per-sample zygosity classification derived from non-missing genotype alleles. 
+ /// + public Dictionary ZygosityBySample { get; } = new(); + + /// + /// Construct from a single, tab-delimited VCF record. + /// If fewer than 10 columns are present, parsing is aborted (object remains mostly unpopulated). + /// + /// Full raw VCF line (must contain actual tab characters). public VariantCallFormat(string description) { - Description = description; - if (description == null) + if (description is null) { + Description = string.Empty; + ReferenceAlleleString = null; + AlternateAlleleString = null; + Info = new SnpEffAnnotation(string.Empty); // safe empty annotation + Format = string.Empty; + AlleleIndex = -1; + IsTruncated = true; return; } + Description = description; + + // Back-compat: if no real tabs are present but literal "\t" sequences are, + // normalize them to actual tabs for parsing only. Leave Description intact. + string parseLine = NormalizeTabsForParsing(description); + // Parse description into - string[] vcfFields = description.Split(new[] { @"\t" }, StringSplitOptions.None); - if (vcfFields.Length < 10) { return; } + string[] vcfFields = parseLine.Split('\t'); + if (vcfFields.Length < 10) + { + ReferenceAlleleString = null; + AlternateAlleleString = null; + Info = new SnpEffAnnotation(string.Empty); // safe empty annotation + Format = string.Empty; + AlleleIndex = -1; + IsTruncated = true; + return; + } ReferenceAlleleString = vcfFields[3]; AlternateAlleleString = vcfFields[4]; Info = new SnpEffAnnotation(vcfFields[7]); - AlleleIndex = Info.Allele == null ? -1 : AlternateAlleleString.Split(',').ToList().IndexOf(Info.Allele) + 1; // reference is zero + + AlleleIndex = Info.Allele == null + ? 
-1 + : AlternateAlleleString.Split(',').ToList().IndexOf(Info.Allele) + 1; // reference is zero + + // Format column tokens describe how to split each sample column Format = vcfFields[8]; - string[] genotypes = Enumerable.Range(9, vcfFields.Length - 9).Select(i => vcfFields[i]).ToArray(); + + // Collect raw sample genotype strings (columns 9+) + string[] genotypes = Enumerable + .Range(9, vcfFields.Length - 9) + .Select(i => vcfFields[i]) + .ToArray(); // loop through genotypes for this variant (e.g. tumor and normal) + // Parse each sample for (int individual = 0; individual < genotypes.Length; individual++) { var genotypeFields = GenotypeDictionary(Format.Trim(), genotypes[individual].Trim()); - // parse genotype - string[] gt = null; - if (genotypeFields.TryGetValue("GT", out string gtString)) { gt = gtString.Split('/'); } - if (gt == null) { continue; } + // GT: split on '/' or '|' – separators removed intentionally. + string[] gt = genotypeFields.TryGetValue("GT", out var gtString) + ? gtString.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries) + : Array.Empty(); + + // Skip invalid or empty GT + if (gt.Length == 0 || !GTvaluesAreValid(gt)) + { + continue; + } + + // AD: optional – may be missing or contain '.' tokens + int[] adDepths; + string[] ad = genotypeFields.TryGetValue("AD", out var adString) && TryParseAD(adString, out adDepths) + ? adString.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries) + : Array.Empty(); + + string sampleKey = individual.ToString(); + Genotypes.Add(sampleKey, gt); + AlleleDepths.Add(sampleKey, ad); - // parse allele depth (might be null, technically, but shouldn't be in most use cases) - string[] ad = null; - if (genotypeFields.TryGetValue("AD", out string adString)) { ad = adString.Split(','); } + // Zygosity classification: ignore '.' 
when counting distinct alleles + var calledAlleles = gt.Where(a => a != ".").ToArray(); + Zygosity z; + if (calledAlleles.Length == 0) + { + z = Zygosity.Unknown; + } + else + { + int distinctCalled = calledAlleles.Distinct().Count(); + z = distinctCalled == 1 ? Zygosity.Homozygous : Zygosity.Heterozygous; + } + ZygosityBySample.Add(sampleKey, z); - Genotypes.Add(individual.ToString(), gt); - AlleleDepths.Add(individual.ToString(), ad); - Homozygous.Add(individual.ToString(), gt.Distinct().Count() == 1); - Heterozygous.Add(individual.ToString(), gt.Distinct().Count() > 1); + // Legacy boolean maps (retain for existing code paths) + Homozygous.Add(sampleKey, z == Zygosity.Homozygous); + Heterozygous.Add(sampleKey, z == Zygosity.Heterozygous); } } - public string Description { get; } - public string ReferenceAlleleString { get; } - public string AlternateAlleleString { get; } - public SnpEffAnnotation Info { get; } - public string Format { get; } - public Dictionary Homozygous { get; } = new Dictionary(); - public Dictionary Heterozygous { get; } = new Dictionary(); - public Dictionary Genotypes { get; } = new Dictionary(); - public Dictionary AlleleDepths { get; } = new Dictionary(); - public int AlleleIndex { get; } + private static string NormalizeTabsForParsing(string line) + { + // Fast path: already contains real tabs + if (line.IndexOf('\t') >= 0) return line; + + // Replace literal "\t" sequences for parsing only + return line.IndexOf("\\t", StringComparison.Ordinal) >= 0 + ? line.Replace("\\t", "\t") + : line; + } /// /// Returns original string for the description @@ -64,24 +243,28 @@ public override string ToString() { return Description; } - + /// + /// Equality is based solely on the original description string. + /// public override bool Equals(object obj) { VariantCallFormat s = obj as VariantCallFormat; return s != null && s.Description == Description; } - + /// + /// Hash code is derived from the original description (null-safe). 
+ /// public override int GetHashCode() { return (Description ?? "").GetHashCode(); } /// - /// Gets a dictionary of the format (key) and fields (value) for a genotype + /// Build a dictionary mapping FORMAT keys (e.g., GT, AD, DP) to the corresponding colon-delimited + /// values from a single sample column. Throws if token counts differ. /// - /// - /// - /// + /// FORMAT column (e.g., "GT:AD:DP"). + /// Sample column (e.g., "0/1:12,8:20"). internal static Dictionary GenotypeDictionary(string format, string genotype) { Dictionary genotypeDict = new Dictionary(); @@ -93,5 +276,82 @@ internal static Dictionary GenotypeDictionary(string format, str } return Enumerable.Range(0, formatSplit.Length).ToDictionary(x => formatSplit[x], x => genotypeSplit[x]); } + + /// + /// Validate that all genotype tokens are drawn from the accepted set {0,1,2,3,.}. + /// This is intentionally minimal; higher ALT indexes or symbolic alleles are not fully enforced here. + /// + public bool GTvaluesAreValid(string[] gt) + { + string[] validValues = { "0", "1", "2", "3", "." }; + return ValidationHelpers.TryValidateValues(gt.ToList(), validValues, out _); + } + + /// + /// Validate AD tokens: each must be "." or a non-negative integer. + /// Empty AD arrays are considered invalid (if AD is present it should have content or '.'). + /// + public bool ADvaluesAreValid(string[] ad) + { + if (ad is null || ad.Length == 0) return false; + foreach (var token in ad) + { + var s = token?.Trim(); + if (string.IsNullOrEmpty(s)) return false; + if (s == ".") continue; + if (!int.TryParse(s, out var n) || n < 0) return false; + } + return true; + } + + /// + /// Attempt to parse AD into integer depths (excluding "." entries). + /// Returns false if validation fails. On success, 'depths' contains only numeric values. 
+ /// + public bool TryParseAD(string adString, out int[] depths) + { + depths = Array.Empty(); + if (string.IsNullOrWhiteSpace(adString)) return false; + + var parts = adString.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries); + if (!ADvaluesAreValid(parts)) return false; + + depths = parts.Where(p => p != ".").Select(int.Parse).ToArray(); + return true; + } + + /// + /// Shared validation helper for small, fixed vocabularies of acceptable string tokens. + /// + public static class ValidationHelpers + { + /// + /// Returns true if all non-null, normalized values belong to the allowed set. + /// Produces a distinct list of invalid tokens (if any). + /// + public static bool TryValidateValues( + IEnumerable values, + IEnumerable allowedValues, + out string[] invalid, + bool ignoreCase = true, + bool trim = true) + { + var comparer = ignoreCase ? StringComparer.OrdinalIgnoreCase : StringComparer.Ordinal; + var allowed = new HashSet(allowedValues, comparer); + + IEnumerable Normalize(IEnumerable seq) => + seq + .Where(v => v is not null) + .Select(v => trim ? v!.Trim() : v!) 
+ .Where(v => v.Length > 0); + + var normalized = Normalize(values); + invalid = normalized + .Where(v => !allowed.Contains(v)) + .Distinct(comparer) + .ToArray(); + return invalid.Length == 0; + } + } } } \ No newline at end of file From ed23e208e0aa400ad773eebf10b6ad41899c24e3 Mon Sep 17 00:00:00 2001 From: trishorts Date: Wed, 22 Oct 2025 15:51:29 -0500 Subject: [PATCH 03/38] undo accidental change of descripton to call format --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 10 +++--- mzLib/Omics/BioPolymer/VariantApplication.cs | 32 +++++++++---------- mzLib/Omics/IBioPolymerWithSetMods.cs | 2 +- mzLib/Proteomics/Protein/DisulfideBond.cs | 2 +- .../MsFraggerPeptide.cs | 2 +- .../MsFraggerProtein.cs | 2 +- .../IndividualResultRecords/MsFraggerPsm.cs | 2 +- .../SpectrumMatchFromTsvHeader.cs | 4 +-- mzLib/Readers/Thermo/ThermoRawFileReader.cs | 2 +- mzLib/Test/DatabaseTests/TestProteinReader.cs | 6 ++-- .../DatabaseTests/TestProteomicsReadWrite.cs | 8 ++--- .../Test/DatabaseTests/TestVariantProtein.cs | 4 +-- mzLib/Test/FlashLFQ/TestIsoTracker.cs | 32 +++++++++---------- mzLib/Test/Transcriptomics/TestDbLoader.cs | 2 +- .../DecoyGeneration/DecoyProteinGenerator.cs | 16 +++++----- .../DecoyGeneration/RnaDecoyGenerator.cs | 4 +-- .../ProteinDbWriter.cs | 6 ++-- .../Transcriptomics/RnaDbLoader.cs | 2 +- 18 files changed, 69 insertions(+), 69 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index d48df14b0..c3c4ef9e2 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -20,7 +20,7 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str OneBasedEndPosition = oneBasedEndPosition; OriginalSequence = originalSequence ?? ""; VariantSequence = variantSequence ?? ""; - CallFormat = new VariantCallFormat(description); + Description = new VariantCallFormat(description); OneBasedModifications = oneBasedModifications ?? 
new Dictionary>(); } @@ -58,9 +58,9 @@ public SequenceVariation(int oneBasedPosition, string originalSequence, string v public string VariantSequence { get; } /// - /// CallFormat of this variation (optional) + /// Description of this variation (optional) /// - public VariantCallFormat CallFormat { get; } + public VariantCallFormat Description { get; } /// /// Modifications specifically for this variant @@ -75,7 +75,7 @@ public override bool Equals(object obj) && OneBasedEndPosition == s.OneBasedEndPosition && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence)) && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence)) - && (s.CallFormat == null && CallFormat == null || CallFormat.Equals(s.CallFormat)) + && (s.Description == null && Description == null || Description.Equals(s.Description)) && (s.OneBasedModifications == null && OneBasedModifications == null || s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList()) && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList())); @@ -87,7 +87,7 @@ public override int GetHashCode() ^ OneBasedEndPosition.GetHashCode() ^ OriginalSequence.GetHashCode() // null handled in constructor ^ VariantSequence.GetHashCode() // null handled in constructor - ^ CallFormat.GetHashCode(); // always constructed in constructor + ^ Description.GetHashCode(); // always constructed in constructor } /// diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index a9ee258f5..1669c3afe 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -25,7 +25,7 @@ public static List GetVariantBioPolymers(this { protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); 
protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); - if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.CallFormat == null || v.CallFormat.Genotypes.Count == 0)) + if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.Description == null || v.Description.Genotypes.Count == 0)) { // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList(); @@ -100,7 +100,7 @@ public static List ApplyVariants(TBioPolymerTy List uniqueEffectsToApply = sequenceVariations .GroupBy(v => v.SimpleString()) .Select(x => x.First()) - .Where(v => v.CallFormat.Genotypes.Count > 0) // this is a VCF line + .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first .ToList(); @@ -112,7 +112,7 @@ public static List ApplyVariants(TBioPolymerTy return new List { proteinCopy }; } - HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.CallFormat.Genotypes.Keys)); + HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys)); List variantProteins = new(); List newVariantProteins = new(); // loop through genotypes for each sample/individual (e.g. 
tumor and normal) @@ -121,17 +121,17 @@ public static List ApplyVariants(TBioPolymerTy newVariantProteins.Clear(); newVariantProteins.Add(proteinCopy); - bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.CallFormat.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; + bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; foreach (var variant in uniqueEffectsToApply) { - bool variantAlleleIsInTheGenotype = variant.CallFormat.Genotypes[individual].Contains(variant.CallFormat.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff + bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff if (!variantAlleleIsInTheGenotype) { continue; } - bool isHomozygousAlternate = variant.CallFormat.Homozygous[individual] && variant.CallFormat.Genotypes[individual].All(d => d == variant.CallFormat.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now. - bool isDeepReferenceAllele = int.TryParse(variant.CallFormat.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; - bool isDeepAlternateAllele = int.TryParse(variant.CallFormat.AlleleDepths[individual][variant.CallFormat.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; + bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. 
But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now. + bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; + bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; // homozygous alternate if (isHomozygousAlternate && isDeepAlternateAllele) @@ -141,7 +141,7 @@ public static List ApplyVariants(TBioPolymerTy // heterozygous basic // first protein with variants contains all homozygous variation, second contains all variations - else if (variant.CallFormat.Heterozygous[individual] && tooManyHeterozygousVariants) + else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants) { if (isDeepAlternateAllele && isDeepReferenceAllele) { @@ -170,7 +170,7 @@ public static List ApplyVariants(TBioPolymerTy } // heterozygous combinitorics - else if (variant.CallFormat.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) + else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) { List combinitoricProteins = new(); @@ -179,7 +179,7 @@ public static List ApplyVariants(TBioPolymerTy if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele) { // keep reference allele - if (variant.CallFormat.Genotypes[individual].Contains("0")) + if (variant.Description.Genotypes[individual].Contains("0")) { combinitoricProteins.Add(ppp); } @@ -191,7 +191,7 @@ public static List ApplyVariants(TBioPolymerTy { combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); } - else if (variant.CallFormat.Genotypes[individual].Contains("0")) + else if (variant.Description.Genotypes[individual].Contains("0")) { combinitoricProteins.Add(ppp); } @@ -224,7 +224,7 @@ private static TBioPolymerType 
ApplySingleVariant(SequenceVaria variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1, variantGettingApplied.OriginalSequence, variantGettingApplied.VariantSequence, - variantGettingApplied.CallFormat.Description, + variantGettingApplied.Description.Description, variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); // check to see if there is incomplete indel overlap, which would lead to weird variant sequences @@ -271,7 +271,7 @@ private static List AdjustSequenceVariationIndices(SequenceVa // variant was entirely before the one being applied (shouldn't happen because of order of applying variants) // or it's the current variation - if (v.CallFormat.Equals(variantGettingApplied.CallFormat) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) + if (v.Description.Equals(variantGettingApplied.Description) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) { variations.Add(v); } @@ -299,7 +299,7 @@ private static List AdjustSequenceVariationIndices(SequenceVa end, v.OriginalSequence, v.VariantSequence, - v.CallFormat.Description, + v.Description.Description, v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value))); } } @@ -425,7 +425,7 @@ private static string CombineSimpleStrings(IEnumerable? varia /// public static string CombineDescriptions(IEnumerable? variations) { - return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.CallFormat)); + return variations.IsNullOrEmpty() ? 
"" : string.Join(", variant:", variations.Select(d => d.Description)); } /// /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object, diff --git a/mzLib/Omics/IBioPolymerWithSetMods.cs b/mzLib/Omics/IBioPolymerWithSetMods.cs index f065238a4..e475a3b5d 100644 --- a/mzLib/Omics/IBioPolymerWithSetMods.cs +++ b/mzLib/Omics/IBioPolymerWithSetMods.cs @@ -25,7 +25,7 @@ public interface IBioPolymerWithSetMods : IHasChemicalFormula, IEquatable - /// CallFormat of where the BioPolymerWithSetMods originated from examples include + /// Description of where the BioPolymerWithSetMods originated from examples include /// Top-down truncation: full-length proteoform C-terminal digestion truncation /// Top-down truncation: DECOY full-length proteoform N-terminal digestion truncation /// Bottom-up search: full diff --git a/mzLib/Proteomics/Protein/DisulfideBond.cs b/mzLib/Proteomics/Protein/DisulfideBond.cs index a338957e8..6bd4fa1af 100644 --- a/mzLib/Proteomics/Protein/DisulfideBond.cs +++ b/mzLib/Proteomics/Protein/DisulfideBond.cs @@ -25,7 +25,7 @@ public DisulfideBond(int OneBasedPosition, string Description) public int OneBasedEndPosition { get; set; } /// - /// CallFormat of this variation (optional) + /// Description of this variation (optional) /// public string Description { get; set; } diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs index d7eac04ca..32def7c09 100644 --- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPeptide.cs @@ -79,7 +79,7 @@ public string ProteinName [Name("Gene")] public string Gene { get; set; } - [Name("Protein CallFormat")] + [Name("Protein Description")] public string ProteinDescription { get; set; } [Name("Mapped Genes")] diff --git 
a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs index fea95ac3b..37c043590 100644 --- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerProtein.cs @@ -38,7 +38,7 @@ public class MsFraggerProtein [Name("Organism")] public string Organism { get; set; } - [Name("Protein CallFormat", "CallFormat")] + [Name("Protein Description", "Description")] public string Description { get; set; } [Name("Protein Existence")] diff --git a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs index 8ee977e6e..e5272a779 100644 --- a/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs +++ b/mzLib/Readers/ExternalResults/IndividualResultRecords/MsFraggerPsm.cs @@ -124,7 +124,7 @@ public class MsFraggerPsm : IQuantifiableRecord [Name("Gene")] public string Gene { get; set; } - [Name("Protein CallFormat")] + [Name("Protein Description")] public string ProteinDescription { get; set; } [Name("Mapped Genes")] diff --git a/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs b/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs index d71eac995..7ca469da5 100644 --- a/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs +++ b/mzLib/Readers/InternalResults/IndividualResultRecords/SpectrumMatchFromTsvHeader.cs @@ -46,7 +46,7 @@ public class SpectrumMatchFromTsvHeader public const string SpliceSites = "Splice Sites"; public const string Contaminant = "Contaminant"; public const string Decoy = "Decoy"; - public const string Description = "CallFormat"; + public const string Description = "Description"; public const string StartAndEndResiduesInFullSequence = "Start and End Residues In Full Sequence"; public 
const string PreviousResidue = "Previous Residue"; public const string NextResidue = "Next Residue"; @@ -78,7 +78,7 @@ public class SpectrumMatchFromTsvHeader public const string PeptideMonoMass = "Peptide Monoisotopic Mass"; public const string ProteinAccession = "Protein Accession"; public const string ProteinName = "Protein Name"; - public const string PeptideDescription = "Peptide CallFormat"; + public const string PeptideDescription = "Peptide Description"; public const string StartAndEndResiduesInProtein = "Start and End Residues In Protein"; public const string PreviousAminoAcid = "Previous Amino Acid"; public const string NextAminoAcid = "Next Amino Acid"; diff --git a/mzLib/Readers/Thermo/ThermoRawFileReader.cs b/mzLib/Readers/Thermo/ThermoRawFileReader.cs index 7a61a0639..7ed5456ea 100644 --- a/mzLib/Readers/Thermo/ThermoRawFileReader.cs +++ b/mzLib/Readers/Thermo/ThermoRawFileReader.cs @@ -306,7 +306,7 @@ private static MsDataScan GetOneBasedScan(IRawDataPlus rawFile, IFilteringParams HcdEnergy = values[i]; } - if (labels[i].StartsWith("Scan CallFormat", StringComparison.Ordinal)) + if (labels[i].StartsWith("Scan Description", StringComparison.Ordinal)) { scanDescript = values[i].TrimEnd(); } diff --git a/mzLib/Test/DatabaseTests/TestProteinReader.cs b/mzLib/Test/DatabaseTests/TestProteinReader.cs index cbfc4c603..7dcd0b4d8 100644 --- a/mzLib/Test/DatabaseTests/TestProteinReader.cs +++ b/mzLib/Test/DatabaseTests/TestProteinReader.cs @@ -125,7 +125,7 @@ public static void XmlTest() Assert.AreEqual(64, ok[0].SequenceVariations.First().OneBasedEndPosition); Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedEndPosition); - Assert.AreNotEqual(ok[0].SequenceVariations.First().CallFormat, ok[1].SequenceVariations.First().CallFormat); //decoys and target variations don't have the same desc. 
+ Assert.AreNotEqual(ok[0].SequenceVariations.First().Description, ok[1].SequenceVariations.First().Description); //decoys and target variations don't have the same desc. Assert.AreEqual("Homo sapiens", ok[1].Organism); } @@ -420,8 +420,8 @@ public static void TestReverseDecoyXML_WithCustomIdentifier() foreach (var variant in protein.AppliedSequenceVariations) { - Assert.That(variant.CallFormat, Does.StartWith("rev")); - Assert.That(variant.CallFormat, Does.Not.StartWith("DECOY")); + Assert.That(variant.Description, Does.StartWith("rev")); + Assert.That(variant.Description, Does.Not.StartWith("DECOY")); } foreach (var bond in protein.DisulfideBonds) diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs index 07acea46b..babe44a76 100644 --- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs +++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs @@ -504,12 +504,12 @@ public void TestFullProteinReadWrite() Assert.AreEqual(originalProtein.TruncationProducts.First().OneBasedEndPosition, proteinReadFromXml[0].TruncationProducts.First().OneBasedEndPosition); Assert.AreEqual(originalProtein.TruncationProducts.First().Type, proteinReadFromXml[0].TruncationProducts.First().Type.Split('(')[0]); - Assert.AreEqual(originalProtein.SequenceVariations.First().CallFormat, proteinReadFromXml[0].SequenceVariations.First().CallFormat); + Assert.AreEqual(originalProtein.SequenceVariations.First().Description, proteinReadFromXml[0].SequenceVariations.First().Description); Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedEndPosition); Assert.AreEqual(originalProtein.SequenceVariations.First().OriginalSequence, proteinReadFromXml[0].SequenceVariations.First().OriginalSequence); 
Assert.AreEqual(originalProtein.SequenceVariations.First().VariantSequence, proteinReadFromXml[0].SequenceVariations.First().VariantSequence); - Assert.AreEqual(originalProtein.SequenceVariations.Last().CallFormat, proteinReadFromXml[0].SequenceVariations.Last().CallFormat); + Assert.AreEqual(originalProtein.SequenceVariations.Last().Description, proteinReadFromXml[0].SequenceVariations.Last().Description); Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedBeginPosition); Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedEndPosition); Assert.AreEqual(originalProtein.SequenceVariations.Last().OriginalSequence, proteinReadFromXml[0].SequenceVariations.Last().OriginalSequence); @@ -534,7 +534,7 @@ public void TestReadWriteSeqVars() Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count()); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition); - Assert.AreEqual(ok[0].SequenceVariations.First().CallFormat, ok2[0].SequenceVariations.First().CallFormat); + Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description); Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence); } @@ -557,7 +557,7 @@ public void TestReadWriteSeqVars2() Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count()); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition); 
Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition); - Assert.AreEqual(ok[0].SequenceVariations.First().CallFormat, ok2[0].SequenceVariations.First().CallFormat); + Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description); Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence); } diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 27d545cfe..8ced7b208 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -115,7 +115,7 @@ public static void SeqVarXmlTest() { Assert.AreEqual(s.OriginalSequence, decoy.BaseSequence.Substring(s.OneBasedBeginPosition - 1, s.OneBasedEndPosition - s.OneBasedBeginPosition + 1)); } - Assert.AreNotEqual(target.SequenceVariations.First().CallFormat, decoy.SequenceVariations.First().CallFormat); //decoys and target variations don't have the same desc. + Assert.AreNotEqual(target.SequenceVariations.First().Description, decoy.SequenceVariations.First().Description); //decoys and target variations don't have the same desc. 
List peptides = ok.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList(); } @@ -490,7 +490,7 @@ public void VariantSymbolWeirdnessXml() string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "SeqVarSymbolWeirdness.xml"); List variantProteins = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un); Assert.AreEqual(12, variantProteins.First().ConsensusVariant.SequenceVariations.Count()); - Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.CallFormat.Heterozygous.Any(kv => kv.Value))); + Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.Description.Heterozygous.Any(kv => kv.Value))); Assert.AreEqual(1, variantProteins.Count); // Should be 2^2 from combinitorics of heterozygous, but the giant indels overwrite them Assert.AreEqual(0, variantProteins.Where(v => v.BaseSequence == variantProteins.First().ConsensusVariant.BaseSequence).Count()); // Homozygous variations are included diff --git a/mzLib/Test/FlashLFQ/TestIsoTracker.cs b/mzLib/Test/FlashLFQ/TestIsoTracker.cs index 562126ec8..59720e4ff 100644 --- a/mzLib/Test/FlashLFQ/TestIsoTracker.cs +++ b/mzLib/Test/FlashLFQ/TestIsoTracker.cs @@ -25,7 +25,7 @@ internal class TestIsoTracker [Test] public static void TestIsobaricPeptideGroup() { - // CallFormat: Test the IsobaricPeptideGroup class + // Description: Test the IsobaricPeptideGroup class // In this testing, we will create a new IsobaricPeptideGroup and check the properties List ids = new List { @@ -126,7 +126,7 @@ public static void TestIsoTrackerIdFilter_FilterPeptide() [Test] public static void TestGetTargeMz_case1() { - // CallFormat: Test the GetTargetMz function in FlashLfqEngine + // Description: Test the GetTargetMz function in FlashLfqEngine // In this testing, we will check the isobaricPeptideGroup and targetMzs output // All three ids are isobaric peptides with the same monoisotopic mass, 
so they should be grouped together and generate only 5 target m/z values @@ -189,7 +189,7 @@ public static void TestGetTargeMz_case1() [Test] public static void TestGetTargeMz_case2() { - // CallFormat: Test the GetTargetMz function in FlashLfqEngine + // Description: Test the GetTargetMz function in FlashLfqEngine // In this testing, we will check the isobaricPeptideGroup and targetMzs output // All three ids are isobaric peptides with the different monoisotopic mass, so they should be grouped together and generate 15(3*5) target m/z values @@ -253,7 +253,7 @@ public static void TestGetTargeMz_case2() [Test] public static void TestIndexPeakPrune() { - // CallFormat: Test the peak indexing engine pruning function + // Description: Test the peak indexing engine pruning function // In this test, we will create the targetMzs from the ids to prune the indexPeaks. // After pruning, the index engine should only keep the peaks with the target m/z values. string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -329,7 +329,7 @@ public static void TestXICConstructor() [Test] public static void TestLinearSpline() { - //CallFormat: Test the linear spline interpolation and differentiation + //Description: Test the linear spline interpolation and differentiation //The testing model is a linear function y = 100x, where x is the time point and y is the intensity //The slope will be 100 and the second derivative will be 0 @@ -361,7 +361,7 @@ public static void TestLinearSpline() [Test] public static void TestPeakAlignment() { - //CallFormat: Test the peak alignment function + //Description: Test the peak alignment function //The testing model is a triangle peak with the Apex. 
//The Apex of three peaks are 3, 3.1, 2.9 min //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3 @@ -410,7 +410,7 @@ public static void TestPeakAlignment() [Test] public static void TestBuildSmoothedCubicSpline_LessPoint() { - //CallFormat: Test the cubic spline interpolation + //Description: Test the cubic spline interpolation //The testing model has less than 5 points that cannot build the cubic spline //The cubic spline should be null @@ -531,7 +531,7 @@ public static void TestXICGroupConstructor() [Test] public static void TestXICGroup_RtDict() { - //CallFormat: Test the peakAlignment function in the XICGroup + //Description: Test the peakAlignment function in the XICGroup //The testing has three normal distribution XIC peaks. //The Apex of three peaks are 3, 3.1, 2.9 min //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3 @@ -582,7 +582,7 @@ public static void TestXICGroup_RtDict() [Test] public static void TestXICGroup_IdList() { - //CallFormat: Test the IdList in the XICGroup + //Description: Test the IdList in the XICGroup //The testing model has three XICs, one of this XIC has no Id, then it borrows one Id from the first XIC //If the Id is borrowed, the Id will not be added into the IdList //The IdList should contain the Ids from the first and the third XIC @@ -617,7 +617,7 @@ public static void TestXICGroup_IdList() [Test] public static void TestXICGroup_Tracking() { - //CallFormat: Test the peak tracking function in the XICGroup + //Description: Test the peak tracking function in the XICGroup //The testing has three normal distribution XIC peaks. 
//The Apex of three peaks are 20, 23, 17 min //The time shift should be +3 min for the peak2 and -3 min for the peak3 @@ -688,7 +688,7 @@ public static void TestXICGroup_Tracking() [Test] public static void TestCombinedSearching() { - //CallFormat: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output + //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output //There are three XIC included isobaric peaks that with 3 min gap. string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -796,7 +796,7 @@ public static void TestCombinedSearching() [Test] public static void TestPeakOutput() { - //CallFormat: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output + //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output //There are three XIC included isobaric peaks that with 3 min gap. 
string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -931,7 +931,7 @@ public static void TestPeakOutput() [Test] public static void TestIsoSequence_Ambiguous() { - //CallFormat: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID + //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID //IsoID: DIVENY[Common Variable:Oxidation on M]FMR should be the same as DIVENYFM[Common Variable:Oxidation on M]R //Try to turn on the MBR and Isotracker at the same time @@ -1066,7 +1066,7 @@ public static void TestIsoSequence_Ambiguous() [Test] public static void TestIsoSequence_MonoIsotopicMassTolerance() { - //CallFormat: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID + //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID //IsoID: DIVENY[Common Variable:Oxidation on M]FMR should be the same as DIVENYFM[Common Variable:Oxidation on M]R //The Monoisotopic mass are 1201.5436, 1201.5437, 1201.5438, they should be recognized as the same IsoID @@ -1491,7 +1491,7 @@ public static void TestIsoSequence_CombinedTesting() [Test] public static void TestRun_SearchingTarget() { - //CallFormat: we will upload a motifList for IsoTracker + //Description: we will upload a motifList for IsoTracker //Only peptide with motif on N can be searched //In this case, only one kind of peptide can be searched: baseSequence PEPNINEN -> PEPN[Mod]INEN, PEPNIN[Mod]EN, PEPNINEN[Mod] // Run 1 with PEPNIN[Mod]EN, PEPNINEN[Mod] @@ -1615,7 +1615,7 @@ public static void TestRun_SearchingTarget() [Test] public static void TestRun_IDChecking() { - //CallFormat: we will turn on the IDchecking for IsoTracker + //Description: we will turn on the IDchecking for IsoTracker //Only when one XIC with more than one id, we do the searching //In this case, run 1 has 4 ids (pepA_1, pepA_2, 
pepB_1, pepC_1) //run 2 has 3 ids (pepA_1, pepB_1, pepC_1) diff --git a/mzLib/Test/Transcriptomics/TestDbLoader.cs b/mzLib/Test/Transcriptomics/TestDbLoader.cs index cea7bb7c7..db5de6417 100644 --- a/mzLib/Test/Transcriptomics/TestDbLoader.cs +++ b/mzLib/Test/Transcriptomics/TestDbLoader.cs @@ -291,7 +291,7 @@ public static void TestEnsemblFastaParsing() Assert.That(fields["GeneBiotype"], Is.EqualTo("snRNA")); Assert.That(fields["TranscriptBiotype"], Is.EqualTo("snRNA")); Assert.That(fields["GeneSymbol"], Is.EqualTo("U6")); - Assert.That(fields["CallFormat"], Does.Contain("U6 spliceosomal RNA")); + Assert.That(fields["Description"], Does.Contain("U6 spliceosomal RNA")); } diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs index 949cf68a3..c83c80aa1 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs @@ -239,34 +239,34 @@ private static List ReverseSequenceVariations(IEnumerable 1 || sv.VariantSequence.Length > 1)) { string original = new string(originalArray).Substring(0, originalArray.Length - 1); string variant = new string(variationArray).Substring(0, variationArray.Length - 1); - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); } // gained an initiating methionine else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1) { - decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), 
new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); } // starting methionine, but no variations on it else if (startsWithM) { - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); } // no starting methionine else { - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.CallFormat, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); } } return decoyVariations; @@ -335,7 +335,7 @@ private static List GenerateSlideDecoys(List proteins, int max { variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, true)]; } - decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.CallFormat)); + 
decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.Description)); } else { @@ -352,7 +352,7 @@ private static List GenerateSlideDecoys(List proteins, int max variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, initMet)]; } - decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.CallFormat)); + decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.Description)); } } var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide, diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs index 731f80bc7..cc7723c15 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs @@ -87,7 +87,7 @@ private static List GenerateReverseDecoys(List nucleicAcids, int maxThr var reverseModKey = indexMapping[modKvp.Key]; reverseModificationsForVariation.Add(reverseModKey, modKvp.Value); } - reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.CallFormat.Description, reverseModificationsForVariation)); + reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation)); } // Reverse Applied Variants @@ -101,7 +101,7 @@ private static List GenerateReverseDecoys(List nucleicAcids, int 
maxThr var reverseModKey = indexMapping[modKvp.Key]; reverseModificationsForVariation.Add(reverseModKey, modKvp.Value); } - reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.CallFormat.Description, reverseModificationsForVariation)); + reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation)); } // Reverse Truncations diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 2cff6c0af..24ff44dd4 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -206,7 +206,7 @@ public static Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary NcbiAssemblyFieldRegexes = From 1f5587bce81b3ff3b07d4bcbc8dd6d6ea610749e Mon Sep 17 00:00:00 2001 From: trishorts Date: Wed, 22 Oct 2025 15:55:29 -0500 Subject: [PATCH 04/38] more vcf tests --- mzLib/Test/DatabaseTests/VariantTests.cs | 123 +++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 mzLib/Test/DatabaseTests/VariantTests.cs diff --git a/mzLib/Test/DatabaseTests/VariantTests.cs b/mzLib/Test/DatabaseTests/VariantTests.cs new file mode 100644 index 000000000..ebaccf3bf --- /dev/null +++ b/mzLib/Test/DatabaseTests/VariantTests.cs @@ -0,0 +1,123 @@ +using NUnit.Framework; +using Omics.BioPolymer; +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace Test.DatabaseTests +{ + [TestFixture] + [System.Diagnostics.CodeAnalysis.ExcludeFromCodeCoverage] + public class VariantCallFormatTests + { + [Test] + public void ParseComprehensiveVcfExamples() + { + string current = TestContext.CurrentContext.TestDirectory; + string vcfPath = 
null; + while (current != null) + { + var candidate = Path.Combine(current, "Test", "DatabaseTests", "vcf_comprehensive_examples.vcf"); + if (File.Exists(candidate)) + { + vcfPath = candidate; + break; + } + current = Directory.GetParent(current)?.FullName; + } + + Assert.That(vcfPath, Is.Not.Null, "Could not locate vcf_comprehensive_examples.vcf"); + + var lines = File.ReadAllLines(vcfPath); + + var dataRows = lines + .Where(l => !string.IsNullOrWhiteSpace(l)) + .Where(l => !l.StartsWith("##")) + .Where(l => !l.StartsWith("#CHROM")) + .ToList(); + + Assert.That(dataRows.Count, Is.EqualTo(8), "Expected 8 example variant rows."); + + for (int rowIndex = 0; rowIndex < dataRows.Count; rowIndex++) + { + string originalLine = dataRows[rowIndex]; + string[] rawFields = originalLine.Split('\t'); + Assert.That(rawFields.Length, Is.GreaterThanOrEqualTo(10), $"Row {rowIndex + 1}: insufficient columns."); + + var vcf = new VariantCallFormat(originalLine); + + Assert.That(vcf.Description, Is.EqualTo(originalLine)); + Assert.That(vcf.ReferenceAlleleString, Is.EqualTo(rawFields[3])); + Assert.That(vcf.AlternateAlleleString, Is.EqualTo(rawFields[4])); + Assert.That(vcf.Format, Is.EqualTo(rawFields[8])); + + if (rawFields[7] == ".") + { + Assert.That(vcf.Info.Annotation, Is.EqualTo(rawFields[7])); + } + + var sampleFields = rawFields.Skip(9).ToArray(); + Assert.That(vcf.Genotypes.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.AlleleDepths.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.Homozygous.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.Heterozygous.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.ZygosityBySample.Count, Is.EqualTo(sampleFields.Length)); + + for (int sampleIndex = 0; sampleIndex < sampleFields.Length; sampleIndex++) + { + string sample = sampleFields[sampleIndex]; + string key = sampleIndex.ToString(); + + string[] parts = sample.Split(':'); + Assert.That(parts.Length, Is.EqualTo(vcf.Format.Split(':').Length)); + 
+ string gtPart = parts[0]; + string adPart = parts.Length > 1 ? parts[1] : null; + + // Expected GT tokens + string[] expectedGtTokens = gtPart.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries); + if (gtPart.Contains('.') && expectedGtTokens.Length == 1 && + (gtPart == "./." || gtPart == ".|." || gtPart == ".|1" || gtPart == "0|." || gtPart == "0/.")) + { + expectedGtTokens = new[] { ".", "." }; + } + + Assert.That(vcf.Genotypes.ContainsKey(key)); + var parsedGt = vcf.Genotypes[key]; + Assert.That(parsedGt, Is.EqualTo(expectedGtTokens)); + + // Expected AD tokens + string[] expectedAdTokens = + string.IsNullOrWhiteSpace(adPart) ? Array.Empty() : + adPart == "." ? new[] { "." } : + adPart.Split(','); + + Assert.That(vcf.AlleleDepths.ContainsKey(key)); + var parsedAd = vcf.AlleleDepths[key] ?? Array.Empty(); + if (!(parsedAd.Length == 0 && expectedAdTokens.Length == 1 && expectedAdTokens[0] == ".")) + { + Assert.That(parsedAd, Is.EqualTo(expectedAdTokens)); + } + + // Expected zygosity using ONLY non-missing alleles (must mirror implementation) + var calledAlleles = parsedGt.Where(a => a != ".").ToArray(); + bool expectedHom = calledAlleles.Length > 0 && calledAlleles.Distinct().Count() == 1; + bool expectedHet = calledAlleles.Distinct().Count() > 1; + VariantCallFormat.Zygosity expectedZ = + calledAlleles.Length == 0 + ? VariantCallFormat.Zygosity.Unknown + : expectedHet + ? 
VariantCallFormat.Zygosity.Heterozygous + : VariantCallFormat.Zygosity.Homozygous; + + Assert.That(vcf.Homozygous[key], Is.EqualTo(expectedHom)); + Assert.That(vcf.Heterozygous[key], Is.EqualTo(expectedHet)); + Assert.That(vcf.ZygosityBySample[key], Is.EqualTo(expectedZ)); + } + } + } + } +} From 3e965982abee58366d1e753f8f2daed97f743dc1 Mon Sep 17 00:00:00 2001 From: trishorts Date: Wed, 22 Oct 2025 15:57:57 -0500 Subject: [PATCH 05/38] comprehensive vcf examples --- .../vcf_comprehensive_examples.vcf | 30 +++++++++++++++++++ mzLib/Test/Test.csproj | 3 ++ 2 files changed, 33 insertions(+) create mode 100644 mzLib/Test/DatabaseTests/vcf_comprehensive_examples.vcf diff --git a/mzLib/Test/DatabaseTests/vcf_comprehensive_examples.vcf b/mzLib/Test/DatabaseTests/vcf_comprehensive_examples.vcf new file mode 100644 index 000000000..ef7fd0698 --- /dev/null +++ b/mzLib/Test/DatabaseTests/vcf_comprehensive_examples.vcf @@ -0,0 +1,30 @@ +##fileformat=VCFv4.2 +##INFO= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT SAMPLE1 SAMPLE2 SAMPLE3 SAMPLE4 + +## Example 1: Basic SNP with common genotypes +1 1000 . A G . PASS . GT:AD:DP 0/0:40,0:40 0/1:20,18:38 1/1:0,42:42 ./.:0,0:0 + +## Example 2: Multi-allelic site (REF=A, ALT=G,T) +1 2000 . A G,T . PASS . GT:AD:DP 0/2:25,0,20:45 1/2:0,15,12:27 2/2:0,0,30:30 0/0:35,0,0:35 + +## Example 3: Phased genotypes +1 3000 . C T . PASS . GT:AD:DP 0|1:22,18:40 1|0:21,19:40 .|1:.:25 0|.:.:30 + +## Example 4: Partial missing alleles +1 4000 . G A . PASS . GT:AD:DP 0|.:12,0:12 .|1:0,8:8 ./.:.:0 0/.:15,0:15 + +## Example 5: Low coverage and uneven allele balance +1 5000 . T C . PASS . GT:AD:DP 0/1:1,10:11 1/1:0,5:5 0/0:3,0:3 0/1:2,8:10 + +## Example 6: Multi-allelic with three ALT alleles (REF=A, ALT=G,T,C) +1 6000 . A G,T,C . PASS . GT:AD:DP 0/3:30,0,0,12:42 1/3:0,20,0,5:25 2/3:0,0,15,7:22 3/3:0,0,0,20:20 + +## Example 7: Zero depth and missing data +1 7000 . C G . PASS . GT:AD:DP ./.:0,0:0 0/0:0,0:0 0/1:.:. ./.:.:. 
+ +## Example 8: High-depth site +1 8000 . G A . PASS . GT:AD:DP 0/1:500,520:1020 1/1:0,1000:1000 0/0:950,0:950 0/1:480,500:980 diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index c2c444cf7..59c393b39 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -274,6 +274,9 @@ Always + + Always + Always From c0e4433a03415b9d0d151ffeb45c8934aa15ef3c Mon Sep 17 00:00:00 2001 From: trishorts Date: Wed, 22 Oct 2025 16:07:36 -0500 Subject: [PATCH 06/38] test invalid variant --- mzLib/Test/DatabaseTests/VariantTests.cs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mzLib/Test/DatabaseTests/VariantTests.cs b/mzLib/Test/DatabaseTests/VariantTests.cs index ebaccf3bf..e738e82c2 100644 --- a/mzLib/Test/DatabaseTests/VariantTests.cs +++ b/mzLib/Test/DatabaseTests/VariantTests.cs @@ -1,5 +1,6 @@ using NUnit.Framework; using Omics.BioPolymer; +using Omics.Modifications; using System; using System.Collections.Generic; using System.IO; @@ -119,5 +120,21 @@ public void ParseComprehensiveVcfExamples() } } } + [Test] + public void Constructor_InvalidCoordinates_ThrowsArgumentException() + { + // Minimal valid VCF line (10 columns) so VariantCallFormat parses without truncation. 
+ // Arrange: end < begin (invalid coordinates) + var sv = new SequenceVariation( + oneBasedBeginPosition: 5, + oneBasedEndPosition: 4, + originalSequence: "A", + variantSequence: "V", + description: "invalid-coords", + oneBasedModifications: null); + + // Assert: SequenceVariation does not throw on construction; it reports invalid via AreValid() + Assert.That(sv.AreValid(), Is.False); + } } } From 65c87293faab38576374045a297ba9d26db3f10e Mon Sep 17 00:00:00 2001 From: trishorts Date: Wed, 22 Oct 2025 16:24:57 -0500 Subject: [PATCH 07/38] add line to proteindbloader --- mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs index 143a9e327..a550bf422 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs @@ -108,6 +108,12 @@ public static List LoadProteinXML(string proteinDbLocation, bool genera Protein newProtein = block.ParseEndElement(xml, modTypesToExclude, unknownModifications, isContaminant, proteinDbLocation); if (newProtein != null) { + //If we have read any modifications that are nucleotide substitutions, convert them to sequence variants here: + //newProtein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); + if (newProtein.OneBasedPossibleLocalizedModifications.Any(m => m.Value.Any(mt => mt.ModificationType.Contains("nucleotide substitution")))) + { + newProtein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); + } if (addTruncations) { newProtein.AddTruncations(); From 302b80497c9f6997bc8919df676bf3fc8b7578a8 Mon Sep 17 00:00:00 2001 From: trishorts Date: Wed, 22 Oct 2025 16:42:03 -0500 Subject: [PATCH 08/38] unit tests --- mzLib/Test/DatabaseTests/VariantTests.cs | 197 +++++++++++++++++++++ mzLib/Test/Transcriptomics/TestDbLoader.cs | 1 - 2 files changed, 197 insertions(+), 1 deletion(-) 
diff --git a/mzLib/Test/DatabaseTests/VariantTests.cs b/mzLib/Test/DatabaseTests/VariantTests.cs index e738e82c2..603ca8b47 100644 --- a/mzLib/Test/DatabaseTests/VariantTests.cs +++ b/mzLib/Test/DatabaseTests/VariantTests.cs @@ -1,6 +1,7 @@ using NUnit.Framework; using Omics.BioPolymer; using Omics.Modifications; +using Proteomics; using System; using System.Collections.Generic; using System.IO; @@ -137,4 +138,200 @@ public void Constructor_InvalidCoordinates_ThrowsArgumentException() Assert.That(sv.AreValid(), Is.False); } } + [TestFixture] + public class VariantApplicationConvertNucleotideSubstitutionTests + { + // Helper to create a minimal substitution modification matching the required detection pattern + private static Modification Substitution(string idArrow) + { + // If you want this helper to be convertible by the code under test, + // give it a matching motif for the site where it will be placed. + // For now keep it generic (unused in this test). + return new Modification( + idArrow, // originalId + null, // accession + "1 nucleotide substitution", // modificationType + null, // featureType + null, // target motif + "Anywhere.", // location restriction + null, // chemical formula + 0, // monoisotopic mass + new Dictionary>(), // databaseReference + null, // taxonomicRange + null, // keywords + null, // neutralLosses + null, // diagnosticIons + null); // fileOrigin + } + + // Non-substitution (should be ignored) + private static Modification Other(string id, double mass = 15.9949) + { + // Generic oxidation at P motif (unused by main test path) + ModificationMotif.TryGetMotif("P", out var motifP); + return new Modification( + id, + null, + "oxidation", + null, + motifP, + "Anywhere.", + null, + mass, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + // Malformed substitution (no "->" pattern) must be ignored + private static Modification Malformed() + { + ModificationMotif.TryGetMotif("Q", out var motifQ); + return new Modification( + 
"E>A", + null, + "1 nucleotide substitution", + null, + motifQ, + "Anywhere.", + null, + 0, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + [Test] + public void ConvertNucleotideSubstitutionModificationsToSequenceVariants_Comprehensive() + { + // 1 M, 2 A, 3 E, 4 W, 5 P, 6 Q, 7 K + var protein = new Protein("MAEWPQK", "TEST_PROT"); + + static Modification MakeSub(string idArrow, char originalResidue) + { + ModificationMotif.TryGetMotif(originalResidue.ToString(), out var motif); + return new Modification( + idArrow, + null, + "1 nucleotide substitution", + null, + motif, + "Anywhere.", + null, + 0, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + static Modification MakeOther(string id) + { + ModificationMotif.TryGetMotif("P", out var motifP); + return new Modification( + id, + null, + "oxidation", + null, + motifP, + "Anywhere.", + null, + 15.9949, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + static Modification MakeMalformed() + { + ModificationMotif.TryGetMotif("Q", out var motifQ); + return new Modification( + "E>A", + null, + "1 nucleotide substitution", + null, + motifQ, + "Anywhere.", + null, + 0, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + void AddMod(Protein p, int pos, Modification m) + { + if (!p.OneBasedPossibleLocalizedModifications.TryGetValue(pos, out var list1)) + { + list1 = new List(); + p.OneBasedPossibleLocalizedModifications[pos] = list1; + } + list1.Add(m); + + if (!p.OriginalNonVariantModifications.TryGetValue(pos, out var list2)) + { + list2 = new List(); + p.OriginalNonVariantModifications[pos] = list2; + } + list2.Add(m); + } + + // Mods to seed + var modEtoA = MakeSub("E->A", 'E'); // pos 3 + var modWtoK = MakeSub("W->K", 'W'); // pos 4 + var modOxidP = MakeOther("Oxid_P"); // pos 5 + var malformed = MakeMalformed(); // pos 6 + + AddMod(protein, 3, modEtoA); + AddMod(protein, 4, modWtoK); + AddMod(protein, 5, modOxidP); + AddMod(protein, 6, 
malformed); + + // Pre-existing W->K (may be duplicated by converter if description differs) + protein.SequenceVariations.Add(new SequenceVariation(4, 4, "W", "K", "Existing substitution")); + + // Act + protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); + + // Assert unique AA changes, not raw count (converter may add standardized duplicates) + var uniqueChanges = protein.SequenceVariations.Select(v => v.SimpleString()).Distinct().ToList(); + Assert.That(uniqueChanges.Count, Is.EqualTo(2), "Expected exactly two unique substitutions (E3->A and W4->K)."); + + // Ensure E3->A exists + var eToA = protein.SequenceVariations.SingleOrDefault(v => + v.OneBasedBeginPosition == 3 && v.OneBasedEndPosition == 3 && + v.OriginalSequence == "E" && v.VariantSequence == "A"); + Assert.That(eToA, Is.Not.Null, "E3->A variant was not created."); + + // Ensure at least one W4->K exists + var wToKCount = protein.SequenceVariations.Count(v => + v.OneBasedBeginPosition == 4 && v.OneBasedEndPosition == 4 && + v.OriginalSequence == "W" && v.VariantSequence == "K"); + Assert.That(wToKCount, Is.GreaterThanOrEqualTo(1), "Expected a W4->K variant."); + + // Converted positions removed from OneBasedPossibleLocalizedModifications + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(3), Is.False); + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(4), Is.False); + + // Unrelated and malformed mods remain + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(5), Is.True); + Assert.That(protein.OneBasedPossibleLocalizedModifications[5].Any(m => m.OriginalId == "Oxid_P"), Is.True); + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(6), Is.True); + Assert.That(protein.OneBasedPossibleLocalizedModifications[6].Any(m => m.OriginalId == "E>A"), Is.True); + } + } } diff --git a/mzLib/Test/Transcriptomics/TestDbLoader.cs b/mzLib/Test/Transcriptomics/TestDbLoader.cs index db5de6417..a9e590d06 100644 --- 
a/mzLib/Test/Transcriptomics/TestDbLoader.cs +++ b/mzLib/Test/Transcriptomics/TestDbLoader.cs @@ -164,7 +164,6 @@ public static void TestXmlWriterReaderAsBioPolymer() var rna = RnaDbLoader.LoadRnaFasta(ModomicsUnmodifedFastaPath, true, DecoyType.None, false, out var errors) .Cast().ToList(); Assert.That(errors.Count, Is.EqualTo(0)); - var modString = "ID Methylation\r\nMT Biological\r\nPP Anywhere.\r\nTG G\r\nCF C1H2\r\n" + @"//"; var methylG = PtmListLoader.ReadModsFromString(modString, out List<(Modification, string)> modsOut).First(); From 676a89279661c29c0525eadc5bc4ac9b664c1b42 Mon Sep 17 00:00:00 2001 From: trishorts Date: Thu, 23 Oct 2025 09:59:16 -0500 Subject: [PATCH 09/38] chasing updates to variant call format --- mzLib/Omics/BioPolymer/VariantCallFormat.cs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs index 906d0ada0..dcadae128 100644 --- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs +++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs @@ -1,6 +1,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; namespace Omics.BioPolymer { @@ -164,9 +165,9 @@ public VariantCallFormat(string description) AlternateAlleleString = vcfFields[4]; Info = new SnpEffAnnotation(vcfFields[7]); - AlleleIndex = Info.Allele == null - ? -1 - : AlternateAlleleString.Split(',').ToList().IndexOf(Info.Allele) + 1; // reference is zero + AlleleIndex = Info.Allele == null + ? -1 + : AlternateAlleleString.Split(',').ToList().IndexOf(Info.Allele) + 1; // returns 1 - based index for ALT alleles, 0 if not found, -1 if Info.Allele is null // Format column tokens describe how to split each sample column Format = vcfFields[8]; @@ -185,7 +186,7 @@ public VariantCallFormat(string description) // GT: split on '/' or '|' – separators removed intentionally. string[] gt = genotypeFields.TryGetValue("GT", out var gtString) - ? 
gtString.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries) + ? gtString.Split(new[] { '/', '|' }) // if this is bugged, try adding ",StringSplitOptions.RemoveEmptyEntries" after { '/', '|' } : Array.Empty(); // Skip invalid or empty GT From fb1f1476a27db8b7e99ef1901d8f377086f7e36e Mon Sep 17 00:00:00 2001 From: trishorts Date: Thu, 23 Oct 2025 10:04:43 -0500 Subject: [PATCH 10/38] copilot suggestions --- mzLib/Omics/BioPolymer/VariantCallFormat.cs | 2 +- mzLib/Test/DatabaseTests/VariantTests.cs | 2 +- mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs index dcadae128..cd4cc6cb1 100644 --- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs +++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs @@ -149,7 +149,7 @@ public VariantCallFormat(string description) // normalize them to actual tabs for parsing only. Leave Description intact. string parseLine = NormalizeTabsForParsing(description); - // Parse description into + // Parse description into VCF fields string[] vcfFields = parseLine.Split('\t'); if (vcfFields.Length < 10) { diff --git a/mzLib/Test/DatabaseTests/VariantTests.cs b/mzLib/Test/DatabaseTests/VariantTests.cs index 603ca8b47..04b42bb99 100644 --- a/mzLib/Test/DatabaseTests/VariantTests.cs +++ b/mzLib/Test/DatabaseTests/VariantTests.cs @@ -99,7 +99,7 @@ public void ParseComprehensiveVcfExamples() Assert.That(vcf.AlleleDepths.ContainsKey(key)); var parsedAd = vcf.AlleleDepths[key] ?? 
Array.Empty(); - if (!(parsedAd.Length == 0 && expectedAdTokens.Length == 1 && expectedAdTokens[0] == ".")) + if (parsedAd.Length != 0 || expectedAdTokens.Length != 1 || expectedAdTokens[0] != ".") { Assert.That(parsedAd, Is.EqualTo(expectedAdTokens)); } diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs index a550bf422..9567ba6c1 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs @@ -109,7 +109,6 @@ public static List LoadProteinXML(string proteinDbLocation, bool genera if (newProtein != null) { //If we have read any modifications that are nucleotide substitutions, convert them to sequence variants here: - //newProtein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); if (newProtein.OneBasedPossibleLocalizedModifications.Any(m => m.Value.Any(mt => mt.ModificationType.Contains("nucleotide substitution")))) { newProtein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); From 0c327b80ab5ad63e2a7703782e19a9b1b38d29e1 Mon Sep 17 00:00:00 2001 From: trishorts Date: Thu, 23 Oct 2025 10:15:20 -0500 Subject: [PATCH 11/38] h --- mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs index bb8436392..c10f912bd 100644 --- a/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs +++ b/mzLib/Test/DatabaseTests/TestDatabaseLoaders.cs @@ -237,7 +237,7 @@ public void TestUpdateUniprot() var uniprotLocation = Path.Combine(TestContext.CurrentContext.TestDirectory, "ptmlist.txt"); Loaders.UpdateUniprot(uniprotLocation); Loaders.UpdateUniprot(uniprotLocation); - } + } [Test] public void FilesEqualHash() From c224908b84fcc397c3d871fccb9ed67a0c70332b Mon Sep 17 00:00:00 2001 From: trishorts Date: Thu, 23 Oct 2025 10:21:42 -0500 Subject: [PATCH 12/38] added new 
paramater location to proteinxmlentry to reference specific isoform --- .../ProteinXmlEntry.cs | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index 68aaa399b..57849158e 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -8,7 +8,6 @@ using Omics.Modifications; using Transcriptomics; using UsefulProteomicsDatabases.Transcriptomics; -using System.Data; using Proteomics.ProteolyticDigestion; namespace UsefulProteomicsDatabases @@ -53,10 +52,27 @@ public class ProteinXmlEntry public UniProtSequenceAttributes SequenceAttributes { get; set; } = null; // this is used to store the sequence attributes from the element, if present private List<(int, string)> AnnotatedMods = new List<(int position, string originalModificationID)>(); private List<(int, string)> AnnotatedVariantMods = new List<(int position, string originalModificationID)>(); + // Captured isoform/sequence identifier from + private string LocationSequenceId; /// - /// Start parsing a protein XML element + /// Finalizes the parsing of a protein XML entry and constructs a object. + /// This method is called when the end of an <entry> element is reached during XML parsing. + /// It sanitizes the sequence, prunes out-of-range sequence variants, resolves and attaches modifications, + /// and aggregates all parsed data (such as gene names, proteolysis products, sequence variations, disulfide bonds, and splice sites) + /// into a new instance. + /// After construction, the internal state is cleared to prepare for the next entry. /// + /// The positioned at the end of the <entry> element. + /// Indicates whether the protein is a contaminant. + /// The file path or identifier of the protein database source. + /// A collection of modification types to exclude from the protein. 
+ /// A dictionary to collect modifications that could not be resolved. + /// A string used to identify decoy proteins (default: "DECOY"). + /// + /// A constructed object containing all parsed and resolved information, + /// or null if the entry is incomplete. + /// public void ParseElement(string elementName, XmlReader xml) { int outValue; @@ -137,7 +153,9 @@ public void ParseElement(string elementName, XmlReader xml) PropertyTypes.Add(xml.GetAttribute("type")); PropertyValues.Add(xml.GetAttribute("value")); break; - + case "location": + LocationSequenceId = xml.GetAttribute("sequence"); + break; case "position": OneBasedFeaturePosition = int.Parse(xml.GetAttribute("position")); break; From c20aa20260965df49a7ba8bd8720e3133089b7e3 Mon Sep 17 00:00:00 2001 From: trishorts Date: Thu, 23 Oct 2025 10:38:30 -0500 Subject: [PATCH 13/38] primitive tweaks to SequenceVariation --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 11 +- .../ProteinXmlEntry.cs | 319 +++++++++++++----- 2 files changed, 246 insertions(+), 84 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index c3c4ef9e2..fc30b4082 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -14,13 +14,14 @@ public class SequenceVariation /// /// /// - public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) + public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string? variantCallFormatDataString = null, Dictionary>? oneBasedModifications = null) { OneBasedBeginPosition = oneBasedBeginPosition; OneBasedEndPosition = oneBasedEndPosition; OriginalSequence = originalSequence ?? ""; VariantSequence = variantSequence ?? 
""; Description = new VariantCallFormat(description); + VariantCallFormatData = variantCallFormatDataString is null ? null : new VariantCallFormat(variantCallFormatDataString); OneBasedModifications = oneBasedModifications ?? new Dictionary>(); } @@ -33,8 +34,8 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str /// /// /// - public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) - : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications) + public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, string? variantCallFormatDataString = null, Dictionary>? oneBasedModifications = null) + : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, variantCallFormatDataString = null, oneBasedModifications) { } /// @@ -62,6 +63,10 @@ public SequenceVariation(int oneBasedPosition, string originalSequence, string v /// public VariantCallFormat Description { get; } + /// Optional multi-sample VCF record describing the variant (can be null or collapsed). + public VariantCallFormat? VariantCallFormatData { get; } + [Obsolete("Use VariantCallFormatData for structured data or Description/SearchableAnnotation for text.")] + public VariantCallFormat? 
LegacyVariantDescription => VariantCallFormatData; /// /// Modifications specifically for this variant /// diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index 57849158e..922ebb44e 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -1,14 +1,15 @@ -using Proteomics; +using Omics.BioPolymer; +using Omics.Modifications; +using Proteomics; +using Proteomics.ProteolyticDigestion; using System; using System.Collections.Generic; +using System.Diagnostics; using System.Linq; using System.Text.RegularExpressions; using System.Xml; -using Omics.BioPolymer; -using Omics.Modifications; using Transcriptomics; using UsefulProteomicsDatabases.Transcriptomics; -using Proteomics.ProteolyticDigestion; namespace UsefulProteomicsDatabases { @@ -179,9 +180,12 @@ public void ParseElement(string elementName, XmlReader xml) } /// - /// Parses the attributes of the current element from the provided XmlReader. - /// Extracts and stores the values for dataset, created, modified, version, and xmlns attributes. + /// Parses and stores key metadata attributes from the current <entry> element in the XML. + /// This includes dataset, creation date, modification date, version, and XML namespace information. + /// The extracted values are assigned to the corresponding properties of the instance. + /// This method is typically called when the parser encounters the start of a protein entry in a UniProt or similar XML file. /// + /// The positioned at the <entry> element whose attributes are to be read. private void ParseEntryAttributes(XmlReader xml) { DatasetEntryTag = xml.GetAttribute("dataset"); @@ -286,10 +290,12 @@ private static UniProtSequenceAttributes.FragmentType ParseFragmentType(string f return UniProtSequenceAttributes.FragmentType.unspecified; } - // Helper method to compute the monoisotopic mass of a sequence. 
/// - /// Computes the monoisotopic mass of the given sequence. - /// Returns 0 if the sequence is empty. + /// Computes the monoisotopic mass of a protein or nucleic acid sequence without modifications. + /// If the input sequence is null or empty, returns 0. + /// Internally, constructs a using the provided sequence and an empty modification dictionary, + /// then returns the rounded monoisotopic mass as an integer. + /// This method is used to populate sequence attributes such as mass during XML parsing. /// private static int ComputeSequenceMass(string sequence) { @@ -298,8 +304,25 @@ private static int ComputeSequenceMass(string sequence) return (int)Math.Round(new PeptideWithSetModifications(sequence, new Dictionary()).MonoisotopicMass); } /// - /// Finish parsing at the end of an element + /// Handles the end of an XML element during protein database parsing, updating the internal state or finalizing objects as needed. + /// Depending on the element name, this method processes and stores feature, subfeature, database reference, gene, and organism information, + /// or, if the end of an <entry> element is reached, constructs and returns a fully populated object. + /// For <feature> and <subfeature> elements, it attaches modifications or proteolytic products. + /// For <dbReference>, it records database cross-references. + /// For <gene> and <organism>, it updates parsing state flags. + /// For <entry>, it aggregates all parsed data, resolves modifications, and returns a new instance, + /// clearing the internal state for the next entry. /// + /// The positioned at the end of the current XML element. + /// A collection of modification types to exclude from the protein. + /// A dictionary to collect modifications that could not be resolved. + /// Indicates whether the protein is a contaminant. + /// The file path or identifier of the protein database source. + /// A string used to identify decoy proteins (default: "DECOY"). 
+ /// + /// A constructed object if the end of an <entry> element is reached and all required data is present; + /// otherwise, null. + /// public Protein ParseEndElement(XmlReader xml, IEnumerable modTypesToExclude, Dictionary unknownModifications, bool isContaminant, string proteinDbLocation, string decoyIdentifier = "DECOY") { @@ -330,7 +353,26 @@ public Protein ParseEndElement(XmlReader xml, IEnumerable modTypesToExcl } return protein; } - + /// + /// Handles the end of an XML element during RNA database parsing, updating the internal state or finalizing objects as needed. + /// Depending on the element name, this method processes and stores feature, subfeature, and database reference information, + /// or, if the end of an <entry> element is reached, constructs and returns a fully populated object. + /// For <feature> and <subfeature> elements, it attaches modifications or truncation products. + /// For <dbReference>, it records database cross-references. + /// For <gene> and <organism>, it updates parsing state flags. + /// For <entry>, it aggregates all parsed data, resolves modifications, and returns a new instance, + /// clearing the internal state for the next entry. + /// + /// The positioned at the end of the current XML element. + /// A collection of modification types to exclude from the RNA. + /// A dictionary to collect modifications that could not be resolved. + /// Indicates whether the RNA is a contaminant. + /// The file path or identifier of the RNA database source. + /// A string used to identify decoy RNAs (default: "DECOY"). + /// + /// A constructed object if the end of an <entry> element is reached and all required data is present; + /// otherwise, null. 
+ /// internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable modTypesToExclude, Dictionary unknownModifications, bool isContaminant, string rnaDbLocation, string decoyIdentifier = "DECOY") @@ -364,8 +406,30 @@ internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable modTypesToExc } /// - /// Finish parsing an entry + /// Finalizes the parsing of a protein XML entry and constructs a object from the accumulated data. + /// This method is called when the end of an <entry> element is reached during XML parsing. + /// It performs several key tasks: + /// + /// Sanitizes the parsed sequence (e.g., replacing invalid amino acids with 'X'). + /// Prunes any sequence variants whose coordinates exceed the sequence length. + /// Resolves and attaches all annotated modifications, excluding those of specified types or unknowns. + /// Determines if the protein is a decoy based on the accession and decoy identifier. + /// Aggregates all parsed data (gene names, proteolysis products, sequence variations, disulfide bonds, splice sites, database references, and sequence attributes) into a new instance. + /// Clears the internal state of the to prepare for parsing the next entry. + /// + /// If either the accession or sequence is missing, returns null. /// + /// The positioned at the end of the <entry> element. + /// Indicates whether the protein is a contaminant. + /// The file path or identifier of the protein database source. + /// A collection of modification types to exclude from the protein. + /// A dictionary to collect modifications that could not be resolved. + /// A string used to identify decoy proteins (default: "DECOY"). + /// + /// A constructed object containing all parsed and resolved information, + /// or null if the entry is incomplete. 
+ /// + public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string proteinDbLocation, IEnumerable modTypesToExclude, Dictionary unknownModifications, string decoyIdentifier = "DECOY") { Protein result = null; @@ -377,6 +441,8 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X'); ParseAnnotatedMods(OneBasedModifications, modTypesToExclude, unknownModifications, AnnotatedMods); + //prune any sequence variants whose coordinates exceed the known sequence length + PruneOutOfRangeSequenceVariants(); if (Accession.StartsWith(decoyIdentifier)) { isDecoy = true; @@ -388,7 +454,30 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr Clear(); return result; } - + /// + /// Finalizes the parsing of an RNA XML entry and constructs an object from the accumulated data. + /// This method is called when the end of an <entry> element is reached during XML parsing for RNA records. + /// It performs several key tasks: + /// + /// Sanitizes the parsed sequence (e.g., replacing invalid characters with 'X'). + /// Prunes any sequence variants whose coordinates exceed the sequence length. + /// Resolves and attaches all annotated modifications, excluding those of specified types or unknowns. + /// Determines if the RNA is a decoy based on the accession and decoy identifier. + /// Aggregates all parsed data (gene names, proteolysis products, sequence variations, and other metadata) into a new instance. + /// Clears the internal state of the to prepare for parsing the next entry. + /// + /// If either the accession or sequence is missing, returns null. + /// + /// The positioned at the end of the <entry> element. + /// Indicates whether the RNA is a contaminant. + /// The file path or identifier of the RNA database source. + /// A collection of modification types to exclude from the RNA. 
+ /// A dictionary to collect modifications that could not be resolved. + /// A string used to identify decoy RNAs (default: "DECOY"). + /// + /// A constructed object containing all parsed and resolved information, + /// or null if the entry is incomplete. + /// internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string rnaDbLocation, IEnumerable modTypesToExclude, Dictionary unknownModifications, string decoyIdentifier = "DECOY") { @@ -399,6 +488,8 @@ internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string r // sanitize the sequence to replace unexpected characters with X (unknown amino acid) // sometimes strange characters get added by RNA sequencing software, etc. Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X'); + //prune any sequence variants whose coordinates exceed the known sequence length + PruneOutOfRangeSequenceVariants(); if (Accession.StartsWith(decoyIdentifier)) { isDecoy = true; @@ -425,8 +516,19 @@ public void ParseSubFeatureEndElement(XmlReader xml, IEnumerable modType } /// - /// Finish parsing a feature element + /// Processes the end of a <feature> element during XML parsing and updates the internal state with the parsed feature information. + /// Depending on the feature type, this method: + /// + /// Adds modification annotations for "modified residue" and "lipid moiety-binding region" features. + /// Creates and adds objects for proteolytic features such as "peptide", "propeptide", "chain", and "signal peptide". + /// Handles "sequence variant" features by creating objects, including variant-specific modifications, and ensures they apply to the correct sequence or isoform. + /// Creates and adds or objects for their respective feature types, using available position information. + /// + /// After processing, resets feature-related state variables to prepare for the next feature. /// + /// The positioned at the end of the <feature> element. 
+ /// A collection of modification types to exclude from the protein. + /// A dictionary to collect modifications that could not be resolved. public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesToExclude, Dictionary unknownModifications) { if (FeatureType == "modified residue") @@ -463,24 +565,56 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesTo } else { - type += ("null-null"); + type += "null-null"; } } ProteolysisProducts.Add(new TruncationProduct(OneBasedBeginPosition, OneBasedEndPosition, type)); } - else if (FeatureType == "sequence variant" && VariationValue != null && VariationValue != "") // Only keep if there is variant sequence information and position information + else if (FeatureType == "sequence variant" && VariationValue != null && VariationValue != "") { - ParseAnnotatedMods(OneBasedVariantModifications, modTypesToExclude, unknownModifications, AnnotatedVariantMods); - if (OneBasedBeginPosition != null && OneBasedEndPosition != null) + bool appliesToThisSequence = true; + if (!string.IsNullOrEmpty(LocationSequenceId)) { - SequenceVariations.Add(new SequenceVariation((int)OneBasedBeginPosition, (int)OneBasedEndPosition, OriginalValue, VariationValue, FeatureDescription, OneBasedVariantModifications)); + string acc = Accession ?? ""; + appliesToThisSequence = + LocationSequenceId.Equals(acc, StringComparison.OrdinalIgnoreCase) + || (!string.IsNullOrEmpty(acc) && LocationSequenceId.Equals($"{acc}-1", StringComparison.OrdinalIgnoreCase)); } - else if (OneBasedFeaturePosition >= 1) + + if (appliesToThisSequence) { - SequenceVariations.Add(new SequenceVariation(OneBasedFeaturePosition, OriginalValue, VariationValue, FeatureDescription, OneBasedVariantModifications)); + ParseAnnotatedMods(OneBasedVariantModifications, modTypesToExclude, unknownModifications, AnnotatedVariantMods); + + // NOTE: We can NOT validate coordinate vs sequence length here because sequence is usually parsed later. 
+ // Validation is deferred to PruneOutOfRangeSequenceVariants() during ParseEntryEndElement. + + if (OneBasedBeginPosition != null && OneBasedEndPosition != null) + { + SequenceVariations.Add( + new SequenceVariation( + (int)OneBasedBeginPosition, + (int)OneBasedEndPosition, + OriginalValue, + VariationValue, + FeatureDescription, + variantCallFormatDataString: null, + oneBasedModifications: OneBasedVariantModifications)); + } + else if (OneBasedFeaturePosition >= 1) + { + SequenceVariations.Add( + new SequenceVariation( + OneBasedFeaturePosition, + OriginalValue, + VariationValue, + FeatureDescription, + variantCallFormatDataString: null, + oneBasedModifications: OneBasedVariantModifications)); + } + + AnnotatedVariantMods = new List<(int, string)>(); + OneBasedVariantModifications = new Dictionary>(); } - AnnotatedVariantMods = new List<(int, string)>(); - OneBasedVariantModifications = new Dictionary>(); } else if (FeatureType == "disulfide bond") { @@ -509,64 +643,7 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesTo OneBasedFeaturePosition = -1; OriginalValue = ""; VariationValue = ""; - } - - private static void ParseAnnotatedMods(Dictionary> destination, IEnumerable modTypesToExclude, - Dictionary unknownModifications, List<(int, string)> annotatedMods) - { - foreach (var annotatedMod in annotatedMods) - { - string annotatedId = annotatedMod.Item2; - int annotatedModLocation = annotatedMod.Item1; - - if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod) - || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod)) - { - // if the list of known mods contains this IdWithMotif - if (!modTypesToExclude.Contains(foundMod.ModificationType)) - { - if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation)) - { - listOfModsAtThisLocation.Add(foundMod); - } - else - { - destination.Add(annotatedModLocation, new List { foundMod }); - } - } - // else - the mod ID was found 
but the motif didn't fit the annotated location - } - - // no known mod - try looking it up in the dictionary of mods without motif appended - else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods) - || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods)) - { - foreach (Modification mod in mods) - { - if (!modTypesToExclude.Contains(mod.ModificationType)) - { - if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation)) - { - listOfModsAtThisLocation.Add(mod); - } - else - { - destination.Add(annotatedModLocation, new List { mod }); - } - break; - } - } - } - else - { - // could not find the annotated mod's ID in our list of known mods - it's an unknown mod - // I don't think this really does anything... - if (!unknownModifications.ContainsKey(annotatedId)) - { - unknownModifications.Add(annotatedId, new Modification(annotatedId)); - } - } - } + LocationSequenceId = null; } /// @@ -622,6 +699,86 @@ private void Clear() GeneNames = new List>(); ReadingGene = false; ReadingOrganism = false; + LocationSequenceId = null; + AnnotatedVariantMods = new List<(int, string)>(); + OneBasedVariantModifications = new Dictionary>(); + } + /// + /// Resolves and attaches annotated modifications to the specified destination dictionary based on parsed feature or variant annotations. + /// For each annotated modification, attempts to look up the modification by its identifier (with motif) in both protein and RNA modification dictionaries. + /// If found and not excluded by , the modification is added to the destination at the specified position. + /// If not found by identifier, attempts to resolve the modification by possible matches (without motif) and adds the first non-excluded match. + /// If no match is found, records the modification as unknown in to avoid repeated warnings. + /// This method is used to populate the protein or variant modification dictionaries during XML parsing. 
+ /// + /// Dictionary mapping one-based positions to lists of modifications to be populated. + /// A collection of modification types to exclude from assignment. + /// A dictionary to collect modifications that could not be resolved by identifier or type. + /// List of (position, modification identifier) tuples parsed from XML features or subfeatures. + private static void ParseAnnotatedMods( + Dictionary> destination, + IEnumerable modTypesToExclude, + Dictionary unknownModifications, + List<(int position, string originalModificationID)> annotatedMods) + { + foreach (var (annotatedModLocation, annotatedId) in annotatedMods) + { + if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod) + || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod)) + { + if (!modTypesToExclude.Contains(foundMod.ModificationType)) + { + if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation)) + { + listOfModsAtThisLocation.Add(foundMod); + } + else + { + destination.Add(annotatedModLocation, new List { foundMod }); + } + } + } + else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods) + || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods)) + { + foreach (Modification mod in mods) + { + if (!modTypesToExclude.Contains(mod.ModificationType)) + { + if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation)) + { + listOfModsAtThisLocation.Add(mod); + } + else + { + destination.Add(annotatedModLocation, new List { mod }); + } + break; + } + } + } + else + { + if (!unknownModifications.ContainsKey(annotatedId)) + { + unknownModifications.Add(annotatedId, new Modification(annotatedId)); + } + } + } + } + private void PruneOutOfRangeSequenceVariants() + { + if (string.IsNullOrEmpty(Sequence) || SequenceVariations.Count == 0) + return; + + int len = Sequence.Length; + int removed = SequenceVariations.RemoveAll(v => + v.OneBasedBeginPosition > len || 
v.OneBasedEndPosition > len); + + if (removed > 0) + { + Trace.TraceWarning($"Pruned {removed} out-of-range sequence variant(s) for accession {Accession} (protein length {len})."); + } } } } \ No newline at end of file From 5853a1b4f2cd3a156c5ef73931c39cd84466f783 Mon Sep 17 00:00:00 2001 From: trishorts Date: Thu, 23 Oct 2025 10:54:36 -0500 Subject: [PATCH 14/38] lets see --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 7 +++---- mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs | 4 ++-- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index fc30b4082..63619b1c8 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -14,14 +14,13 @@ public class SequenceVariation /// /// /// - public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string? variantCallFormatDataString = null, Dictionary>? oneBasedModifications = null) + public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) { OneBasedBeginPosition = oneBasedBeginPosition; OneBasedEndPosition = oneBasedEndPosition; OriginalSequence = originalSequence ?? ""; VariantSequence = variantSequence ?? ""; Description = new VariantCallFormat(description); - VariantCallFormatData = variantCallFormatDataString is null ? null : new VariantCallFormat(variantCallFormatDataString); OneBasedModifications = oneBasedModifications ?? new Dictionary>(); } @@ -34,8 +33,8 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str /// /// /// - public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, string? variantCallFormatDataString = null, Dictionary>? 
oneBasedModifications = null) - : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, variantCallFormatDataString = null, oneBasedModifications) + public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) + : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications) { } /// diff --git a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs index 922ebb44e..dd6315e93 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinXmlEntry.cs @@ -597,7 +597,7 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesTo OriginalValue, VariationValue, FeatureDescription, - variantCallFormatDataString: null, + //variantCallFormatDataString: null, oneBasedModifications: OneBasedVariantModifications)); } else if (OneBasedFeaturePosition >= 1) @@ -608,7 +608,7 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesTo OriginalValue, VariationValue, FeatureDescription, - variantCallFormatDataString: null, + //variantCallFormatDataString: null, oneBasedModifications: OneBasedVariantModifications)); } From 71d24a858cc349873f261b94a9a96f653555cc19 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 14:29:07 -0500 Subject: [PATCH 15/38] expand test AppliedVariants --- .../Test/DatabaseTests/TestVariantProtein.cs | 192 +++++++++++++++--- 1 file changed, 162 insertions(+), 30 deletions(-) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 8ced7b208..52731ca0a 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ 
b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -1,20 +1,21 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.IO; -using System.Linq; +using Chemistry; +using MassSpectrometry; using NUnit.Framework; +using NUnit.Framework.Legacy; +using Omics; using Omics.BioPolymer; -using Assert = NUnit.Framework.Legacy.ClassicAssert; using Omics.Modifications; using Proteomics; using Proteomics.ProteolyticDigestion; +using System; +using System.Collections; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using Transcriptomics; using UsefulProteomicsDatabases; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Stopwatch = System.Diagnostics.Stopwatch; -using Omics; -using Transcriptomics; -using MassSpectrometry; -using Chemistry; namespace Test.DatabaseTests { @@ -284,56 +285,187 @@ public static void HomozygousVariantsAtVariedDepths(string filename, int minVari var variantProteins = proteins[0].GetVariantBioPolymers(); List peptides = proteins.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList(); } - [Test] public static void AppliedVariants() { - ModificationMotif.TryGetMotif("P", out ModificationMotif motifP); - Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary>(), null, null, null, null, null); + // This test verifies that applying sequence variations (SAV, MNV, insertion, deletion) + // produces the correct variant protein sequences, maps variant coordinates correctly, + // preserves per-variant metadata (AppliedSequenceVariations), and remains stable across: + // - repeated in-memory application, + // - round-tripping through XML (write → read). + // + // Additionally, it verifies that a modification attached to a variant (protein5) + // is persisted and localized at the expected one-based position after application + // and after XML reload. 
+ // Define a simple P-specific modification used later to validate that modifications + // attached to a variant are preserved and localized correctly after applying variants + // and XML round-tripping. + ModificationMotif.TryGetMotif("P", out ModificationMotif motifP); + Modification mp = new Modification( + _originalId: "mod", + _accession: null, + _modificationType: "type", + _featureType: null, + _target: motifP, + _locationRestriction: "Anywhere.", + _chemicalFormula: null, + _monoisotopicMass: 42.01, + _databaseReference: new Dictionary>(), + _taxonomicRange: null, + _keywords: null, + _neutralLosses: null, + _diagnosticIons: null, + _fileOrigin: null); + + // Prepare five proteins that each have one sequence variation: + // 1) protein1: Single Amino-acid Variant (SAV) P→V at position 4 (4..4) + // 2) protein2: Multi-Nucleotide Variant (MNV) PT→KT spanning positions 4..5 + // 3) protein3: Insertion-like replacement: P→PPP at position 4 (longer variant) + // 4) protein4: Deletion-like replacement: PPP→P spanning 4..6 (shorter variant) + // 5) protein5: Same as (3) but with a modification attached at one-based index 5 + // to verify mod persistence/localization through variant application and XML. 
List proteinsWithSeqVars = new List { - new Protein("MPEPTIDE", "protein1", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein2", sequenceVariations: new List { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein3", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein5", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary> {{ 5, new[] { mp }.ToList() } }) }), - }; + new Protein("MPEPTIDE", "protein1", + sequenceVariations: new List + { + // SAV: P(4) -> V(4) + new SequenceVariation(4, 4, "P", "V", + @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) + }), + new Protein("MPEPTIDE", "protein2", + sequenceVariations: new List + { + // MNV: PT(4..5) -> KT(4..5) + new SequenceVariation(4, 5, "PT", "KT", + @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) + }), + new Protein("MPEPTIDE", "protein3", + sequenceVariations: new List + { + // Insertion-like: P(4) -> PPP(4..6) (length +2) + new SequenceVariation(4, 4, "P", "PPP", + @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) + }), + new Protein("MPEPPPTIDE", "protein4", + sequenceVariations: new List + { + // Deletion-like: PPP(4..6) -> P(4) (length -2) + new SequenceVariation(4, 6, "PPP", "P", + 
@"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) + }), + new Protein("MPEPTIDE", "protein5", + sequenceVariations: new List + { + // Insertion-like with a downstream mod to verify mod localization at 5 + new SequenceVariation(4, 4, "P", "PPP", + @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", + new Dictionary> { { 5, new[] { mp }.ToList() } }) + }), + }; + + // Apply variants in memory twice; the output should be stable and identical. var proteinsWithAppliedVariants = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); // should be stable + + // Round-trip through XML: write the variant-bearing proteins (targets only) and read back. + // This ensures that variant application and metadata survive I/O and result identically. string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml"); ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteinsWithSeqVars, xml); var proteinsWithAppliedVariants3 = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un); + // Compare across all three sources: + // - [0]: in-memory 1 + // - [1]: in-memory 2 (should match [0]) + // - [2]: XML reload (should match [0] and [1]) var listArray = new[] { proteinsWithAppliedVariants, proteinsWithAppliedVariants2, proteinsWithAppliedVariants3 }; + + // Assert we always get exactly five variant proteins in the same order + // (SAV, MNV, insertion, deletion, insertion+mod). 
+ Assert.AreEqual(5, proteinsWithAppliedVariants.Count, "Expected 5 applied variants (in-memory #1)."); + Assert.AreEqual(5, proteinsWithAppliedVariants2.Count, "Expected 5 applied variants (in-memory #2)."); + Assert.AreEqual(5, proteinsWithAppliedVariants3.Count, "Expected 5 applied variants (XML reload)."); + + // The expected sequences for each of the five applied variants, in order: + // 0: "MPEVTIDE" (SAV at 4: P->V) + // 1: "MPEKTIDE" (MNV at 4..5: PT->KT) + // 2: "MPEPPPTIDE" (Insertion-like at 4: P->PPP; length +2) + // 3: "MPEPTIDE" (Deletion-like at 4..6: PPP->P; length -2 from "MPEPPPTIDE") + // 4: "MPEPPPTIDE" (Insertion-like with mod at 5) for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++) { // sequences - Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence); - Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence); - Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence); - Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence); - Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence); - Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key); - + Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence, $"[{dbIdx}] SAV sequence mismatch"); + Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence, $"[{dbIdx}] MNV sequence mismatch"); + Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence, $"[{dbIdx}] insertion sequence mismatch"); + Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence, $"[{dbIdx}] deletion sequence mismatch"); + Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion+mod sequence mismatch"); + + // Confirm the modification attached to protein5 survives application and XML round-trip + Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] Expected exactly 1 mod on the insertion+mod variant"); + Assert.AreEqual(5, 
listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key, $"[{dbIdx}] Mod should localize to one-based position 5"); + // Sanity: ensure the residue under the mod is indeed P + Assert.AreEqual('P', listArray[dbIdx][4].BaseSequence[5 - 1], $"[{dbIdx}] Residue at mod position should be 'P'"); + + // SAV expectations: single-residue change; length unchanged; position 4 is 'V' + Assert.AreEqual(8, listArray[dbIdx][0].BaseSequence.Length, $"[{dbIdx}] SAV length should be unchanged"); + Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] SAV begin mismatch"); + Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] SAV end mismatch"); + Assert.AreEqual("P", listArray[dbIdx][0].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] SAV original should be 'P'"); + Assert.AreEqual("V", listArray[dbIdx][0].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] SAV variant should be 'V'"); + Assert.AreEqual('V', listArray[dbIdx][0].BaseSequence[4 - 1], $"[{dbIdx}] SAV residue at 4 should be 'V'"); + + // MNV expectations: multi-residue change; length unchanged; positions 4..5 become "KT" + Assert.AreEqual(8, listArray[dbIdx][1].BaseSequence.Length, $"[{dbIdx}] MNV length should be unchanged"); + Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] MNV begin mismatch"); + Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] MNV end mismatch"); + Assert.AreEqual("PT", listArray[dbIdx][1].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] MNV original should be 'PT'"); + Assert.AreEqual("KT", listArray[dbIdx][1].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] MNV variant should be 'KT'"); + Assert.AreEqual("KT", listArray[dbIdx][1].BaseSequence.Substring(4 - 1, 2), $"[{dbIdx}] MNV residues 4..5 should 
be 'KT'"); + + // insertion expectations: length grows by +2; positions 4..6 are "PPP" + Assert.AreEqual(10, listArray[dbIdx][2].BaseSequence.Length, $"[{dbIdx}] insertion length should be 8 + 2 = 10"); + Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] insertion begin mismatch"); + Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] insertion end mismatch"); + Assert.AreEqual("P", listArray[dbIdx][2].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] insertion original should be 'P'"); + Assert.AreEqual("PPP", listArray[dbIdx][2].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] insertion variant should be 'PPP'"); + Assert.AreEqual("PPP", listArray[dbIdx][2].BaseSequence.Substring(4 - 1, 3), $"[{dbIdx}] insertion residues 4..6 should be 'PPP'"); + + // deletion expectations: length shrinks by -2 relative to the starting "MPEPPPTIDE" (10 → 8) + // and positions collapse so that sequence returns to "MPEPTIDE". + Assert.AreEqual(8, listArray[dbIdx][3].BaseSequence.Length, $"[{dbIdx}] deletion length should be 10 - 2 = 8"); + Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] deletion begin mismatch"); + Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] deletion end mismatch (post-collapse)"); + Assert.AreEqual("PPP", listArray[dbIdx][3].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] deletion original should be 'PPP'"); + Assert.AreEqual("P", listArray[dbIdx][3].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] deletion variant should be 'P'"); + + // For completeness, also assert the summarized begin/end expectations that the original test verified. 
// SAV Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition); Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition); - // MNV Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition); Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition); - // insertion Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition); Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition); - // deletion Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition); Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition); } - } + // Ensure exact stability across the three sources: + // - All sequences in in-memory #1 equal in-memory #2 and XML reload. + CollectionAssert.AreEqual( + proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(), + proteinsWithAppliedVariants2.Select(p => p.BaseSequence).ToList(), + "In-memory application should be stable across repeated calls"); + CollectionAssert.AreEqual( + proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(), + proteinsWithAppliedVariants3.Select(p => p.BaseSequence).ToList(), + "XML round-trip should preserve variant-applied sequences in the same order"); + } [Test] public static void AppliedVariants_AsIBioPolymer() { From c4bf45465e1836aa18f0691e7518f40ca0cd54dd Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 14:32:27 -0500 Subject: [PATCH 16/38] expand test AppliedVariants_AsIBioPolymer --- .../Test/DatabaseTests/TestVariantProtein.cs | 189 +++++++++++++++--- 1 file changed, 160 insertions(+), 29 deletions(-) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 52731ca0a..594bc0368 100644 --- 
a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -469,23 +469,74 @@ public static void AppliedVariants() [Test] public static void AppliedVariants_AsIBioPolymer() { + // PURPOSE + // This test mirrors "AppliedVariants" but exercises the IBioPolymer interface path. + // It validates, in detail: + // 1) Correct application of four variant types (SAV, MNV, insertion, deletion) to produce expected sequences. + // 2) Correct coordinates (begin/end), original/variant strings in AppliedSequenceVariations after application. + // 3) Stability of results across: + // - repeated in-memory variant application, + // - XML round-trip (write → read). + // 4) Modification propagation and localization for a variant carrying a downstream mod. + // 5) Distinguishing two variants with identical sequences by their modification state (index 2 vs 4). + // + // EXPECTATIONS SUMMARY + // - Variant sequences (in order): "MPEVTIDE", "MPEKTIDE", "MPEPPPTIDE", "MPEPTIDE", "MPEPPPTIDE". + // - AppliedSequenceVariations: + // SAV (idx 0): begin=4, end=4, original="P", variant="V", len=8 + // MNV (idx 1): begin=4, end=5, original="PT", variant="KT", len=8 + // Insertion (idx 2): begin=4, end=6, original="P", variant="PPP", len=10 + // Deletion (idx 3): begin=4, end=4, original="PPP", variant="P", len=8 + // Insert+Mod(idx 4): same sequence as insertion, plus exactly one mod at one-based index 5 targeting a 'P'. 
+ // + // NOTE: Lists [0], [1], [2] below represent: + // [0] in-memory application (first call) + // [1] in-memory application (second call, should be identical/stable) + // [2] XML round-trip results, must match [0] and [1] + + // Arrange: create a P-specific modification used to test propagation and localization ModificationMotif.TryGetMotif("P", out ModificationMotif motifP); - Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary>(), null, null, null, null, null); + Modification mp = new Modification( + _originalId: "mod", + _accession: null, + _modificationType: "type", + _featureType: null, + _target: motifP, + _locationRestriction: "Anywhere.", + _chemicalFormula: null, + _monoisotopicMass: 42.01, + _databaseReference: new Dictionary>(), + _taxonomicRange: null, + _keywords: null, + _neutralLosses: null, + _diagnosticIons: null, + _fileOrigin: null); + // Arrange: build five proteins (as IBioPolymer) with one sequence variation each + // 1) SAV P(4)->V + // 2) MNV PT(4..5)->KT + // 3) Insertion-like P(4)->PPP(4..6) + // 4) Deletion-like PPP(4..6)->P(4) on a longer starting sequence + // 5) Insertion-like with a downstream mod at one-based index 5 List proteinsWithSeqVars = new List { - new Protein("MPEPTIDE", "protein1", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein2", sequenceVariations: new List { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein3", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List { new SequenceVariation(4, 6, "PPP", "P", 
@"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein5", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary> {{ 5, new[] { mp }.ToList() } }) }), + new Protein("MPEPTIDE", "protein1", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), + new Protein("MPEPTIDE", "protein2", sequenceVariations: new List { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), + new Protein("MPEPTIDE", "protein3", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), + new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), + new Protein("MPEPTIDE", "protein5", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary> { { 5, new[] { mp }.ToList() } }) }), }; + + // Act: apply variants in-memory twice (should be identical/stable) var proteinsWithAppliedVariants = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); - var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); // should be stable + var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); + + // Act: write to XML and load back; results should match in-memory outputs string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml"); 
ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteinsWithSeqVars, xml); var proteinsWithAppliedVariants3 = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un); + // Group lists for uniform validation loops var listArray = new List[] { proteinsWithAppliedVariants, @@ -493,34 +544,114 @@ public static void AppliedVariants_AsIBioPolymer() proteinsWithAppliedVariants3.Cast().ToList() }; - for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++) - { - // sequences - Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence); - Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence); - Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence); - Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence); - Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence); - Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key); + // Assert: each expansion produces exactly 5 variant biopolymers, in the same, predictable order + Assert.AreEqual(5, proteinsWithAppliedVariants.Count, "Expected 5 variants (in-memory 1)"); + Assert.AreEqual(5, proteinsWithAppliedVariants2.Count, "Expected 5 variants (in-memory 2)"); + Assert.AreEqual(5, proteinsWithAppliedVariants3.Count, "Expected 5 variants (XML reload)"); - // SAV - Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition); + // Assert: stability across the three sources (same sequences, same order) + CollectionAssert.AreEqual( + proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(), + proteinsWithAppliedVariants2.Select(p => p.BaseSequence).ToList(), + "In-memory application should be stable across repeated calls"); + CollectionAssert.AreEqual( + proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(), + proteinsWithAppliedVariants3.Select(p => p.BaseSequence).ToList(), + 
"XML round-trip should preserve variant-applied sequences in the same order"); - // MNV - Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition); + // Assert: for all variants we expect exactly one applied sequence variation + foreach (var variants in listArray) + { + Assert.AreEqual(1, variants[0].AppliedSequenceVariations.Count, "SAV must have exactly one applied variant"); + Assert.AreEqual(1, variants[1].AppliedSequenceVariations.Count, "MNV must have exactly one applied variant"); + Assert.AreEqual(1, variants[2].AppliedSequenceVariations.Count, "Insertion must have exactly one applied variant"); + Assert.AreEqual(1, variants[3].AppliedSequenceVariations.Count, "Deletion must have exactly one applied variant"); + Assert.AreEqual(1, variants[4].AppliedSequenceVariations.Count, "Insertion+Mod must have exactly one applied variant"); + } - // insertion - Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition); + // Per-list validation of sequence, coordinates, and (where appropriate) residue checks and mod checks + for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++) + { + // Assert: expected sequences in fixed order + Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence, $"[{dbIdx}] SAV sequence mismatch"); + Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence, $"[{dbIdx}] MNV sequence mismatch"); + Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence, $"[{dbIdx}] insertion sequence mismatch"); + Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence, $"[{dbIdx}] deletion sequence mismatch"); + Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion+mod sequence mismatch"); - // deletion - Assert.AreEqual(4, 
listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition); + // Assert: lengths (sanity for ins/del) + Assert.AreEqual(8, listArray[dbIdx][0].BaseSequence.Length, $"[{dbIdx}] SAV length should be unchanged"); + Assert.AreEqual(8, listArray[dbIdx][1].BaseSequence.Length, $"[{dbIdx}] MNV length should be unchanged"); + Assert.AreEqual(10, listArray[dbIdx][2].BaseSequence.Length, $"[{dbIdx}] insertion length should be 8 + 2 = 10"); + Assert.AreEqual(8, listArray[dbIdx][3].BaseSequence.Length, $"[{dbIdx}] deletion length should be 10 - 2 = 8"); + Assert.AreEqual(10, listArray[dbIdx][4].BaseSequence.Length, $"[{dbIdx}] insertion+mod length should be 10"); + + // SAV assertions: P(4)->V + var sav = listArray[dbIdx][0].AppliedSequenceVariations.Single(); + Assert.AreEqual(4, sav.OneBasedBeginPosition, $"[{dbIdx}] SAV begin"); + Assert.AreEqual(4, sav.OneBasedEndPosition, $"[{dbIdx}] SAV end"); + Assert.AreEqual("P", sav.OriginalSequence, $"[{dbIdx}] SAV original"); + Assert.AreEqual("V", sav.VariantSequence, $"[{dbIdx}] SAV variant"); + Assert.AreEqual('V', listArray[dbIdx][0].BaseSequence[3], $"[{dbIdx}] SAV residue at 4 must be 'V'"); + + // MNV assertions: PT(4..5)->KT + var mnv = listArray[dbIdx][1].AppliedSequenceVariations.Single(); + Assert.AreEqual(4, mnv.OneBasedBeginPosition, $"[{dbIdx}] MNV begin"); + Assert.AreEqual(5, mnv.OneBasedEndPosition, $"[{dbIdx}] MNV end"); + Assert.AreEqual("PT", mnv.OriginalSequence, $"[{dbIdx}] MNV original"); + Assert.AreEqual("KT", mnv.VariantSequence, $"[{dbIdx}] MNV variant"); + Assert.AreEqual("KT", listArray[dbIdx][1].BaseSequence.Substring(3, 2), $"[{dbIdx}] MNV residues 4..5 must be 'KT'"); + + // Insertion-like assertions: P(4)->PPP(4..6) + var ins = listArray[dbIdx][2].AppliedSequenceVariations.Single(); + Assert.AreEqual(4, ins.OneBasedBeginPosition, $"[{dbIdx}] insertion begin"); + Assert.AreEqual(6, 
ins.OneBasedEndPosition, $"[{dbIdx}] insertion end"); + Assert.AreEqual("P", ins.OriginalSequence, $"[{dbIdx}] insertion original"); + Assert.AreEqual("PPP", ins.VariantSequence, $"[{dbIdx}] insertion variant"); + Assert.AreEqual("PPP", listArray[dbIdx][2].BaseSequence.Substring(3, 3), $"[{dbIdx}] insertion residues 4..6 must be 'PPP'"); + + // Deletion-like assertions: PPP(4..6)->P(4) (collapses back to MPEPTIDE) + var del = listArray[dbIdx][3].AppliedSequenceVariations.Single(); + Assert.AreEqual(4, del.OneBasedBeginPosition, $"[{dbIdx}] deletion begin"); + Assert.AreEqual(4, del.OneBasedEndPosition, $"[{dbIdx}] deletion end (post-collapse)"); + Assert.AreEqual("PPP", del.OriginalSequence, $"[{dbIdx}] deletion original"); + Assert.AreEqual("P", del.VariantSequence, $"[{dbIdx}] deletion variant"); + + // Insertion + Modification assertions: identical sequence to insertion, plus one mod at pos 5 + var insMod = listArray[dbIdx][4].AppliedSequenceVariations.Single(); + Assert.AreEqual(4, insMod.OneBasedBeginPosition, $"[{dbIdx}] insertion+mod begin"); + Assert.AreEqual(6, insMod.OneBasedEndPosition, $"[{dbIdx}] insertion+mod end"); + Assert.AreEqual("P", insMod.OriginalSequence, $"[{dbIdx}] insertion+mod original"); + Assert.AreEqual("PPP", insMod.VariantSequence, $"[{dbIdx}] insertion+mod variant"); + + // Mods: Only the "insertion+mod" variant (index 4) carries a mod; all others should have none + // Confirm the variant 4 has exactly one mod at one-based position 5, and the residue at 5 is 'P' (motif). 
+ if (dbIdx == 0 || dbIdx == 1 || dbIdx == 2) // only assert detailed mod state once per path for clarity + { + // Index 2 (plain insertion) must have no mods + Assert.AreEqual(0, listArray[dbIdx][2].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion should have no mods"); + + // Index 4 (insertion+mod) must have exactly one mod at site 5, targeting a P residue + Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion+mod should have exactly one site with mods"); + Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key, $"[{dbIdx}] insertion+mod site should be one-based index 5"); + Assert.AreEqual('P', listArray[dbIdx][4].BaseSequence[5 - 1], $"[{dbIdx}] insertion+mod residue at site 5 must be 'P' (motif)"); + + // All other variants should have zero possible localized mods + Assert.AreEqual(0, listArray[dbIdx][0].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] SAV should have no mods"); + Assert.AreEqual(0, listArray[dbIdx][1].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] MNV should have no mods"); + Assert.AreEqual(0, listArray[dbIdx][3].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] deletion should have no mods"); + } } - } + // Additional cross-list checks: + // - The two "MPEPPPTIDE" variants (indices 2 and 4) should be sequence-identical, but differ by modification presence. 
+ for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++) + { + Assert.AreEqual(listArray[dbIdx][2].BaseSequence, listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion and insertion+mod sequences must match"); + Assert.AreEqual(0, listArray[dbIdx][2].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion should remain unmodified"); + Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion+mod should remain modified"); + } + } [Test] public static void CrashOnCreateVariantFromRNA() { From 39f106b50a0c7eff2d436a739029c149fc948121 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 14:38:28 -0500 Subject: [PATCH 17/38] expand test stop gained --- .../Test/DatabaseTests/TestVariantProtein.cs | 122 ++++++++++++++---- 1 file changed, 100 insertions(+), 22 deletions(-) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 594bc0368..057c56280 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -664,32 +664,110 @@ public static void CrashOnCreateVariantFromRNA() proteins[0].CreateVariant(proteins[0].BaseSequence, rna, [], [], new Dictionary>(), ""); }); } - [Test] public static void StopGained() { - var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), true, - DecoyType.None, null, false, null, out var unknownModifications); - Assert.AreEqual(2, proteins.Count); - Assert.AreEqual(1, proteins[0].SequenceVariations.Count()); // some redundant - Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes - Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Count()); // some redundant - Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes - Assert.AreEqual(1, 
proteins[1].AppliedSequenceVariations.Count()); // some redundant - Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes - Assert.AreEqual(191, proteins[0].Length); - Assert.AreEqual('Q', proteins[0][161 - 1]); - Assert.AreEqual(161 - 1, proteins[1].Length); - Assert.AreNotEqual(proteins[0].Length, proteins[1].Length); - - proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), true, - DecoyType.None, null, false, null, out unknownModifications, minAlleleDepth: 400); - Assert.AreEqual(1, proteins.Count); - Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Count()); // some redundant - Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes - Assert.AreEqual(161 - 1, proteins[0].Length); + // PURPOSE + // This test validates correct handling of a stop-gained sequence variation (creation of a premature stop codon). + // Verifies two scenarios: + // 1) Default variant depth filtering: both reference (no variant applied) and alternate (stop-gained applied) proteins are emitted. + // 2) High min-allele-depth threshold: only the stop-gained (applied) protein is retained. + // + // EXPECTATIONS SUMMARY (based on the StopGained.xml test data): + // - Two proteins are initially produced (reference + variant-applied): + // [0] Reference protein: + // * 1 sequence variation present in metadata, but 0 applied (reference form). + // * BaseSequence length = 191. + // * Residue at one-based position 161 equals 'Q' (so zero-based index 160 is 'Q'). + // [1] Variant-applied protein (stop-gained): + // * Exactly 1 applied variation. + // * BaseSequence length truncated to 161 - 1 = 160 (stop codon at 161 shortens the sequence). + // * The sequence is exactly the prefix of the reference up to length 160. 
+ // * No '*' appears in the resulting BaseSequence (the stop codon is not a literal character in the sequence). + // * The applied variant's VariantSequence ends with '*'. + // * The applied variant one-based begin (and end) position is 161. + // + // - With minAlleleDepth: 400 + // * Only the variant-applied protein is returned. + // * It retains the same applied-variation and truncated length expectations. + + // Load the proteins with default filtering + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), + true, // generateTargets + DecoyType.None, // decoyType + null, // allKnownModifications + false, // isContaminant + null, // modTypesToExclude + out var unknownModifications); + + // Sanity: Decoys are not requested + Assert.IsTrue(proteins.All(p => !p.IsDecoy), "No decoys expected when using DecoyType.None"); + + // Expect exactly two proteins: reference (no applied variant) and stop-gained (applied) + Assert.AreEqual(2, proteins.Count, "Expected reference and stop-gained variant proteins"); + // Reference protein metadata: one possible sequence variation in the record + Assert.AreEqual(1, proteins[0].SequenceVariations.Count(), "Reference metadata should contain one sequence variation"); + Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Reference metadata should contain exactly one unique sequence variation"); + // Reference should have zero applied variations (reference form retained) + Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Count(), "Reference protein should not have an applied sequence variation"); + Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Reference applied variations should be zero"); + + // Variant-applied protein: applied variation present and unique + Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Count(), "Variant 
protein should have exactly one applied variation"); + Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Variant protein applied variations should be unique"); + + // Reference length and residue checks around the stop site + Assert.AreEqual(191, proteins[0].Length, "Reference protein length should match source data"); + Assert.AreEqual('Q', proteins[0][161 - 1], "Reference residue at 161 should be 'Q' prior to stop-gain"); + + // Variant length must be truncated by the stop at position 161 → length becomes 160 + Assert.AreEqual(161 - 1, proteins[1].Length, "Variant protein length should be truncated to 160 due to stop at 161"); + + // The variant BaseSequence must be exactly the prefix of the reference up to the stop position - 1 + string reference = proteins[0].BaseSequence; + string variant = proteins[1].BaseSequence; + Assert.IsTrue(reference.StartsWith(variant), "Variant sequence must be a prefix of the reference sequence"); + Assert.AreEqual(reference.Substring(0, 161 - 1), variant, "Variant sequence must equal reference[0..159]"); + + // Ensure we did not write literal '*' into the protein sequence; stop codon is represented by truncation instead + Assert.AreEqual(-1, variant.IndexOf('*'), "Variant BaseSequence should not contain a literal '*'"); + + // Verify applied-variant details for the stop-gained protein + var applied = proteins[1].AppliedSequenceVariations.Single(); + Assert.IsTrue(applied.VariantSequence.EndsWith("*"), "Stop-gained variant must end with '*'"); + Assert.AreEqual(161, applied.OneBasedBeginPosition, "Stop-gained begins at residue 161"); + Assert.AreEqual(161, applied.OneBasedEndPosition, "Stop-gained ends at residue 161 (single residue change)"); + + // The two forms must differ in sequence length (as a sanity check) + Assert.AreNotEqual(proteins[0].Length, proteins[1].Length, "Reference and variant proteins should differ in length"); + + // Now require a higher 
min-allele-depth; expect only the variant-applied protein retained + proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), + true, // generateTargets + DecoyType.None, // decoyType + null, // allKnownModifications + false, // isContaminant + null, // modTypesToExclude + out unknownModifications, + minAlleleDepth: 400); + + // Only the stop-gained, variant-applied form is retained under a strict depth threshold + Assert.AreEqual(1, proteins.Count, "High min-allele-depth should retain only the variant-applied protein"); + Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Count(), "Variant-applied protein should still have one applied variation"); + Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Variant-applied unique variation should be retained"); + Assert.AreEqual(161 - 1, proteins[0].Length, "Variant-applied protein length should remain truncated to 160"); + + // Confirm stability: the single protein from the depth-filtered load matches the previously observed variant sequence + Assert.AreEqual(variant, proteins[0].BaseSequence, "Depth-filtered variant sequence should match previously observed variant"); + + // Re-check applied-variant semantics after filtering for completeness + var appliedAfterFilter = proteins[0].AppliedSequenceVariations.Single(); + Assert.IsTrue(appliedAfterFilter.VariantSequence.EndsWith("*"), "Stop-gained variant must end with '*' (after filtering)"); + Assert.AreEqual(161, appliedAfterFilter.OneBasedBeginPosition, "Stop-gained begins at residue 161 (after filtering)"); + Assert.AreEqual(161, appliedAfterFilter.OneBasedEndPosition, "Stop-gained ends at residue 161 (after filtering)"); } - [Test] public static void StopGainedDecoysAndDigestion() { From 6df25edf07328a70add79d6ff7c20e6b7ec3ceed Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 14:40:12 -0500 Subject: [PATCH 18/38] 
three new tests --- .../Test/DatabaseTests/TestVariantProtein.cs | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 057c56280..af8437af3 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -769,6 +769,91 @@ public static void StopGained() Assert.AreEqual(161, appliedAfterFilter.OneBasedEndPosition, "Stop-gained ends at residue 161 (after filtering)"); } [Test] + public static void StopGained_TruncationIsPrefixAndNoOutOfBoundsAnnotations() + { + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), + true, DecoyType.None, null, false, null, out var _); + + Assert.AreEqual(2, proteins.Count); + var reference = proteins[0]; + var truncated = proteins[1]; + + // The truncated sequence must be a prefix of the reference, + // i.e., identical up to the truncation point. + Assert.That(reference.BaseSequence.StartsWith(truncated.BaseSequence)); + + // Any possible localized modifications must not point past the truncation boundary. + Assert.That(truncated.OneBasedPossibleLocalizedModifications + .All(kv => kv.Key >= 1 && kv.Key <= truncated.Length)); + + // Any proteolysis products (if present) must not reference indices outside the sequence. + Assert.That(truncated.TruncationProducts.All(tp => + (!tp.OneBasedBeginPosition.HasValue || (tp.OneBasedBeginPosition.Value >= 1 && tp.OneBasedBeginPosition.Value <= truncated.Length)) && + (!tp.OneBasedEndPosition.HasValue || (tp.OneBasedEndPosition.Value >= 1 && tp.OneBasedEndPosition.Value <= truncated.Length)))); + + // The applied stop-gained variation often encodes a '*' in the variant sequence. + // If present, that indicates stop; the actual sequence is cut at the stop. 
+ if (truncated.AppliedSequenceVariations.Any()) + { + Assert.That(truncated.AppliedSequenceVariations.Single().VariantSequence.EndsWith("*") || + !truncated.AppliedSequenceVariations.Single().VariantSequence.Contains("*")); + } + } + [Test] + public static void StopGained_RoundTripSerializationPreservesTruncation() + { + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), + true, DecoyType.None, null, false, null, out var _); + + Assert.AreEqual(2, proteins.Count); + var tempPath = Path.Combine(TestContext.CurrentContext.TestDirectory, $"StopGained_roundtrip_{Guid.NewGuid()}.xml"); + + try + { + // Persist both proteins (reference + variant) and reload. + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins, tempPath); + var roundtrip = ProteinDbLoader.LoadProteinXML(tempPath, true, DecoyType.None, null, false, null, out var __); + + // Round-trip preserves count and the truncation boundary for the variant-applied protein. + Assert.AreEqual(2, roundtrip.Count); + Assert.AreEqual(proteins[0].Length, roundtrip[0].Length); + Assert.AreEqual(proteins[1].Length, roundtrip[1].Length); + Assert.AreEqual(proteins[1].AppliedSequenceVariations.Count(), roundtrip[1].AppliedSequenceVariations.Count()); + } + finally + { + if (File.Exists(tempPath)) + { + File.SetAttributes(tempPath, FileAttributes.Normal); + File.Delete(tempPath); + } + } + } + [Test] + public static void StopGained_NoPeptidesCrossTruncationSite() + { + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), + true, DecoyType.None, null, false, null, out var _); + + Assert.AreEqual(2, proteins.Count); + var reference = proteins[0]; + var truncated = proteins[1]; + + // Peptides from the truncated protein must not reference indices past the truncation boundary. 
+ var dp = new DigestionParams(); + var variantPeps = truncated.Digest(dp, null, null).ToList(); + Assert.That(variantPeps.All(p => p.OneBasedEndResidueInProtein <= truncated.Length)); + + // Any peptide in the reference that extends past the truncation boundary cannot exist in the variant. + var refPeps = reference.Digest(dp, null, null).ToList(); + var refCrossing = refPeps.Where(p => p.OneBasedEndResidueInProtein > truncated.Length).ToList(); + var variantPepWindows = new HashSet<(int start, int end)>(variantPeps.Select(p => (p.OneBasedStartResidueInProtein, p.OneBasedEndResidueInProtein))); + Assert.That(refCrossing.All(p => !variantPepWindows.Contains((p.OneBasedStartResidueInProtein, p.OneBasedEndResidueInProtein)))); + } + [Test] public static void StopGainedDecoysAndDigestion() { // test decoys and digestion From 1630c76c02cde969975be9bcb19736be4b1d1d25 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 14:45:35 -0500 Subject: [PATCH 19/38] expand test stop gained decoys and digestion --- .../Test/DatabaseTests/TestVariantProtein.cs | 88 +++++++++++++++++-- 1 file changed, 79 insertions(+), 9 deletions(-) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index af8437af3..3e66e8611 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -856,16 +856,86 @@ public static void StopGained_NoPeptidesCrossTruncationSite() [Test] public static void StopGainedDecoysAndDigestion() { - // test decoys and digestion - var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGain.xml"), true, - DecoyType.Reverse, null, false, null, out var unknownModifications, minAlleleDepth: 400); - Assert.AreEqual(2, proteins.Count); - var targetPeps = proteins[0].Digest(new DigestionParams(), null, null).ToList(); - var decoyPeps = proteins[1].Digest(new DigestionParams(), null, 
null).ToList(); - //Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length)); - //Assert.AreEqual(targetPeps.Count, decoyPeps.Count); - } + // PURPOSE + // This test ensures that: + // 1) Reverse decoys are generated for a stop-gained (truncated) target protein. + // 2) The target and its decoy digest into peptides without referencing residues outside their sequence bounds. + // 3) Basic invariants hold for decoy generation (count/order/length/decoy flag). + // + // CONTEXT + // - Database: "StopGain.xml" contains a target protein with a stop-gained variant that truncates the sequence. + // - Decoys: Generated using DecoyType.Reverse (sequence reversal-based decoys). + // - minAlleleDepth: 400 ensures the stop-gained variant is applied (truncated target). + // + // EXPECTATIONS + // - Exactly 2 proteins are returned: [0] target (non-decoy) + [1] decoy (IsDecoy = true). + // - Target and decoy lengths are equal (reverse decoys preserve length). + // - Both target and decoy produce peptides via digestion. + // - No peptide references indices outside its parent protein's 1..Length range. + // - Accessions reflect decoy generation (decoy starts with the default "DECOY_"). + // - If variant(s) are present, their counts match between target and decoy. + + // Arrange: Load a variant-applied protein set and reverse decoy pair from StopGain.xml. + // Using a strict minAlleleDepth applies the stop-gained variant to the target. + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGain.xml"), + true, // generateTargets + DecoyType.Reverse, // generate reverse-sequence decoys + null, // allKnownModifications + false, // isContaminant + null, // modTypesToExclude + out var unknownModifications, + minAlleleDepth: 400); // force applying the stop-gained variant + + // Assert: We expect exactly two proteins: target then its decoy. 
+ Assert.AreEqual(2, proteins.Count, "Expected 1 target + 1 decoy"); + Assert.IsFalse(proteins[0].IsDecoy, "First protein should be the target (non-decoy)"); + Assert.IsTrue(proteins[1].IsDecoy, "Second protein should be the decoy"); + Assert.That(proteins[1].Accession.StartsWith("DECOY_"), "Decoy accession should start with the default decoy identifier"); + + // Assert: Reverse decoys should preserve sequence length. + Assert.AreEqual(proteins[0].Length, proteins[1].Length, "Target and decoy should have identical lengths"); + // In general, target and decoy sequences should not be byte-identical. + Assert.AreNotEqual(proteins[0].BaseSequence, proteins[1].BaseSequence, "Decoy sequence should differ from target sequence"); + + // If the stop-gained variant is applied to the target, the decoy should carry a corresponding variant count. + // We do not enforce exact mapping positions here, only that counts match if any are present. + if (proteins[0].AppliedSequenceVariations.Any() || proteins[1].AppliedSequenceVariations.Any()) + { + Assert.AreEqual( + proteins[0].AppliedSequenceVariations.Count(), + proteins[1].AppliedSequenceVariations.Count(), + "Target and decoy should carry the same number of applied sequence variations"); + } + // Act: Digest both target and decoy using default digestion parameters (typically trypsin-like rules). + var dp = new DigestionParams(); + var targetPeps = proteins[0].Digest(dp, null, null).ToList(); + var decoyPeps = proteins[1].Digest(dp, null, null).ToList(); + + // Assert: Both should yield peptides. + Assert.That(targetPeps.Count > 0, "Target digestion should produce peptides"); + Assert.That(decoyPeps.Count > 0, "Decoy digestion should produce peptides"); + + // Assert: No peptide references residues outside the corresponding protein bounds. 
+ Assert.That(targetPeps.All(p => + p.OneBasedStartResidueInProtein >= 1 && + p.OneBasedEndResidueInProtein <= proteins[0].Length), + "All target peptides must fall within target bounds"); + + Assert.That(decoyPeps.All(p => + p.OneBasedStartResidueInProtein >= 1 && + p.OneBasedEndResidueInProtein <= proteins[1].Length), + "All decoy peptides must fall within decoy bounds"); + + // Note: + // We intentionally do NOT assert the number of peptides or the sum of peptide lengths to be equal between + // target and decoy. Even with reverse decoys, tryptic cleavage context differs and may alter cleavage patterns. + // The commented lines below are often too strict and can fail legitimately: + // + // Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length)); + // Assert.AreEqual(targetPeps.Count, decoyPeps.Count); + } [Test] public static void MultipleAlternateAlleles() { From 89d76d34a4a78a94da71aa5a9b2f6243a719c868 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 14:49:35 -0500 Subject: [PATCH 20/38] expand test SequenceVariationIsValid --- .../Test/DatabaseTests/TestVariantProtein.cs | 165 ++++++++++++++++-- 1 file changed, 152 insertions(+), 13 deletions(-) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 3e66e8611..c615342c5 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -936,6 +936,7 @@ public static void StopGainedDecoysAndDigestion() // Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length)); // Assert.AreEqual(targetPeps.Count, decoyPeps.Count); } + [Test] public static void MultipleAlternateAlleles() { @@ -1056,21 +1057,159 @@ public void IndelDecoyVariants() [Test] public void SequenceVariationIsValidTest() { - SequenceVariation sv1 = new SequenceVariation(10, 10, "A", "T", "info", null); - SequenceVariation sv2 = new SequenceVariation(5, 5, "G", "C", "info", null); - 
SequenceVariation sv3 = new SequenceVariation(8, 8, "T", "A", "info", null); + // PURPOSE + // Validate the minimal, position-only "validity" rules implemented by SequenceVariation.AreValid(): + // AreValid() == (OneBasedBeginPosition > 0) && (OneBasedEndPosition >= OneBasedBeginPosition) + // + // We cover: + // 1) Explicit begin/end ctor with typical point mutations → valid. + // 2) Explicit begin/end ctor with invalid positions → invalid. + // 3) Explicit begin/end ctor representing insertion/deletion edge-cases → valid as long as positions are valid. + // 4) One-position convenience ctor behavior for different originalSequence values (null, "", length > 0). + // - This ctor derives end as: end = (original == null) ? begin : begin + original.Length - 1. + // - Therefore, empty originalSequence "" makes end = begin - 1 → invalid by design. + // 5) Content fields (Original/Variant) and OneBasedModifications do NOT affect AreValid(), only positions do. + // 6) Optional sanity checks on derived fields (SimpleString and computed end position). + + // ----------------------------- + // 1) Valid: explicit begin/end point mutations (begin == end, begin > 0) + // ----------------------------- + SequenceVariation sv1 = new SequenceVariation( + oneBasedBeginPosition: 10, oneBasedEndPosition: 10, + originalSequence: "A", variantSequence: "T", + description: "info", oneBasedModifications: null); + SequenceVariation sv2 = new SequenceVariation( + oneBasedBeginPosition: 5, oneBasedEndPosition: 5, + originalSequence: "G", variantSequence: "C", + description: "info", oneBasedModifications: null); + SequenceVariation sv3 = new SequenceVariation( + oneBasedBeginPosition: 8, oneBasedEndPosition: 8, + originalSequence: "T", variantSequence: "A", + description: "info", oneBasedModifications: null); + + // A protein can carry multiple variations; positions alone determine validity. 
List svList = new List { sv1, sv2, sv3 }; - Protein variantProtein = new Protein("ACDEFGHIKLMNPQRSTVWY", "protein1", sequenceVariations: svList); - Assert.IsTrue(variantProtein.SequenceVariations.All(v => v.AreValid())); - SequenceVariation svInvalidOneBasedBeginLessThanOne = new SequenceVariation(0, 10, "A", "T", "info", null); - SequenceVariation svInvalidOneBasedEndLessThanOneBasedBegin = new SequenceVariation(5, 4, "G", "C", "info", null); - SequenceVariation svValidOriginalSequenceIsEmpty = new SequenceVariation(8, 8, "", "A", "info", null); - SequenceVariation svValidVariantSequenceLenthIsZero = new SequenceVariation(10, 10, "A", "", "info", null); - Assert.IsFalse(svInvalidOneBasedBeginLessThanOne.AreValid()); - Assert.IsFalse(svInvalidOneBasedEndLessThanOneBasedBegin.AreValid()); - Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid()); //This is valid because it is an insertion - Assert.IsTrue(svValidVariantSequenceLenthIsZero.AreValid()); // This is valid because it is a deletion + + // Expectation: all three above are valid (begin > 0 and end == begin). 
+ Assert.IsTrue(variantProtein.SequenceVariations.All(v => v.AreValid()), "All explicit point mutations with valid positions should be valid"); + + // ----------------------------- + // 2) Invalid: begin < 1 and end < begin + // ----------------------------- + SequenceVariation svInvalidOneBasedBeginLessThanOne = new SequenceVariation( + oneBasedBeginPosition: 0, oneBasedEndPosition: 10, + originalSequence: "A", variantSequence: "T", + description: "info", oneBasedModifications: null); + Assert.IsFalse(svInvalidOneBasedBeginLessThanOne.AreValid(), "Begin position must be >= 1"); + + SequenceVariation svInvalidOneBasedEndLessThanOneBasedBegin = new SequenceVariation( + oneBasedBeginPosition: 5, oneBasedEndPosition: 4, + originalSequence: "G", variantSequence: "C", + description: "info", oneBasedModifications: null); + Assert.IsFalse(svInvalidOneBasedEndLessThanOneBasedBegin.AreValid(), "End position cannot be less than begin position"); + + // ----------------------------- + // 3) Explicit begin/end edge-cases: insertion and deletion modeled by content only + // NOTE: AreValid ignores Original/Variant content; only positions matter. + // ----------------------------- + // Insertion-like (explicit): original is empty (""), variant has content. + // Valid because we explicitly supply begin == end (positions are valid). + SequenceVariation svValidOriginalSequenceIsEmpty = new SequenceVariation( + oneBasedBeginPosition: 8, oneBasedEndPosition: 8, + originalSequence: "", variantSequence: "A", + description: "info", oneBasedModifications: null); + Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid(), "Explicit insertion with valid positions should be considered valid"); + + // Deletion-like (explicit): variant is empty (""), original has content, positions still valid. 
+ SequenceVariation svValidVariantSequenceLengthIsZero = new SequenceVariation( + oneBasedBeginPosition: 10, oneBasedEndPosition: 10, + originalSequence: "A", variantSequence: "", + description: "info", oneBasedModifications: null); + Assert.IsTrue(svValidVariantSequenceLengthIsZero.AreValid(), "Explicit deletion with valid positions should be considered valid"); + + // ----------------------------- + // 4) One-position convenience ctor behavior for originalSequence values + // ctor: SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, ...) + // end is computed as: + // - if original == null → end = begin + // - else → end = begin + original.Length - 1 + // ----------------------------- + + // 4a) originalSequence has length 1 → end == begin (valid) + var svPosCtorLength1 = new SequenceVariation( + oneBasedPosition: 15, + originalSequence: "K", + variantSequence: "R", + description: "pos-ctor length 1"); + Assert.AreEqual(15, svPosCtorLength1.OneBasedBeginPosition); + Assert.AreEqual(15, svPosCtorLength1.OneBasedEndPosition, "With original length 1, end should equal begin"); + Assert.IsTrue(svPosCtorLength1.AreValid(), "Single-site variation via position ctor should be valid"); + + // 4b) originalSequence has length > 1 → end == begin + len - 1 (valid) + var svPosCtorLength3 = new SequenceVariation( + oneBasedPosition: 20, + originalSequence: "PEP", // len = 3 + variantSequence: "AAA", // content irrelevant to AreValid + description: "pos-ctor length 3"); + Assert.AreEqual(20, svPosCtorLength3.OneBasedBeginPosition); + Assert.AreEqual(22, svPosCtorLength3.OneBasedEndPosition, "End should be begin + original.Length - 1"); + Assert.IsTrue(svPosCtorLength3.AreValid(), "Multi-length replacement with valid positions should be valid"); + + // 4c) originalSequence == null → end = begin (valid) + var svPosCtorNullOriginal = new SequenceVariation( + oneBasedPosition: 30, + originalSequence: null, // special case 
handled in ctor: end = begin + variantSequence: "A", + description: "pos-ctor null original"); + Assert.AreEqual(30, svPosCtorNullOriginal.OneBasedBeginPosition); + Assert.AreEqual(30, svPosCtorNullOriginal.OneBasedEndPosition, "Null original should set end = begin"); + Assert.IsTrue(svPosCtorNullOriginal.AreValid(), "Null original via position ctor is treated as length 1 (valid)"); + + // 4d) originalSequence == "" (empty) → end = begin - 1 (invalid by design) + // This models an insertion if you rely solely on the position ctor, but produces end < begin → invalid. + // For insertions, prefer the explicit begin/end ctor with valid positions (see 3). + var svPosCtorEmptyOriginal = new SequenceVariation( + oneBasedPosition: 40, + originalSequence: "", // empty → end = 39 + variantSequence: "A", + description: "pos-ctor empty original"); + Assert.AreEqual(40, svPosCtorEmptyOriginal.OneBasedBeginPosition); + Assert.AreEqual(39, svPosCtorEmptyOriginal.OneBasedEndPosition, "Empty original sets end = begin - 1"); + Assert.IsFalse(svPosCtorEmptyOriginal.AreValid(), "Position ctor with empty original is invalid (end < begin)"); + + // ----------------------------- + // 5) Validity is position-only; content and mods do not change AreValid() + // - VariantSequence null is normalized to "" in the ctor. + // - OneBasedModifications is stored but ignored by AreValid(). 
+ // ----------------------------- + var mods = new Dictionary> + { + { 1, new List() } // empty mod list at a site; still ignored by AreValid + }; + var svContentIrrelevant = new SequenceVariation( + oneBasedBeginPosition: 3, oneBasedEndPosition: 3, + originalSequence: "M", variantSequence: null, // becomes "" + description: "mods/variant null test", oneBasedModifications: mods); + Assert.IsTrue(svContentIrrelevant.AreValid(), "Null variant and/or mods should not affect positional validity"); + Assert.AreEqual("", svContentIrrelevant.VariantSequence, "Null VariantSequence is normalized to empty string"); + + // ----------------------------- + // 6) Sanity: SimpleString format and positive bounds at the edge of sequence + // ----------------------------- + var svSimple = new SequenceVariation( + oneBasedBeginPosition: 1, oneBasedEndPosition: 1, + originalSequence: "A", variantSequence: "V", + description: "simple"); + // SimpleString = Original + Begin + Variant (no delimiter) + Assert.AreEqual("A1V", svSimple.SimpleString(), "SimpleString should concatenate original + begin + variant"); + + // Additional guard: begin == 1 is valid if end >= begin + var svAtStart = new SequenceVariation( + oneBasedBeginPosition: 1, oneBasedEndPosition: 2, + originalSequence: "MA", variantSequence: "MV", + description: "range at start"); + Assert.IsTrue(svAtStart.AreValid(), "Ranges that start at 1 are valid provided end >= begin"); } [Test] public void VariantModificationTest() From a89da7d07bec25016c343929cf43d13c0b38243a Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 14:53:55 -0500 Subject: [PATCH 21/38] expand TestThatProteinVariantsAreGeneratedDuringRead --- .../Test/DatabaseTests/TestVariantProtein.cs | 144 ++++++++++++++++-- 1 file changed, 131 insertions(+), 13 deletions(-) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index c615342c5..8dda74f3f 100644 --- 
a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -1279,23 +1279,141 @@ public void WriteProteinXmlWithVariantsDiscoveredAsModifications2() Assert.That(newProtein.SequenceVariations.Count, Is.EqualTo(totalSequenceVariations + 1)); //This number increases by 1 because we added a sequence variation that was discovered as a modification Assert.AreEqual(0,newProtein.OneBasedPossibleLocalizedModifications.Count); //This number should be 0 because we converted the modification to a sequence variation } - + /// <summary> + /// PURPOSE + /// Ensures that variant proteins are automatically generated during XML read, both for targets and reverse decoys. + /// The database "humanGAPDH.xml" encodes two single-nucleotide substitutions on the target protein P04406: + /// - A22G + /// - K251N + /// + /// EXPECTATIONS + /// - Loader emits all combinatorial target variants derived from those two changes: + /// [0] Reference (no variants applied) → Accession "P04406" + /// [1] Single variant A22G → Accession "P04406_A22G" + /// [2] Single variant K251N → Accession "P04406_K251N" + /// [3] Double variant K251N + A22G (combined) → Accession "P04406_K251N_A22G" + /// - With DecoyType.Reverse, a matching set of 4 reverse decoys is produced with mirrored coordinates: + /// [4] Decoy of reference → "DECOY_P04406" + /// [5] Decoy with mirrored A22G (mapped to site 315) → "DECOY_P04406_A315G" + /// [6] Decoy with mirrored K251N (mapped to site 86) → "DECOY_P04406_K86N" + /// [7] Decoy with both mirrored variants → "DECOY_P04406_K86N_A315G" + /// + /// WHY THIS MATTERS + /// - Validates that the reader expands sequence variation definitions into concrete variant proteins. + /// - Verifies decoy generation mirrors variant coordinates appropriately and preserves ordering. + /// - Guards against regressions in accession naming, variant application counts, and decoy parity.
+ /// + /// PARAMETERS PASSED TO LOADER + /// - generateTargets: true → emit target proteins + /// - decoyType: Reverse → also emit reverse decoys + /// - allKnownModifications: UniProtPtms → resolve any UniProt-annotated PTMs so no "unknown" mods remain + /// - isContaminant: false + /// - modTypesToExclude: null + /// - out unknownModifications → capture any unrecognized mods (should be empty when UniProtPtms is provided) + /// - minAlleleDepth: 1 → do not filter out these low-depth test variants + /// - maxHeterozygousVariants: 99 → allow generating all combinations from the two sites (up to 2^2) + /// </summary> [Test] public static void TestThatProteinVariantsAreGeneratedDuringRead() { + // Arrange: load a target with two site-specific variants and request reverse decoys as well. + // IMPORTANT: Provide UniProtPtms so annotations in humanGAPDH.xml resolve and do not end up in unknownModifications. string databaseName = "humanGAPDH.xml"; - var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, - DecoyType.Reverse, null, false, null, out var unknownModifications, 1, 99); - Assert.AreEqual(8, proteins.Count); // 4 target + 4 decoy - Assert.AreEqual(2, proteins[0].SequenceVariations.Count()); // these sequence variations were in the original - Assert.That("P04406", Is.EqualTo(proteins[0].Accession)); - Assert.That("P04406_A22G", Is.EqualTo(proteins[1].Accession)); - Assert.That("P04406_K251N", Is.EqualTo(proteins[2].Accession)); - Assert.That("P04406_K251N_A22G", Is.EqualTo(proteins[3].Accession)); - Assert.That("DECOY_P04406", Is.EqualTo(proteins[4].Accession)); - Assert.That("DECOY_P04406_A315G", Is.EqualTo(proteins[5].Accession)); - Assert.That("DECOY_P04406_K86N", Is.EqualTo(proteins[6].Accession)); - Assert.That("DECOY_P04406_K86N_A315G", Is.EqualTo(proteins[7].Accession)); + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory,
"DatabaseTests", databaseName), + generateTargets: true, + decoyType: DecoyType.Reverse, + allKnownModifications: UniProtPtms, // CHANGED: was null; supplying known PTMs prevents unknownModifications from being populated + isContaminant: false, + modTypesToExclude: null, + unknownModifications: out var unknownModifications, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + // Basic shape: 4 targets + 4 reverse decoys in a deterministic order. + Assert.AreEqual(8, proteins.Count, "Expected 4 targets and 4 decoys in a fixed order"); + + // Targets/decoys split and flags should be consistent and easy to reason about. + Assert.AreEqual(4, proteins.Count(p => !p.IsDecoy), "First half should be targets"); + Assert.AreEqual(4, proteins.Count(p => p.IsDecoy), "Second half should be decoys"); + for (int i = 0; i < 4; i++) + { + Assert.IsFalse(proteins[i].IsDecoy, $"Index {i} should be a target"); + Assert.IsTrue(proteins[i + 4].IsDecoy, $"Index {i + 4} should be a decoy"); + } + + // The reference target (index 0) carries two possible sequence variations in its metadata. + // This documents the test input and ensures the reader surfaced them. + Assert.AreEqual(2, proteins[0].SequenceVariations.Count(), "Reference should advertise exactly two possible sequence variations"); + + // Accessions must match exact, canonical variant labeling and order for both targets and decoys. 
+ Assert.That("P04406", Is.EqualTo(proteins[0].Accession), "Reference target accession mismatch"); + Assert.That("P04406_A22G", Is.EqualTo(proteins[1].Accession), "Single-variant (A22G) target accession mismatch"); + Assert.That("P04406_K251N", Is.EqualTo(proteins[2].Accession), "Single-variant (K251N) target accession mismatch"); + Assert.That("P04406_K251N_A22G", Is.EqualTo(proteins[3].Accession), "Double-variant target accession mismatch"); + + Assert.That("DECOY_P04406", Is.EqualTo(proteins[4].Accession), "Reference decoy accession mismatch"); + Assert.That("DECOY_P04406_A315G", Is.EqualTo(proteins[5].Accession), "Decoy accession for mirrored A22G mismatch"); + Assert.That("DECOY_P04406_K86N", Is.EqualTo(proteins[6].Accession), "Decoy accession for mirrored K251N mismatch"); + Assert.That("DECOY_P04406_K86N_A315G", Is.EqualTo(proteins[7].Accession), "Decoy accession for double-variant mismatch"); + + // Sanity: accessions are non-empty and unique (avoid accidental duplication/shuffling). + Assert.That(proteins.All(p => !string.IsNullOrWhiteSpace(p.Accession)), "All proteins must have non-empty accessions"); + Assert.AreEqual(proteins.Count, proteins.Select(p => p.Accession).Distinct().Count(), "Accessions must be unique"); + + // Each decoy should be length-equal to its corresponding target, but usually sequence-different (reverse). 
+ for (int i = 0; i < 4; i++) + { + Assert.AreEqual(proteins[i].Length, proteins[i + 4].Length, $"Target/decoy length should match for index {i}"); + Assert.AreNotEqual(proteins[i].BaseSequence, proteins[i + 4].BaseSequence, $"Decoy sequence should differ from its target for index {i}"); + } + + // Applied variant counts (how many variations were actually realized in this protein instance): + // Targets: [0] ref=0, [1] A22G=1, [2] K251N=1, [3] both=2 + // Decoys: [4] ref=0, [5] A315G=1, [6] K86N=1, [7] both=2 + int[] expectedAppliedCounts = { 0, 1, 1, 2, 0, 1, 1, 2 }; + for (int i = 0; i < proteins.Count; i++) + { + Assert.AreEqual(expectedAppliedCounts[i], proteins[i].AppliedSequenceVariations.Count(), + $"Applied variant count mismatch at index {i} ({proteins[i].Accession})"); + } + + // The specific applied-variant labels should match accessions: + // - For targets: "A22G" and/or "K251N" + // - For decoys: mirrored positions → "A315G" and/or "K86N" + static HashSet AppliedLabels(Protein p) => + new HashSet(p.AppliedSequenceVariations.Select(v => v.SimpleString())); + + // Targets (indices 0..3) + Assert.That(AppliedLabels(proteins[0]).SetEquals(Array.Empty()), "Reference target should have no applied variants"); + Assert.That(AppliedLabels(proteins[1]).SetEquals(new[] { "A22G" }), "Single-variant target must be exactly A22G"); + Assert.That(AppliedLabels(proteins[2]).SetEquals(new[] { "K251N" }), "Single-variant target must be exactly K251N"); + Assert.That(AppliedLabels(proteins[3]).SetEquals(new[] { "A22G", "K251N" }), "Double-variant target should have A22G and K251N"); + + // Decoys (indices 4..7) have mirrored coordinates (due to reverse decoying). 
+ Assert.That(AppliedLabels(proteins[4]).SetEquals(Array.Empty()), "Reference decoy should have no applied variants"); + Assert.That(AppliedLabels(proteins[5]).SetEquals(new[] { "A315G" }), "Single-variant decoy must be exactly A315G (mirror of A22G)"); + Assert.That(AppliedLabels(proteins[6]).SetEquals(new[] { "K86N" }), "Single-variant decoy must be exactly K86N (mirror of K251N)"); + Assert.That(AppliedLabels(proteins[7]).SetEquals(new[] { "A315G", "K86N" }), "Double-variant decoy should have A315G and K86N"); + + // Parity check: each target and its decoy should carry the same number of applied variations. + for (int i = 0; i < 4; i++) + { + Assert.AreEqual( + proteins[i].AppliedSequenceVariations.Count(), + proteins[i + 4].AppliedSequenceVariations.Count(), + $"Applied variant count should match between target and decoy at index {i}"); + } + + // With UniProtPtms supplied, no unknown modifications should be reported. + if (unknownModifications != null && unknownModifications.Count > 0) + { + // Extra diagnostics to ease debugging in case of future schema/content changes. 
+ TestContext.WriteLine("Unknown modifications reported by loader:"); + foreach (var um in unknownModifications) + TestContext.WriteLine($" - {um}"); + } + Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from this input"); } [Test] public static void ProteinVariantsReadAsModificationsWrittenAsVariants() From 39ff105d3bb597353066d0c16d2906524f96ffd3 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 14:56:37 -0500 Subject: [PATCH 22/38] two new tests --- .../Test/DatabaseTests/TestVariantProtein.cs | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 8dda74f3f..83c9fef03 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -1523,5 +1523,127 @@ public void Constructor_ParsesDescriptionCorrectly() var adValues = svd.AlleleDepths[adKey]; Assert.AreEqual(new[] { "30", "30" }, adValues); } + [Test] + public void ParseComprehensiveVcfExamples() + { + string current = TestContext.CurrentContext.TestDirectory; + string vcfPath = null; + while (current != null) + { + var candidate = Path.Combine(current, "Test", "DatabaseTests", "vcf_comprehensive_examples.vcf"); + if (File.Exists(candidate)) + { + vcfPath = candidate; + break; + } + current = Directory.GetParent(current)?.FullName; + } + + Assert.That(vcfPath, Is.Not.Null, "Could not locate vcf_comprehensive_examples.vcf"); + + var lines = File.ReadAllLines(vcfPath); + + var dataRows = lines + .Where(l => !string.IsNullOrWhiteSpace(l)) + .Where(l => !l.StartsWith("##")) + .Where(l => !l.StartsWith("#CHROM")) + .ToList(); + + Assert.That(dataRows.Count, Is.EqualTo(8), "Expected 8 example variant rows."); + + for (int rowIndex = 0; rowIndex < dataRows.Count; rowIndex++) + { + string originalLine = dataRows[rowIndex]; + string[] rawFields = originalLine.Split('\t'); + 
Assert.That(rawFields.Length, Is.GreaterThanOrEqualTo(10), $"Row {rowIndex + 1}: insufficient columns."); + + var vcf = new VariantCallFormat(originalLine); + + Assert.That(vcf.Description, Is.EqualTo(originalLine)); + Assert.That(vcf.ReferenceAlleleString, Is.EqualTo(rawFields[3])); + Assert.That(vcf.AlternateAlleleString, Is.EqualTo(rawFields[4])); + Assert.That(vcf.Format, Is.EqualTo(rawFields[8])); + + if (rawFields[7] == ".") + { + Assert.That(vcf.Info.Annotation, Is.EqualTo(rawFields[7])); + } + + var sampleFields = rawFields.Skip(9).ToArray(); + Assert.That(vcf.Genotypes.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.AlleleDepths.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.Homozygous.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.Heterozygous.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.ZygosityBySample.Count, Is.EqualTo(sampleFields.Length)); + + for (int sampleIndex = 0; sampleIndex < sampleFields.Length; sampleIndex++) + { + string sample = sampleFields[sampleIndex]; + string key = sampleIndex.ToString(); + + string[] parts = sample.Split(':'); + Assert.That(parts.Length, Is.EqualTo(vcf.Format.Split(':').Length)); + + string gtPart = parts[0]; + string adPart = parts.Length > 1 ? parts[1] : null; + + // Expected GT tokens + string[] expectedGtTokens = gtPart.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries); + if (gtPart.Contains('.') && expectedGtTokens.Length == 1 && + (gtPart == "./." || gtPart == ".|." || gtPart == ".|1" || gtPart == "0|." || gtPart == "0/.")) + { + expectedGtTokens = new[] { ".", "." }; + } + + Assert.That(vcf.Genotypes.ContainsKey(key)); + var parsedGt = vcf.Genotypes[key]; + Assert.That(parsedGt, Is.EqualTo(expectedGtTokens)); + + // Expected AD tokens + string[] expectedAdTokens = + string.IsNullOrWhiteSpace(adPart) ? Array.Empty() : + adPart == "." ? new[] { "." 
} : + adPart.Split(','); + + Assert.That(vcf.AlleleDepths.ContainsKey(key)); + var parsedAd = vcf.AlleleDepths[key] ?? Array.Empty(); + if (parsedAd.Length != 0 || expectedAdTokens.Length != 1 || expectedAdTokens[0] != ".") + { + Assert.That(parsedAd, Is.EqualTo(expectedAdTokens)); + } + + // Expected zygosity using ONLY non-missing alleles (must mirror implementation) + var calledAlleles = parsedGt.Where(a => a != ".").ToArray(); + bool expectedHom = calledAlleles.Length > 0 && calledAlleles.Distinct().Count() == 1; + bool expectedHet = calledAlleles.Distinct().Count() > 1; + VariantCallFormat.Zygosity expectedZ = + calledAlleles.Length == 0 + ? VariantCallFormat.Zygosity.Unknown + : expectedHet + ? VariantCallFormat.Zygosity.Heterozygous + : VariantCallFormat.Zygosity.Homozygous; + + Assert.That(vcf.Homozygous[key], Is.EqualTo(expectedHom)); + Assert.That(vcf.Heterozygous[key], Is.EqualTo(expectedHet)); + Assert.That(vcf.ZygosityBySample[key], Is.EqualTo(expectedZ)); + } + } + } + [Test] + public void Constructor_InvalidCoordinates_ThrowsArgumentException() + { + // NOTE: despite the test name, no exception is expected here; the test only checks that an end < begin range is flagged by AreValid().
+ // Arrange: end < begin (invalid coordinates) + var sv = new SequenceVariation( + oneBasedBeginPosition: 5, + oneBasedEndPosition: 4, + originalSequence: "A", + variantSequence: "V", + description: "invalid-coords", + oneBasedModifications: null); + + // Assert: SequenceVariation does not throw on construction; it reports invalid via AreValid() + Assert.That(sv.AreValid(), Is.False); + } } } \ No newline at end of file From dc8845da984ab1e713ae7c7d2c16ca49396f1f60 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 14:57:52 -0500 Subject: [PATCH 23/38] one more --- .../Test/DatabaseTests/TestVariantProtein.cs | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 83c9fef03..84133ae66 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -1645,5 +1645,197 @@ public void Constructor_InvalidCoordinates_ThrowsArgumentException() // Assert: SequenceVariation does not throw on construction; it reports invalid via AreValid() Assert.That(sv.AreValid(), Is.False); } + // Helper to create a minimal substitution modification matching the required detection pattern + private static Modification Substitution(string idArrow) + { + // If you want this helper to be convertible by the code under test, + // give it a matching motif for the site where it will be placed. + // For now keep it generic (unused in this test). 
+ return new Modification( + idArrow, // originalId + null, // accession + "1 nucleotide substitution", // modificationType + null, // featureType + null, // target motif + "Anywhere.", // location restriction + null, // chemical formula + 0, // monoisotopic mass + new Dictionary>(), // databaseReference + null, // taxonomicRange + null, // keywords + null, // neutralLosses + null, // diagnosticIons + null); // fileOrigin + } + + // Non-substitution (should be ignored) + private static Modification Other(string id, double mass = 15.9949) + { + // Generic oxidation at P motif (unused by main test path) + ModificationMotif.TryGetMotif("P", out var motifP); + return new Modification( + id, + null, + "oxidation", + null, + motifP, + "Anywhere.", + null, + mass, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + // Malformed substitution (no "->" pattern) must be ignored + private static Modification Malformed() + { + ModificationMotif.TryGetMotif("Q", out var motifQ); + return new Modification( + "E>A", + null, + "1 nucleotide substitution", + null, + motifQ, + "Anywhere.", + null, + 0, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + [Test] + public void ConvertNucleotideSubstitutionModificationsToSequenceVariants_Comprehensive() + { + // 1 M, 2 A, 3 E, 4 W, 5 P, 6 Q, 7 K + var protein = new Protein("MAEWPQK", "TEST_PROT"); + + static Modification MakeSub(string idArrow, char originalResidue) + { + ModificationMotif.TryGetMotif(originalResidue.ToString(), out var motif); + return new Modification( + idArrow, + null, + "1 nucleotide substitution", + null, + motif, + "Anywhere.", + null, + 0, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + static Modification MakeOther(string id) + { + ModificationMotif.TryGetMotif("P", out var motifP); + return new Modification( + id, + null, + "oxidation", + null, + motifP, + "Anywhere.", + null, + 15.9949, + new Dictionary>(), + null, + null, + null, + null, + null); + } + 
+ static Modification MakeMalformed() + { + ModificationMotif.TryGetMotif("Q", out var motifQ); + return new Modification( + "E>A", + null, + "1 nucleotide substitution", + null, + motifQ, + "Anywhere.", + null, + 0, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + void AddMod(Protein p, int pos, Modification m) + { + if (!p.OneBasedPossibleLocalizedModifications.TryGetValue(pos, out var list1)) + { + list1 = new List(); + p.OneBasedPossibleLocalizedModifications[pos] = list1; + } + list1.Add(m); + + if (!p.OriginalNonVariantModifications.TryGetValue(pos, out var list2)) + { + list2 = new List(); + p.OriginalNonVariantModifications[pos] = list2; + } + list2.Add(m); + } + + // Mods to seed + var modEtoA = MakeSub("E->A", 'E'); // pos 3 + var modWtoK = MakeSub("W->K", 'W'); // pos 4 + var modOxidP = MakeOther("Oxid_P"); // pos 5 + var malformed = MakeMalformed(); // pos 6 + + AddMod(protein, 3, modEtoA); + AddMod(protein, 4, modWtoK); + AddMod(protein, 5, modOxidP); + AddMod(protein, 6, malformed); + + // Pre-existing W->K (may be duplicated by converter if description differs) + protein.SequenceVariations.Add(new SequenceVariation(4, 4, "W", "K", "Existing substitution")); + + // Act + protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); + + // Assert unique AA changes, not raw count (converter may add standardized duplicates) + var uniqueChanges = protein.SequenceVariations.Select(v => v.SimpleString()).Distinct().ToList(); + Assert.That(uniqueChanges.Count, Is.EqualTo(2), "Expected exactly two unique substitutions (E3->A and W4->K)."); + + // Ensure E3->A exists + var eToA = protein.SequenceVariations.SingleOrDefault(v => + v.OneBasedBeginPosition == 3 && v.OneBasedEndPosition == 3 && + v.OriginalSequence == "E" && v.VariantSequence == "A"); + Assert.That(eToA, Is.Not.Null, "E3->A variant was not created."); + + // Ensure at least one W4->K exists + var wToKCount = protein.SequenceVariations.Count(v => + 
v.OneBasedBeginPosition == 4 && v.OneBasedEndPosition == 4 && + v.OriginalSequence == "W" && v.VariantSequence == "K"); + Assert.That(wToKCount, Is.GreaterThanOrEqualTo(1), "Expected a W4->K variant."); + + // Converted positions removed from OneBasedPossibleLocalizedModifications + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(3), Is.False); + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(4), Is.False); + + // Unrelated and malformed mods remain + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(5), Is.True); + Assert.That(protein.OneBasedPossibleLocalizedModifications[5].Any(m => m.OriginalId == "Oxid_P"), Is.True); + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(6), Is.True); + Assert.That(protein.OneBasedPossibleLocalizedModifications[6].Any(m => m.OriginalId == "E>A"), Is.True); + } } } \ No newline at end of file From 6a06a9069c11e7163a6cc627439dd33e9f7c957a Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 16:12:38 -0500 Subject: [PATCH 24/38] this fucking test --- .../Test/DatabaseTests/TestVariantProtein.cs | 158 +++++++++++-- .../DatabaseTests/truncationsExpected.tsv | 137 ++++++++++++ mzLib/Test/Test.csproj | 3 + mzLib/Test/TestPeptideWithSetMods.cs | 210 +++++++++++++----- 4 files changed, 440 insertions(+), 68 deletions(-) create mode 100644 mzLib/Test/DatabaseTests/truncationsExpected.tsv diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 84133ae66..edcc6cf5e 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -1415,36 +1415,158 @@ static HashSet AppliedLabels(Protein p) => } Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from this input"); } + [Test] public static void ProteinVariantsReadAsModificationsWrittenAsVariants() { + // PURPOSE + // This test 
verifies the I/O pipeline that converts "nucleotide substitution" modifications + // embedded in an input protein XML into canonical "sequence variant" features when read, + // and persists them back out as proper entries when written. + // + // WHY + // - Some sources (e.g., GPTMD discovery) encode AA substitutions as modifications with + // ModificationType = "1 nucleotide substitution". Internally, we want these represented + // as SequenceVariations, not remaining as generic modifications. + // - On read: these substitution mods must be removed from the protein’s modifications collections + // and turned into SequenceVariations. + // - On write: they must be serialized as sequence variant features with a standardized description + // ("Putative GPTMD Substitution") and NOT re-serialized as modifications. + // + // EXPECTATIONS SUMMARY + // - Input file has 57 lines that contain "1 nucleotide substitution" (source encoding as mods). + // - After LoadProteinXML: + // * We get exactly 9 proteins (DecoyType.None). + // * Total SequenceVariations across all proteins = 194. + // * Total OneBasedPossibleLocalizedModifications count across all proteins = 0 + // (i.e., all substitution mods became sequence variations). + // * No unknown modifications should remain. + // * No applied variants are expected (metadata only; we are not expanding combinatorics here). + // - After WriteXmlDatabase then re-load: + // * Count and totals remain the same (9 proteins; 194 total sequence variations; 0 mods). + // * The output file contains: + // - Exactly 194 lines with feature type="sequence variant". + // - Exactly 194 lines with "Putative GPTMD Substitution". + // - Exactly 0 lines with "1 nucleotide substitution" (proved that mods were not serialized). + // + // NOTE + // - We keep DecoyType.None to avoid expanding the protein list. + // - We use minAlleleDepth = 1 and maxHeterozygousVariants = 0 to avoid variant-proteoform expansion. 
+ + // Arrange: locate the source database that encodes nucleotide substitutions as modifications string databaseName = "nucleotideVariantsAsModifications.xml"; + string inputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName); + Assert.That(File.Exists(inputPath), Is.True, "Input database file must exist for this test"); - Assert.That(File.ReadAllLines(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName)).Count(l => l.Contains("1 nucleotide substitution")), Is.EqualTo(57)); + // Sanity: confirm the source encodes substitutions as modifications in the raw XML + // (57 lines that mention "1 nucleotide substitution" in the file). + var inputLines = File.ReadAllLines(inputPath); + int inputSubstitutionModLines = inputLines.Count(l => l.Contains("1 nucleotide substitution")); + Assert.That(inputSubstitutionModLines, Is.EqualTo(57), "Source XML should contain 57 substitution-mod lines"); - var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, - DecoyType.None, null, false, null, out var unknownModifications, 1, 0); - Assert.AreEqual(9, proteins.Count); // 1 target - Assert.AreEqual(194, proteins.Select(v=>v.SequenceVariations.Count).Sum()); // there are no sequence variations in the original proteins - Assert.AreEqual(0, proteins.Select(m => m.OneBasedPossibleLocalizedModifications.Sum(list=>list.Value.Count)).Sum()); // there are 194 sequence variants as modifications in the original proteins + // Optional sanity: the source should not already be in sequence-variant form + int inputSeqVarFeatureLines = inputLines.Count(l => l.Contains("feature type=\"sequence variant\"")); + Assert.That(inputSeqVarFeatureLines, Is.EqualTo(0), "Source XML should not already encode sequence variants"); + + // Act: read the database. 
Expect conversion to SequenceVariations and removal from modifications + var proteins = ProteinDbLoader.LoadProteinXML( + inputPath, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + unknownModifications: out var unknownModifications, + minAlleleDepth: 1, + maxHeterozygousVariants: 0); + + // Assert: no decoys requested, so all should be targets + Assert.That(proteins.All(p => !p.IsDecoy), "All proteins should be targets when DecoyType.None is used"); + + // Assert: exactly 9 proteins + Assert.AreEqual(9, proteins.Count, "Expected exactly 9 proteins from the input"); + + // Assert: the loader converted all substitution modifications into proper SequenceVariations + // and removed them from OneBasedPossibleLocalizedModifications + int totalSequenceVariations = proteins.Sum(p => p.SequenceVariations.Count); + int totalPossibleLocalizedMods = proteins.Sum(p => p.OneBasedPossibleLocalizedModifications.Values.Sum(v => v.Count)); + Assert.AreEqual(194, totalSequenceVariations, "Total number of sequence variations after load must be 194"); + Assert.AreEqual(0, totalPossibleLocalizedMods, "All substitution modifications should have been converted; none should remain as mods"); + // FIX: Safely log unknown modifications (Dictionary), if any appear unexpectedly. + if (unknownModifications != null && unknownModifications.Count > 0) + { + TestContext.WriteLine("Unknown modifications encountered during read (unexpected):"); + foreach (var kvp in unknownModifications) + { + var mod = kvp.Value; + var id = mod?.OriginalId ?? ""; + var type = mod?.ModificationType ?? 
""; + TestContext.WriteLine($" - key={kvp.Key}, id={id}, type={type}"); + } + } + Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications should remain after conversion"); + + // Assert: No applied variants expected in this test (we are validating representation, not expansion) + int totalAppliedVariants = proteins.Sum(p => p.AppliedSequenceVariations.Count()); + Assert.That(totalAppliedVariants, Is.EqualTo(0), "No applied variants expected; variants are metadata here"); + + // Assert: None of the proteins should carry "1 nucleotide substitution" as OriginalNonVariantModifications anymore + int residualSubstitutionMods = + proteins.Sum(p => p.OriginalNonVariantModifications.Values.Sum(list => list.Count(m => m.ModificationType == "1 nucleotide substitution"))); + Assert.That(residualSubstitutionMods, Is.EqualTo(0), "No '1 nucleotide substitution' mods should remain in OriginalNonVariantModifications"); + + // Persist the converted representation and read it back; this file should contain only sequence-variant features string tempDir = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString()); Directory.CreateDirectory(tempDir); string tempFile = Path.Combine(tempDir, "xmlWithSequenceVariantsAndNoModifications.txt"); - ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins.Where(p => !p.IsDecoy).ToList(), tempFile); - proteins = ProteinDbLoader.LoadProteinXML(tempFile, true, - DecoyType.None, null, false, null, out unknownModifications, 1, 0); - Assert.AreEqual(9, proteins.Count); // 1 target - Assert.AreEqual(194, proteins.Select(v => v.SequenceVariations.Count).Sum()); // there are 194 sequence variations in the revised proteins - Assert.AreEqual(0, proteins.Select(m => m.OneBasedPossibleLocalizedModifications.Sum(list => list.Value.Count)).Sum()); // there are 0 sequence variants as modifications in the original proteins - - Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("feature 
type=\"sequence variant\"")), Is.EqualTo(194)); - Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("Putative GPTMD Substitution")), Is.EqualTo(194)); - Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("1 nucleotide substitution")), Is.EqualTo(0)); - if (Directory.Exists(tempDir)) Directory.Delete(tempDir, true); + try + { + // Write only the targets (all are targets here). Expect the writer to emit sequence variant features. + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins.Where(p => !p.IsDecoy).ToList(), tempFile); + + // Inspect the written file contents directly for ground truth + var writtenLines = File.ReadAllLines(tempFile); + int writtenSeqVarFeatures = writtenLines.Count(l => l.Contains("feature type=\"sequence variant\"")); + int writtenPutativeGptmd = writtenLines.Count(l => l.Contains("Putative GPTMD Substitution")); + int writtenSubstitutionMods = writtenLines.Count(l => l.Contains("1 nucleotide substitution")); + + // Assert: writer produced only sequence variants (not substitution mods) + Assert.That(writtenSeqVarFeatures, Is.EqualTo(194), "All 194 substitutions must be serialized as sequence variant features"); + Assert.That(writtenPutativeGptmd, Is.EqualTo(194), "All 194 variants should have the standardized description label"); + Assert.That(writtenSubstitutionMods, Is.EqualTo(0), "No '1 nucleotide substitution' strings should remain in the written XML"); + + // Re-load the written file to confirm round-trip stability + proteins = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + unknownModifications: out unknownModifications, + minAlleleDepth: 1, + maxHeterozygousVariants: 0); + + // Assert: the round-tripped representation is identical in shape and counts + Assert.AreEqual(9, proteins.Count, "Round-trip must preserve protein count"); + Assert.AreEqual(194, proteins.Sum(v => 
v.SequenceVariations.Count), "Round-trip must preserve total number of sequence variations"); + Assert.AreEqual(0, proteins.Sum(m => m.OneBasedPossibleLocalizedModifications.Values.Sum(list => list.Count)), + "Round-trip must preserve the fact that no substitution mods remain"); + Assert.That(unknownModifications == null || unknownModifications.Count == 0, "Round-trip should not introduce unknown modifications"); + Assert.That(proteins.Sum(p => p.AppliedSequenceVariations.Count()), Is.EqualTo(0), "Round-trip should not apply any variants"); + } + finally + { + // Cleanup test artifacts + if (Directory.Exists(tempDir)) + { + try { File.SetAttributes(tempFile, FileAttributes.Normal); } catch { /* ignore */ } + Directory.Delete(tempDir, true); + } + } } - [Test] public void Constructor_ParsesDescriptionCorrectly() { diff --git a/mzLib/Test/DatabaseTests/truncationsExpected.tsv b/mzLib/Test/DatabaseTests/truncationsExpected.tsv new file mode 100644 index 000000000..8a35f98a3 --- /dev/null +++ b/mzLib/Test/DatabaseTests/truncationsExpected.tsv @@ -0,0 +1,137 @@ +Sequence Type Begin End RetainedMethionine +MALWMRLLPLLALLALWGPDPAAA Target 1 24 TRUE +FVNQHLCGSHLVEALYLVCGERGFFYTPKT Target 25 54 FALSE +EAEDLQVGQVELGGGPGAGSLQPLALEGSLQ Target 57 87 FALSE +GIVEQCCTSICSLYQLENYCN Target 90 110 FALSE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 1 110 TRUE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 2 110 FALSE +LWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 3 110 FALSE +WMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 4 110 FALSE +MRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 5 110 FALSE 
+RLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 6 110 FALSE +LLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 7 110 FALSE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYC Target 2 109 FALSE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY Target 2 108 FALSE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLEN Target 2 107 FALSE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLE Target 2 106 FALSE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQL Target 2 105 FALSE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYC Target 1 109 TRUE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY Target 1 108 TRUE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLEN Target 1 107 TRUE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLE Target 1 106 TRUE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQL Target 1 105 TRUE +ALWMRLLPLLALLALWGPDPAAA Target 2 24 FALSE +LWMRLLPLLALLALWGPDPAAA Target 3 24 FALSE +WMRLLPLLALLALWGPDPAAA Target 4 24 FALSE +MRLLPLLALLALWGPDPAAA Target 5 24 FALSE +RLLPLLALLALWGPDPAAA Target 6 24 FALSE +LLPLLALLALWGPDPAAA Target 7 24 FALSE +ALWMRLLPLLALLALWGPDPAA Target 2 23 FALSE +ALWMRLLPLLALLALWGPDPA Target 2 22 FALSE +ALWMRLLPLLALLALWGPDP Target 2 21 FALSE +ALWMRLLPLLALLALWGPD Target 2 20 FALSE +ALWMRLLPLLALLALWGP Target 2 19 FALSE +MALWMRLLPLLALLALWGPDPAA Target 1 
23 TRUE +MALWMRLLPLLALLALWGPDPA Target 1 22 TRUE +MALWMRLLPLLALLALWGPDP Target 1 21 TRUE +MALWMRLLPLLALLALWGPD Target 1 20 TRUE +MALWMRLLPLLALLALWGP Target 1 19 TRUE +VNQHLCGSHLVEALYLVCGERGFFYTPKT Target 26 54 FALSE +NQHLCGSHLVEALYLVCGERGFFYTPKT Target 27 54 FALSE +QHLCGSHLVEALYLVCGERGFFYTPKT Target 28 54 FALSE +HLCGSHLVEALYLVCGERGFFYTPKT Target 29 54 FALSE +LCGSHLVEALYLVCGERGFFYTPKT Target 30 54 FALSE +FVNQHLCGSHLVEALYLVCGERGFFYTPK Target 25 53 FALSE +FVNQHLCGSHLVEALYLVCGERGFFYTP Target 25 52 FALSE +FVNQHLCGSHLVEALYLVCGERGFFYT Target 25 51 FALSE +FVNQHLCGSHLVEALYLVCGERGFFY Target 25 50 FALSE +FVNQHLCGSHLVEALYLVCGERGFF Target 25 49 FALSE +AEDLQVGQVELGGGPGAGSLQPLALEGSLQ Target 58 87 FALSE +EDLQVGQVELGGGPGAGSLQPLALEGSLQ Target 59 87 FALSE +DLQVGQVELGGGPGAGSLQPLALEGSLQ Target 60 87 FALSE +LQVGQVELGGGPGAGSLQPLALEGSLQ Target 61 87 FALSE +QVGQVELGGGPGAGSLQPLALEGSLQ Target 62 87 FALSE +EAEDLQVGQVELGGGPGAGSLQPLALEGSL Target 57 86 FALSE +EAEDLQVGQVELGGGPGAGSLQPLALEGS Target 57 85 FALSE +EAEDLQVGQVELGGGPGAGSLQPLALEG Target 57 84 FALSE +EAEDLQVGQVELGGGPGAGSLQPLALE Target 57 83 FALSE +EAEDLQVGQVELGGGPGAGSLQPLAL Target 57 82 FALSE +IVEQCCTSICSLYQLENYCN Target 91 110 FALSE +VEQCCTSICSLYQLENYCN Target 92 110 FALSE +EQCCTSICSLYQLENYCN Target 93 110 FALSE +QCCTSICSLYQLENYCN Target 94 110 FALSE +CCTSICSLYQLENYCN Target 95 110 FALSE +GIVEQCCTSICSLYQLENYC Target 90 109 FALSE +GIVEQCCTSICSLYQLENY Target 90 108 FALSE +GIVEQCCTSICSLYQLEN Target 90 107 FALSE +GIVEQCCTSICSLYQLE Target 90 106 FALSE +GIVEQCCTSICSLYQL Target 90 105 FALSE +FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 25 110 FALSE +MNCYNELQYLSCISTCCQEVIGRK Decoy 1 24 TRUE +QLSGELALPQLSGAGPGGGLEVQGVQLDEA Decoy 25 54 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 57 87 FALSE +APDPGWLALLALLPLLRMWLA Decoy 90 110 FALSE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 1 110 TRUE 
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 2 110 FALSE +CYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 3 110 FALSE +YNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 4 110 FALSE +NELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 5 110 FALSE +ELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 6 110 FALSE +LQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 7 110 FALSE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWL Decoy 2 109 FALSE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMW Decoy 2 108 FALSE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRM Decoy 2 107 FALSE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLR Decoy 2 106 FALSE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLL Decoy 2 105 FALSE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWL Decoy 1 109 TRUE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMW Decoy 1 108 TRUE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRM Decoy 1 107 TRUE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLR Decoy 1 106 TRUE 
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLL Decoy 1 105 TRUE +NCYNELQYLSCISTCCQEVIGRK Decoy 2 24 FALSE +CYNELQYLSCISTCCQEVIGRK Decoy 3 24 FALSE +YNELQYLSCISTCCQEVIGRK Decoy 4 24 FALSE +NELQYLSCISTCCQEVIGRK Decoy 5 24 FALSE +ELQYLSCISTCCQEVIGRK Decoy 6 24 FALSE +LQYLSCISTCCQEVIGRK Decoy 7 24 FALSE +NCYNELQYLSCISTCCQEVIGR Decoy 2 23 FALSE +NCYNELQYLSCISTCCQEVIG Decoy 2 22 FALSE +NCYNELQYLSCISTCCQEVI Decoy 2 21 FALSE +NCYNELQYLSCISTCCQEV Decoy 2 20 FALSE +NCYNELQYLSCISTCCQE Decoy 2 19 FALSE +MNCYNELQYLSCISTCCQEVIGR Decoy 1 23 TRUE +MNCYNELQYLSCISTCCQEVIG Decoy 1 22 TRUE +MNCYNELQYLSCISTCCQEVI Decoy 1 21 TRUE +MNCYNELQYLSCISTCCQEV Decoy 1 20 TRUE +MNCYNELQYLSCISTCCQE Decoy 1 19 TRUE +LSGELALPQLSGAGPGGGLEVQGVQLDEA Decoy 26 54 FALSE +SGELALPQLSGAGPGGGLEVQGVQLDEA Decoy 27 54 FALSE +GELALPQLSGAGPGGGLEVQGVQLDEA Decoy 28 54 FALSE +ELALPQLSGAGPGGGLEVQGVQLDEA Decoy 29 54 FALSE +LALPQLSGAGPGGGLEVQGVQLDEA Decoy 30 54 FALSE +QLSGELALPQLSGAGPGGGLEVQGVQLDE Decoy 25 53 FALSE +QLSGELALPQLSGAGPGGGLEVQGVQLD Decoy 25 52 FALSE +QLSGELALPQLSGAGPGGGLEVQGVQL Decoy 25 51 FALSE +QLSGELALPQLSGAGPGGGLEVQGVQ Decoy 25 50 FALSE +QLSGELALPQLSGAGPGGGLEVQGV Decoy 25 49 FALSE +TKPTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 58 87 FALSE +KPTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 59 87 FALSE +PTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 60 87 FALSE +TYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 61 87 FALSE +YFFGREGCVLYLAEVLHSGCLHQNVF Decoy 62 87 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCLHQNV Decoy 57 86 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCLHQN Decoy 57 85 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCLHQ Decoy 57 84 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCLH Decoy 57 83 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCL Decoy 57 82 FALSE +PDPGWLALLALLPLLRMWLA Decoy 91 110 FALSE +DPGWLALLALLPLLRMWLA Decoy 92 110 FALSE +PGWLALLALLPLLRMWLA Decoy 93 110 FALSE +GWLALLALLPLLRMWLA Decoy 94 110 FALSE +WLALLALLPLLRMWLA Decoy 95 110 FALSE +APDPGWLALLALLPLLRMWL Decoy 90 109 FALSE +APDPGWLALLALLPLLRMW Decoy 90 108 FALSE +APDPGWLALLALLPLLRM 
Decoy 90 107 FALSE +APDPGWLALLALLPLLR Decoy 90 106 FALSE +APDPGWLALLALLPLL Decoy 90 105 FALSE +QLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 25 110 FALSE diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index 59c393b39..7f0449d12 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -262,6 +262,9 @@ Always + + Always + Always diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs index ccc6d950e..d235759ed 100644 --- a/mzLib/Test/TestPeptideWithSetMods.cs +++ b/mzLib/Test/TestPeptideWithSetMods.cs @@ -1046,67 +1046,177 @@ public static void TestPeptideWithSetModsReturnsTruncationsInTopDown() List insulinTruncations = insulin.Digest(new DigestionParams(protease: protease.Name), new List(), new List(), topDownTruncationSearch: true).ToList(); Assert.AreEqual(68, insulinTruncations.Count); } - [Test] public static void TestPeptideWithSetModsReturnsDecoyTruncationsInTopDown() { + // PURPOSE + // Generate a comprehensive TSV of all top-down truncation peptides for target and decoy insulin entries, + // then compare the result against the stored expected file for regression checking. 
+ // + // OUTPUT COLUMNS + // - Sequence: peptide/proteoform base sequence + // - Type: "Target" or "Decoy" based on parent protein + // - Begin: one-based start residue within the parent protein + // - End: one-based end residue within the parent protein + // - RetainedMethionine: TRUE when the peptide includes the protein’s N-terminal Met (Begin == 1 and Parent.BaseSequence[0] == 'M'), else FALSE + + // Arrange: load insulin with reverse decoys and truncations enabled string xmlDatabase = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", "humanInsulin.xml"); - List insulinProteins = ProteinDbLoader.LoadProteinXML(xmlDatabase, true, - DecoyType.Reverse, null, false, null, out var unknownModifications, addTruncations: true); + List insulinProteins = ProteinDbLoader.LoadProteinXML( + xmlDatabase, + generateTargets: true, + decoyType: DecoyType.Reverse, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + unknownModifications: out var unknownModifications, + addTruncations: true); + + Assert.That(insulinProteins.Any(p => !p.IsDecoy), "Expected at least one target protein"); + Assert.That(insulinProteins.Any(p => p.IsDecoy), "Expected at least one decoy protein"); + Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from insulin XML"); + + // Digest: enumerate truncations for a representative target/decoy pair (parity sanity) + static string Reverse(string s) => new string(s.Reverse().ToArray()); + var target = insulinProteins.First(p => !p.IsDecoy); + string expectedDecoySeq = target.BaseSequence.Length > 0 && target.BaseSequence[0] == 'M' + ? "M" + Reverse(target.BaseSequence.Substring(1)) + : Reverse(target.BaseSequence); + var decoy = insulinProteins.FirstOrDefault(p => p.IsDecoy && p.BaseSequence == expectedDecoySeq) + ?? 
insulinProteins.First(p => p.IsDecoy && p.Length == target.Length); + + Assert.IsFalse(string.IsNullOrWhiteSpace(target.Accession)); + Assert.IsTrue(decoy.Accession.StartsWith("DECOY_")); + Assert.AreEqual(target.Length, decoy.Length); + Assert.AreNotEqual(target.BaseSequence, decoy.BaseSequence); + Assert.AreEqual(expectedDecoySeq, decoy.BaseSequence, "Decoy must follow 'retain M, reverse remainder' rule"); + + var dp = new DigestionParams(protease: "top-down"); + List targetTruncs = target + .Digest(dp, new List(), new List(), topDownTruncationSearch: true) + .Cast().ToList(); + List decoyTruncs = decoy + .Digest(dp, new List(), new List(), topDownTruncationSearch: true) + .Cast().ToList(); + + // Parity and sanity checks for the selected pair + Assert.AreEqual(68, targetTruncs.Count, "Target should yield 68 truncation products in top-down mode"); + Assert.AreEqual(68, decoyTruncs.Count, "Decoy should yield 68 truncation products in top-down mode"); + Assert.That(targetTruncs.All(p => p.DigestionParams?.DigestionAgent?.Name == "top-down")); + Assert.That(decoyTruncs.All(p => p.DigestionParams?.DigestionAgent?.Name == "top-down")); + Assert.AreEqual(targetTruncs.Count, targetTruncs.Select(p => p.BaseSequence).Distinct().Count()); + Assert.AreEqual(decoyTruncs.Count, decoyTruncs.Select(p => p.BaseSequence).Distinct().Count()); + Assert.IsTrue(targetTruncs.Any(p => p.BaseSequence == target.BaseSequence)); + Assert.IsTrue(decoyTruncs.Any(p => p.BaseSequence == decoy.BaseSequence)); + + // Build the table rows + static bool HasRetainedMet(PeptideWithSetModifications p) => + p.OneBasedStartResidueInProtein == 1 && + p.Parent?.BaseSequence?.Length > 0 && + p.Parent.BaseSequence[0] == 'M'; + + // We only compare the combined truncations for the chosen target/decoy in this test (68 + 68 rows) + var rows = targetTruncs.Concat(decoyTruncs) + .Select(pep => + { + bool isDecoy = (pep.Parent as Protein)?.IsDecoy == true; + string type = isDecoy ? 
"Decoy" : "Target"; + string retained = HasRetainedMet(pep) ? "TRUE" : "FALSE"; // normalize to match expected + return string.Join("\t", new[] + { + pep.BaseSequence, + type, + pep.OneBasedStartResidueInProtein.ToString(), + pep.OneBasedEndResidueInProtein.ToString(), + retained + }); + }) + // Sort deterministically to avoid platform/iteration-order differences + .OrderBy(r => r, StringComparer.Ordinal) + .ToList(); + + var header = "Sequence\tType\tBegin\tEnd\tRetainedMethionine"; + var outputLines = new List(capacity: rows.Count + 1) { header }; + outputLines.AddRange(rows); + + // Persist the generated table for inspection + string workPath = Path.Combine(TestContext.CurrentContext.WorkDirectory, "topdown_truncations_table.tsv"); + File.WriteAllLines(workPath, outputLines); + Console.WriteLine($"Generated truncation table: {workPath}"); + + // Load expected and compare + string expectedPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "truncationsExpected.tsv"); + Assert.That(File.Exists(expectedPath), $"Expected file not found: {expectedPath}"); + var expectedAll = File.ReadAllLines(expectedPath) + .Where(l => l is not null) + .Select(l => l.TrimEnd('\r', '\n')) + .ToList(); + + Assert.That(expectedAll.Count > 0, "Expected file is empty"); + string expectedHeader = expectedAll[0]; + var expectedRows = expectedAll.Skip(1) + .Where(l => !string.IsNullOrWhiteSpace(l)) + .OrderBy(l => l, StringComparer.Ordinal) + .ToList(); + + // Header check + if (!string.Equals(header, expectedHeader, StringComparison.Ordinal)) + { + TestContext.Out.WriteLine($"Header mismatch:"); + TestContext.Out.WriteLine($" Expected: {expectedHeader}"); + TestContext.Out.WriteLine($" Actual : {header}"); + } - Protease protease = new Protease("top-down", CleavageSpecificity.None, "", "", new List(), null); - List insulintTargetTruncations = insulinProteins.Where(p=>!p.IsDecoy).First().Digest(new DigestionParams(protease: protease.Name), new List(), new List(), 
topDownTruncationSearch: true).ToList(); - Assert.AreEqual(68, insulintTargetTruncations.Count); - List insulintDecoyTruncations = insulinProteins.Where(p => p.IsDecoy).First().Digest(new DigestionParams(protease: protease.Name), new List(), new List(), topDownTruncationSearch: true).ToList(); - Assert.AreEqual(68, insulintDecoyTruncations.Count); - } + // Multiset comparison for rows (counts of duplicates matter) + static Dictionary ToCounts(IEnumerable lines) + => lines.GroupBy(x => x, StringComparer.Ordinal) + .ToDictionary(g => g.Key, g => g.Count(), StringComparer.Ordinal); - [Test] - public static void CheckFullChemicalFormula() - { - PeptideWithSetModifications small_pep = new PeptideWithSetModifications(new Protein("PEPTIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, new Dictionary(), 0, null); - ChemicalFormula small_pep_cf = ChemicalFormula.ParseFormula("C34H53N7O15"); - Assert.AreEqual(small_pep.FullChemicalFormula, small_pep_cf); + var expCounts = ToCounts(expectedRows); + var gotCounts = ToCounts(rows); - PeptideWithSetModifications large_pep = new PeptideWithSetModifications(new Protein("PEPTIDEKRNSPEPTIDEKECUEIRQUV", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 28, CleavageSpecificity.Full, null, 0, new Dictionary(), 0, null); - ChemicalFormula large_pep_cf = ChemicalFormula.ParseFormula("C134H220N38O50S1Se2"); - Assert.AreEqual(large_pep.FullChemicalFormula, large_pep_cf); + var missing = new List(); // in expected more times than in actual + var extra = new List(); // in actual more times than in expected + + foreach (var kv in expCounts) + { + gotCounts.TryGetValue(kv.Key, out int got); + if (got < kv.Value) + { + int deficit = kv.Value - got; + for (int i = 0; i < deficit; i++) missing.Add(kv.Key); + } + } + foreach (var kv in gotCounts) + { + expCounts.TryGetValue(kv.Key, out int exp); + if (kv.Value > exp) + { + int surplus = kv.Value - exp; + for (int i = 0; i < surplus; i++) 
extra.Add(kv.Key); + } + } - ModificationMotif.TryGetMotif("S", out ModificationMotif motif_s); - Modification phosphorylation = new Modification(_originalId: "phospho", _modificationType: "CommonBiological", _target: motif_s, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H1O3P1")); - Dictionary modDict_small = new Dictionary(); - modDict_small.Add(4, phosphorylation); + if (missing.Count == 0 && extra.Count == 0) + { + TestContext.Out.WriteLine("Top-down truncation table matches expected."); + TestContext.Out.WriteLine("Sample (first 5 rows):"); + foreach (var l in outputLines.Take(6)) TestContext.Out.WriteLine(l); + } + else + { + TestContext.Out.WriteLine("Top-down truncation table differs from expected."); + TestContext.Out.WriteLine($"Missing rows (expected but not found or under-counted): {missing.Count}"); + foreach (var l in missing.Take(20)) TestContext.Out.WriteLine($" MISSING: {l}"); + if (missing.Count > 20) TestContext.Out.WriteLine($" ...and {missing.Count - 20} more"); - PeptideWithSetModifications small_pep_mod = new PeptideWithSetModifications(new Protein("PEPSIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, modDict_small, 0, null); - ChemicalFormula small_pep_mod_cf = ChemicalFormula.ParseFormula("C33H52N7O18P1"); - Assert.AreEqual(small_pep_mod.FullChemicalFormula, small_pep_mod_cf); + TestContext.Out.WriteLine($"Extra rows (found but not expected or over-counted): {extra.Count}"); + foreach (var l in extra.Take(20)) TestContext.Out.WriteLine($" EXTRA: {l}"); + if (extra.Count > 20) TestContext.Out.WriteLine($" ...and {extra.Count - 20} more"); - ModificationMotif.TryGetMotif("K", out ModificationMotif motif_k); - Modification acetylation = new Modification(_originalId: "acetyl", _modificationType: "CommonBiological", _target: motif_k, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3O")); - Dictionary modDict_large = 
new Dictionary(); - modDict_large.Add(4, phosphorylation); - modDict_large.Add(11, phosphorylation); - modDict_large.Add(8, acetylation); - - PeptideWithSetModifications large_pep_mod = new PeptideWithSetModifications(new Protein("PEPSIDEKRNSPEPTIDEKECUEIRQUV", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 28, CleavageSpecificity.Full, null, 0, modDict_large, 0, null); - ChemicalFormula large_pep_mod_cf = ChemicalFormula.ParseFormula("C135H223N38O57P2S1Se2"); - Assert.AreEqual(large_pep_mod.FullChemicalFormula, large_pep_mod_cf); - - ModificationMotif.TryGetMotif("C", out var motif_c); - ModificationMotif.TryGetMotif("G", out var motif_g); - Dictionary modDict = - new() - { - { "Carbamidomethyl on C", new Modification(_originalId: "Carbamidomethyl", _modificationType: "Common Fixed", - _target: motif_c, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3ON")) }, - { "BS on G" , new Modification(_originalId: "BS on G", _modificationType: "BS", _target: motif_g, _monoisotopicMass: 96.0875)} - }; - PeptideWithSetModifications pwsmWithMissingCfMods = new PeptideWithSetModifications( - "ENQGDETQG[Speculative:BS on G]C[Common Fixed:Carbamidomethyl on C]PPQR", modDict, p: new Protein("ENQGDETQGCPPQR", "FakeProtein"), digestionParams: new DigestionParams(), - oneBasedStartResidueInProtein: 1, oneBasedEndResidueInProtein: 14); - Assert.Null(pwsmWithMissingCfMods.FullChemicalFormula); + Assert.Fail($"Generated top-down truncation table does not match expected.\nExpected file: {expectedPath}\nActual file: {workPath}\nMissing: {missing.Count}, Extra: {extra.Count}"); + } } - [Test] public static void CheckMostAbundantMonoisotopicMass() { From 829208c5390a280eda2176e301aeb9de56ee0a00 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 16:24:54 -0500 Subject: [PATCH 25/38] name changeroo --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 15 ++++----- mzLib/Omics/BioPolymer/VariantApplication.cs | 32 +++++++++---------- 
mzLib/Omics/BioPolymer/VariantCallFormat.cs | 2 +- mzLib/Test/DatabaseTests/TestProteinReader.cs | 6 ++-- .../DatabaseTests/TestProteomicsReadWrite.cs | 8 ++--- .../Test/DatabaseTests/TestVariantProtein.cs | 4 +-- mzLib/Test/FlashLFQ/TestIsoTracker.cs | 32 +++++++++---------- .../DecoyGeneration/DecoyProteinGenerator.cs | 16 +++++----- .../DecoyGeneration/RnaDecoyGenerator.cs | 4 +-- .../ProteinDbWriter.cs | 6 ++-- 10 files changed, 61 insertions(+), 64 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index 63619b1c8..875f556bd 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -20,7 +20,7 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str OneBasedEndPosition = oneBasedEndPosition; OriginalSequence = originalSequence ?? ""; VariantSequence = variantSequence ?? ""; - Description = new VariantCallFormat(description); + VariantCallFormatDataString = new VariantCallFormat(description); OneBasedModifications = oneBasedModifications ?? new Dictionary>(); } @@ -58,14 +58,11 @@ public SequenceVariation(int oneBasedPosition, string originalSequence, string v public string VariantSequence { get; } /// - /// Description of this variation (optional) + /// VariantCallFormatDataString of this variation (optional) /// - public VariantCallFormat Description { get; } + public VariantCallFormat? VariantCallFormatDataString { get; } + - /// Optional multi-sample VCF record describing the variant (can be null or collapsed). - public VariantCallFormat? VariantCallFormatData { get; } - [Obsolete("Use VariantCallFormatData for structured data or Description/SearchableAnnotation for text.")] - public VariantCallFormat? 
LegacyVariantDescription => VariantCallFormatData; /// /// Modifications specifically for this variant /// @@ -79,7 +76,7 @@ public override bool Equals(object obj) && OneBasedEndPosition == s.OneBasedEndPosition && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence)) && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence)) - && (s.Description == null && Description == null || Description.Equals(s.Description)) + && (s.VariantCallFormatDataString == null && VariantCallFormatDataString == null || VariantCallFormatDataString.Equals(s.VariantCallFormatDataString)) && (s.OneBasedModifications == null && OneBasedModifications == null || s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList()) && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList())); @@ -91,7 +88,7 @@ public override int GetHashCode() ^ OneBasedEndPosition.GetHashCode() ^ OriginalSequence.GetHashCode() // null handled in constructor ^ VariantSequence.GetHashCode() // null handled in constructor - ^ Description.GetHashCode(); // always constructed in constructor + ^ VariantCallFormatDataString.GetHashCode(); // always constructed in constructor } /// diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 1669c3afe..897ffb9a7 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -25,7 +25,7 @@ public static List GetVariantBioPolymers(this { protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); - if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.Description == null || v.Description.Genotypes.Count == 0)) + if (protein.SequenceVariations.All(v 
=> v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0)) { // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList(); @@ -100,7 +100,7 @@ public static List ApplyVariants(TBioPolymerTy List uniqueEffectsToApply = sequenceVariations .GroupBy(v => v.SimpleString()) .Select(x => x.First()) - .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line + .Where(v => v.VariantCallFormatDataString.Genotypes.Count > 0) // this is a VCF line .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first .ToList(); @@ -112,7 +112,7 @@ public static List ApplyVariants(TBioPolymerTy return new List { proteinCopy }; } - HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys)); + HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.VariantCallFormatDataString.Genotypes.Keys)); List variantProteins = new(); List newVariantProteins = new(); // loop through genotypes for each sample/individual (e.g. 
tumor and normal) @@ -121,17 +121,17 @@ public static List ApplyVariants(TBioPolymerTy newVariantProteins.Clear(); newVariantProteins.Add(proteinCopy); - bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; + bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.VariantCallFormatDataString.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; foreach (var variant in uniqueEffectsToApply) { - bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff + bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff if (!variantAlleleIsInTheGenotype) { continue; } - bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now. 
- bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; - bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; + bool isHomozygousAlternate = variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now. + bool isDeepReferenceAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; + bool isDeepAlternateAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][variant.VariantCallFormatDataString.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; // homozygous alternate if (isHomozygousAlternate && isDeepAlternateAllele) @@ -141,7 +141,7 @@ public static List ApplyVariants(TBioPolymerTy // heterozygous basic // first protein with variants contains all homozygous variation, second contains all variations - else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants) + else if (variant.VariantCallFormatDataString.Heterozygous[individual] && tooManyHeterozygousVariants) { if (isDeepAlternateAllele && isDeepReferenceAllele) { @@ -170,7 +170,7 @@ public static List ApplyVariants(TBioPolymerTy } // heterozygous combinitorics - else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) + else if (variant.VariantCallFormatDataString.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) { List combinitoricProteins = 
new(); @@ -179,7 +179,7 @@ public static List ApplyVariants(TBioPolymerTy if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele) { // keep reference allele - if (variant.Description.Genotypes[individual].Contains("0")) + if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0")) { combinitoricProteins.Add(ppp); } @@ -191,7 +191,7 @@ public static List ApplyVariants(TBioPolymerTy { combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); } - else if (variant.Description.Genotypes[individual].Contains("0")) + else if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0")) { combinitoricProteins.Add(ppp); } @@ -224,7 +224,7 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1, variantGettingApplied.OriginalSequence, variantGettingApplied.VariantSequence, - variantGettingApplied.Description.Description, + variantGettingApplied.VariantCallFormatDataString.Description, variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); // check to see if there is incomplete indel overlap, which would lead to weird variant sequences @@ -271,7 +271,7 @@ private static List AdjustSequenceVariationIndices(SequenceVa // variant was entirely before the one being applied (shouldn't happen because of order of applying variants) // or it's the current variation - if (v.Description.Equals(variantGettingApplied.Description) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) + if (v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) { variations.Add(v); } @@ -299,7 +299,7 @@ private static List AdjustSequenceVariationIndices(SequenceVa end, v.OriginalSequence, v.VariantSequence, - v.Description.Description, + 
v.VariantCallFormatDataString.Description, v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value))); } } @@ -425,7 +425,7 @@ private static string CombineSimpleStrings(IEnumerable? varia /// public static string CombineDescriptions(IEnumerable? variations) { - return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.Description)); + return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString)); } /// /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object, diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs index e0cbec9a7..7549726c0 100644 --- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs +++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs @@ -146,7 +146,7 @@ public VariantCallFormat(string description) Description = description; // Back-compat: if no real tabs are present but literal "\t" sequences are, - // normalize them to actual tabs for parsing only. Leave Description intact. + // normalize them to actual tabs for parsing only. Leave VariantCallFormatDataString intact. 
string parseLine = NormalizeTabsForParsing(description); // Parse description into VCF fields diff --git a/mzLib/Test/DatabaseTests/TestProteinReader.cs b/mzLib/Test/DatabaseTests/TestProteinReader.cs index 7dcd0b4d8..e0768eef4 100644 --- a/mzLib/Test/DatabaseTests/TestProteinReader.cs +++ b/mzLib/Test/DatabaseTests/TestProteinReader.cs @@ -125,7 +125,7 @@ public static void XmlTest() Assert.AreEqual(64, ok[0].SequenceVariations.First().OneBasedEndPosition); Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedEndPosition); - Assert.AreNotEqual(ok[0].SequenceVariations.First().Description, ok[1].SequenceVariations.First().Description); //decoys and target variations don't have the same desc. + Assert.AreNotEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok[1].SequenceVariations.First().VariantCallFormatDataString); //decoys and target variations don't have the same desc. 
Assert.AreEqual("Homo sapiens", ok[1].Organism); } @@ -420,8 +420,8 @@ public static void TestReverseDecoyXML_WithCustomIdentifier() foreach (var variant in protein.AppliedSequenceVariations) { - Assert.That(variant.Description, Does.StartWith("rev")); - Assert.That(variant.Description, Does.Not.StartWith("DECOY")); + Assert.That(variant.VariantCallFormatDataString, Does.StartWith("rev")); + Assert.That(variant.VariantCallFormatDataString, Does.Not.StartWith("DECOY")); } foreach (var bond in protein.DisulfideBonds) diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs index babe44a76..e6df1c8d5 100644 --- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs +++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs @@ -504,12 +504,12 @@ public void TestFullProteinReadWrite() Assert.AreEqual(originalProtein.TruncationProducts.First().OneBasedEndPosition, proteinReadFromXml[0].TruncationProducts.First().OneBasedEndPosition); Assert.AreEqual(originalProtein.TruncationProducts.First().Type, proteinReadFromXml[0].TruncationProducts.First().Type.Split('(')[0]); - Assert.AreEqual(originalProtein.SequenceVariations.First().Description, proteinReadFromXml[0].SequenceVariations.First().Description); + Assert.AreEqual(originalProtein.SequenceVariations.First().VariantCallFormatDataString, proteinReadFromXml[0].SequenceVariations.First().VariantCallFormatDataString); Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedEndPosition); Assert.AreEqual(originalProtein.SequenceVariations.First().OriginalSequence, proteinReadFromXml[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(originalProtein.SequenceVariations.First().VariantSequence, 
proteinReadFromXml[0].SequenceVariations.First().VariantSequence); - Assert.AreEqual(originalProtein.SequenceVariations.Last().Description, proteinReadFromXml[0].SequenceVariations.Last().Description); + Assert.AreEqual(originalProtein.SequenceVariations.Last().VariantCallFormatDataString, proteinReadFromXml[0].SequenceVariations.Last().VariantCallFormatDataString); Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedBeginPosition); Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedEndPosition); Assert.AreEqual(originalProtein.SequenceVariations.Last().OriginalSequence, proteinReadFromXml[0].SequenceVariations.Last().OriginalSequence); @@ -534,7 +534,7 @@ public void TestReadWriteSeqVars() Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count()); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition); - Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description); + Assert.AreEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok2[0].SequenceVariations.First().VariantCallFormatDataString); Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence); } @@ -557,7 +557,7 @@ public void TestReadWriteSeqVars2() Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count()); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition); 
Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition); - Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description); + Assert.AreEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok2[0].SequenceVariations.First().VariantCallFormatDataString); Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence); } diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index edcc6cf5e..53bf78897 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -116,7 +116,7 @@ public static void SeqVarXmlTest() { Assert.AreEqual(s.OriginalSequence, decoy.BaseSequence.Substring(s.OneBasedBeginPosition - 1, s.OneBasedEndPosition - s.OneBasedBeginPosition + 1)); } - Assert.AreNotEqual(target.SequenceVariations.First().Description, decoy.SequenceVariations.First().Description); //decoys and target variations don't have the same desc. + Assert.AreNotEqual(target.SequenceVariations.First().VariantCallFormatDataString, decoy.SequenceVariations.First().VariantCallFormatDataString); //decoys and target variations don't have the same desc. 
List peptides = ok.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList(); } @@ -987,7 +987,7 @@ public void VariantSymbolWeirdnessXml() string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "SeqVarSymbolWeirdness.xml"); List variantProteins = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un); Assert.AreEqual(12, variantProteins.First().ConsensusVariant.SequenceVariations.Count()); - Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.Description.Heterozygous.Any(kv => kv.Value))); + Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.VariantCallFormatDataString.Heterozygous.Any(kv => kv.Value))); Assert.AreEqual(1, variantProteins.Count); // Should be 2^2 from combinitorics of heterozygous, but the giant indels overwrite them Assert.AreEqual(0, variantProteins.Where(v => v.BaseSequence == variantProteins.First().ConsensusVariant.BaseSequence).Count()); // Homozygous variations are included diff --git a/mzLib/Test/FlashLFQ/TestIsoTracker.cs b/mzLib/Test/FlashLFQ/TestIsoTracker.cs index 59720e4ff..f918e991d 100644 --- a/mzLib/Test/FlashLFQ/TestIsoTracker.cs +++ b/mzLib/Test/FlashLFQ/TestIsoTracker.cs @@ -25,7 +25,7 @@ internal class TestIsoTracker [Test] public static void TestIsobaricPeptideGroup() { - // Description: Test the IsobaricPeptideGroup class + // VariantCallFormatDataString: Test the IsobaricPeptideGroup class // In this testing, we will create a new IsobaricPeptideGroup and check the properties List ids = new List { @@ -126,7 +126,7 @@ public static void TestIsoTrackerIdFilter_FilterPeptide() [Test] public static void TestGetTargeMz_case1() { - // Description: Test the GetTargetMz function in FlashLfqEngine + // VariantCallFormatDataString: Test the GetTargetMz function in FlashLfqEngine // In this testing, we will check the isobaricPeptideGroup and targetMzs output // All three ids are 
isobaric peptides with the same monoisotopic mass, so they should be grouped together and generate only 5 target m/z values @@ -189,7 +189,7 @@ public static void TestGetTargeMz_case1() [Test] public static void TestGetTargeMz_case2() { - // Description: Test the GetTargetMz function in FlashLfqEngine + // VariantCallFormatDataString: Test the GetTargetMz function in FlashLfqEngine // In this testing, we will check the isobaricPeptideGroup and targetMzs output // All three ids are isobaric peptides with the different monoisotopic mass, so they should be grouped together and generate 15(3*5) target m/z values @@ -253,7 +253,7 @@ public static void TestGetTargeMz_case2() [Test] public static void TestIndexPeakPrune() { - // Description: Test the peak indexing engine pruning function + // VariantCallFormatDataString: Test the peak indexing engine pruning function // In this test, we will create the targetMzs from the ids to prune the indexPeaks. // After pruning, the index engine should only keep the peaks with the target m/z values. string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -329,7 +329,7 @@ public static void TestXICConstructor() [Test] public static void TestLinearSpline() { - //Description: Test the linear spline interpolation and differentiation + //VariantCallFormatDataString: Test the linear spline interpolation and differentiation //The testing model is a linear function y = 100x, where x is the time point and y is the intensity //The slope will be 100 and the second derivative will be 0 @@ -361,7 +361,7 @@ public static void TestLinearSpline() [Test] public static void TestPeakAlignment() { - //Description: Test the peak alignment function + //VariantCallFormatDataString: Test the peak alignment function //The testing model is a triangle peak with the Apex. 
//The Apex of three peaks are 3, 3.1, 2.9 min //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3 @@ -410,7 +410,7 @@ public static void TestPeakAlignment() [Test] public static void TestBuildSmoothedCubicSpline_LessPoint() { - //Description: Test the cubic spline interpolation + //VariantCallFormatDataString: Test the cubic spline interpolation //The testing model has less than 5 points that cannot build the cubic spline //The cubic spline should be null @@ -531,7 +531,7 @@ public static void TestXICGroupConstructor() [Test] public static void TestXICGroup_RtDict() { - //Description: Test the peakAlignment function in the XICGroup + //VariantCallFormatDataString: Test the peakAlignment function in the XICGroup //The testing has three normal distribution XIC peaks. //The Apex of three peaks are 3, 3.1, 2.9 min //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3 @@ -582,7 +582,7 @@ public static void TestXICGroup_RtDict() [Test] public static void TestXICGroup_IdList() { - //Description: Test the IdList in the XICGroup + //VariantCallFormatDataString: Test the IdList in the XICGroup //The testing model has three XICs, one of this XIC has no Id, then it borrows one Id from the first XIC //If the Id is borrowed, the Id will not be added into the IdList //The IdList should contain the Ids from the first and the third XIC @@ -617,7 +617,7 @@ public static void TestXICGroup_IdList() [Test] public static void TestXICGroup_Tracking() { - //Description: Test the peak tracking function in the XICGroup + //VariantCallFormatDataString: Test the peak tracking function in the XICGroup //The testing has three normal distribution XIC peaks. 
//The Apex of three peaks are 20, 23, 17 min //The time shift should be +3 min for the peak2 and -3 min for the peak3 @@ -688,7 +688,7 @@ public static void TestXICGroup_Tracking() [Test] public static void TestCombinedSearching() { - //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output + //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output //There are three XIC included isobaric peaks that with 3 min gap. string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -796,7 +796,7 @@ public static void TestCombinedSearching() [Test] public static void TestPeakOutput() { - //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output + //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output //There are three XIC included isobaric peaks that with 3 min gap. 
string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -931,7 +931,7 @@ public static void TestPeakOutput() [Test] public static void TestIsoSequence_Ambiguous() { - //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID + //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID //IsoID: DIVENY[Common Variable:Oxidation on M]FMR should be the same as DIVENYFM[Common Variable:Oxidation on M]R //Try to turn on the MBR and Isotracker at the same time @@ -1066,7 +1066,7 @@ public static void TestIsoSequence_Ambiguous() [Test] public static void TestIsoSequence_MonoIsotopicMassTolerance() { - //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID + //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID //IsoID: DIVENY[Common Variable:Oxidation on M]FMR should be the same as DIVENYFM[Common Variable:Oxidation on M]R //The Monoisotopic mass are 1201.5436, 1201.5437, 1201.5438, they should be recognized as the same IsoID @@ -1491,7 +1491,7 @@ public static void TestIsoSequence_CombinedTesting() [Test] public static void TestRun_SearchingTarget() { - //Description: we will upload a motifList for IsoTracker + //VariantCallFormatDataString: we will upload a motifList for IsoTracker //Only peptide with motif on N can be searched //In this case, only one kind of peptide can be searched: baseSequence PEPNINEN -> PEPN[Mod]INEN, PEPNIN[Mod]EN, PEPNINEN[Mod] // Run 1 with PEPNIN[Mod]EN, PEPNINEN[Mod] @@ -1615,7 +1615,7 @@ public static void TestRun_SearchingTarget() [Test] public static void TestRun_IDChecking() { - //Description: we will turn on the IDchecking for IsoTracker + //VariantCallFormatDataString: we will turn on the IDchecking for IsoTracker //Only when one XIC with more than one id, we 
do the searching //In this case, run 1 has 4 ids (pepA_1, pepA_2, pepB_1, pepC_1) //run 2 has 3 ids (pepA_1, pepB_1, pepC_1) diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs index c83c80aa1..87264702e 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs @@ -239,34 +239,34 @@ private static List ReverseSequenceVariations(IEnumerable 1 || sv.VariantSequence.Length > 1)) { string original = new string(originalArray).Substring(0, originalArray.Length - 1); string variant = new string(variationArray).Substring(0, variationArray.Length - 1); - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications)); } // gained an initiating methionine else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1) { - decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications)); } // starting methionine, but no variations on it else if (startsWithM) { - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, 
new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications)); } // no starting methionine else { - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications)); } } return decoyVariations; @@ -335,7 +335,7 @@ private static List GenerateSlideDecoys(List proteins, int max { variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, true)]; } - decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.Description)); + decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.VariantCallFormatDataString)); } else { @@ -352,7 +352,7 @@ private static List GenerateSlideDecoys(List proteins, int max variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, initMet)]; } - decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, 
sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.Description)); + decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString)); } } var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide, diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs index cc7723c15..3786c969d 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs @@ -87,7 +87,7 @@ private static List GenerateReverseDecoys(List nucleicAcids, int maxThr var reverseModKey = indexMapping[modKvp.Key]; reverseModificationsForVariation.Add(reverseModKey, modKvp.Value); } - reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation)); + reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.VariantCallFormatDataString.Description, reverseModificationsForVariation)); } // Reverse Applied Variants @@ -101,7 +101,7 @@ private static List GenerateReverseDecoys(List nucleicAcids, int maxThr var reverseModKey = indexMapping[modKvp.Key]; reverseModificationsForVariation.Add(reverseModKey, modKvp.Value); } - reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation)); + reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, 
variation.VariantSequence, variation.VariantCallFormatDataString.Description, reverseModificationsForVariation)); } // Reverse Truncations diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 24ff44dd4..5eaa9897a 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -206,7 +206,7 @@ public static Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary Date: Fri, 24 Oct 2025 16:33:31 -0500 Subject: [PATCH 26/38] two new vcf constructors in SequenceVariation --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 22 ++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index 875f556bd..c5bf9cbae 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -6,6 +6,26 @@ namespace Omics.BioPolymer { public class SequenceVariation { + public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, VariantCallFormat variantCallFormat, Dictionary>? oneBasedModifications = null) + { + OneBasedBeginPosition = oneBasedBeginPosition; + OneBasedEndPosition = oneBasedEndPosition; + OriginalSequence = originalSequence ?? ""; + VariantSequence = variantSequence ?? ""; + Description = description; + VariantCallFormatDataString = variantCallFormat; + OneBasedModifications = oneBasedModifications ?? new Dictionary>(); + } + public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string variantCallFormatStringRepresentation, Dictionary>? 
oneBasedModifications = null) + { + OneBasedBeginPosition = oneBasedBeginPosition; + OneBasedEndPosition = oneBasedEndPosition; + OriginalSequence = originalSequence ?? ""; + VariantSequence = variantSequence ?? ""; + Description = description; + VariantCallFormatDataString = new VariantCallFormat(variantCallFormatStringRepresentation); + OneBasedModifications = oneBasedModifications ?? new Dictionary>(); + } /// /// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions. /// @@ -56,7 +76,7 @@ public SequenceVariation(int oneBasedPosition, string originalSequence, string v /// Variant sequence information (required) /// public string VariantSequence { get; } - + public string Description { get; } /// /// VariantCallFormatDataString of this variation (optional) /// From 36cc51b8943fc492f5608450f2bce437dc12fe67 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 16:45:05 -0500 Subject: [PATCH 27/38] close --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 42 ++++++++++++++++----- 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index c5bf9cbae..776b175fc 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -26,24 +26,44 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str VariantCallFormatDataString = new VariantCallFormat(variantCallFormatStringRepresentation); OneBasedModifications = oneBasedModifications ?? new Dictionary>(); } - /// - /// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions. - /// - /// - /// - /// - /// - /// public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary>? 
oneBasedModifications = null) { OneBasedBeginPosition = oneBasedBeginPosition; OneBasedEndPosition = oneBasedEndPosition; OriginalSequence = originalSequence ?? ""; VariantSequence = variantSequence ?? ""; - VariantCallFormatDataString = new VariantCallFormat(description); + + if (LooksLikeVcf(description)) + { + Description = "VCF Data"; + VariantCallFormatDataString = new VariantCallFormat(description); + } + else + { + Description = description; + VariantCallFormatDataString = null; + } OneBasedModifications = oneBasedModifications ?? new Dictionary>(); } + ///// + ///// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions. + ///// + ///// + ///// + ///// + ///// + ///// + //public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) + //{ + // OneBasedBeginPosition = oneBasedBeginPosition; + // OneBasedEndPosition = oneBasedEndPosition; + // OriginalSequence = originalSequence ?? ""; + // VariantSequence = variantSequence ?? ""; + // VariantCallFormatDataString = new VariantCallFormat(description); + // OneBasedModifications = oneBasedModifications ?? new Dictionary>(); + //} + /// /// For variations with only position information (not begin and end). /// Sets the end to the end of the original protein sequence to which this variation applies. 
@@ -184,5 +204,9 @@ public bool AreValid() { return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition; } + private static bool LooksLikeVcf(string s) + => !string.IsNullOrWhiteSpace(s) + && (s.Contains("\t") || s.Contains("\\t")) + && (s.Contains("GT:") || s.Contains(":GT:") || s.Contains(" ANN=") || s.Contains("\tANN=")); } } \ No newline at end of file From 20f9f9d16e8714a3063758052782371e76a75d8a Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 16:48:00 -0500 Subject: [PATCH 28/38] h --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 25 +++++++++++---------- 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index 776b175fc..dbd71f82b 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -112,23 +112,24 @@ public override bool Equals(object obj) { SequenceVariation s = obj as SequenceVariation; return s != null - && OneBasedBeginPosition == s.OneBasedBeginPosition - && OneBasedEndPosition == s.OneBasedEndPosition - && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence)) - && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence)) - && (s.VariantCallFormatDataString == null && VariantCallFormatDataString == null || VariantCallFormatDataString.Equals(s.VariantCallFormatDataString)) - && (s.OneBasedModifications == null && OneBasedModifications == null || - s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList()) - && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList())); + && OneBasedBeginPosition == s.OneBasedBeginPosition + && OneBasedEndPosition == s.OneBasedEndPosition + && (s.OriginalSequence == null && OriginalSequence == null || 
OriginalSequence.Equals(s.OriginalSequence)) + && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence)) + && ((s.VariantCallFormatDataString == null && VariantCallFormatDataString == null) + || (VariantCallFormatDataString != null && VariantCallFormatDataString.Equals(s.VariantCallFormatDataString))) + && (s.OneBasedModifications == null && OneBasedModifications == null || + s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList()) + && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList())); } public override int GetHashCode() { return OneBasedBeginPosition.GetHashCode() - ^ OneBasedEndPosition.GetHashCode() - ^ OriginalSequence.GetHashCode() // null handled in constructor - ^ VariantSequence.GetHashCode() // null handled in constructor - ^ VariantCallFormatDataString.GetHashCode(); // always constructed in constructor + ^ OneBasedEndPosition.GetHashCode() + ^ OriginalSequence.GetHashCode() // null handled in constructor + ^ VariantSequence.GetHashCode() // null handled in constructor + ^ (VariantCallFormatDataString?.GetHashCode() ?? Description.GetHashCode()); // fallback to non-null Description } /// From 36d4df23d21ec8089a58469c913bf534e7ad7901 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 16:48:44 -0500 Subject: [PATCH 29/38] u --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 118 +++----------------- 1 file changed, 15 insertions(+), 103 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index dbd71f82b..af32bd3cc 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -46,165 +46,77 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str OneBasedModifications = oneBasedModifications ?? 
new Dictionary>(); } - ///// - ///// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions. - ///// - ///// - ///// - ///// - ///// - ///// - //public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) - //{ - // OneBasedBeginPosition = oneBasedBeginPosition; - // OneBasedEndPosition = oneBasedEndPosition; - // OriginalSequence = originalSequence ?? ""; - // VariantSequence = variantSequence ?? ""; - // VariantCallFormatDataString = new VariantCallFormat(description); - // OneBasedModifications = oneBasedModifications ?? new Dictionary>(); - //} - - /// - /// For variations with only position information (not begin and end). - /// Sets the end to the end of the original protein sequence to which this variation applies. - /// - /// - /// - /// - /// - /// public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications) { } - /// - /// Beginning position of original sequence to be replaced - /// public int OneBasedBeginPosition { get; } - - /// - /// End position of original sequence to be replaced - /// public int OneBasedEndPosition { get; } - - /// - /// Original sequence information (optional) - /// public string OriginalSequence { get; } - - /// - /// Variant sequence information (required) - /// public string VariantSequence { get; } public string Description { get; } - /// - /// VariantCallFormatDataString of this variation (optional) - /// public VariantCallFormat? 
VariantCallFormatDataString { get; } - - - /// - /// Modifications specifically for this variant - /// public Dictionary> OneBasedModifications { get; } public override bool Equals(object obj) { SequenceVariation s = obj as SequenceVariation; return s != null - && OneBasedBeginPosition == s.OneBasedBeginPosition - && OneBasedEndPosition == s.OneBasedEndPosition - && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence)) - && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence)) - && ((s.VariantCallFormatDataString == null && VariantCallFormatDataString == null) - || (VariantCallFormatDataString != null && VariantCallFormatDataString.Equals(s.VariantCallFormatDataString))) - && (s.OneBasedModifications == null && OneBasedModifications == null || - s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList()) - && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList())); + && OneBasedBeginPosition == s.OneBasedBeginPosition + && OneBasedEndPosition == s.OneBasedEndPosition + && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence)) + && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence)) + && ((s.VariantCallFormatDataString == null && VariantCallFormatDataString == null) + || (VariantCallFormatDataString != null && VariantCallFormatDataString.Equals(s.VariantCallFormatDataString))) + && (s.OneBasedModifications == null && OneBasedModifications == null || + s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList()) + && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList())); } public override int GetHashCode() { return OneBasedBeginPosition.GetHashCode() - ^ 
OneBasedEndPosition.GetHashCode() - ^ OriginalSequence.GetHashCode() // null handled in constructor - ^ VariantSequence.GetHashCode() // null handled in constructor - ^ (VariantCallFormatDataString?.GetHashCode() ?? Description.GetHashCode()); // fallback to non-null Description + ^ OneBasedEndPosition.GetHashCode() + ^ OriginalSequence.GetHashCode() + ^ VariantSequence.GetHashCode() + ^ (VariantCallFormatDataString?.GetHashCode() ?? 0); } - /// - /// Returns a simple string represantation of this amino acid change - /// - /// public string SimpleString() { return OriginalSequence + OneBasedBeginPosition.ToString() + VariantSequence; } - /// - /// Determines whether this interval overlaps the queried interval - /// - /// - /// internal bool Intersects(SequenceVariation segment) { return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition; } - /// - /// Determines whether this interval overlaps the queried interval - /// - /// - /// internal bool Intersects(TruncationProduct segment) { return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition; } - /// - /// Determines whether this interval overlaps the queried position - /// - /// - /// internal bool Intersects(int pos) { return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition; } - /// - /// Determines whether this interval includes the queried interval - /// - /// - /// internal bool Includes(SequenceVariation segment) { return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition; } - // Commented out by AVC on 4/5/23. Unused and untested in current code base, - // but can't rule out that it could be useful in the future. 
- /// - /// Determines whether this interval includes the queried interval - /// - /// - /// - // internal bool Includes(TruncationProduct segment) - // { - // return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition; - // } - /// - /// Determines whether this interval overlaps the queried position - /// - /// - /// internal bool Includes(int pos) { return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition; } + public bool AreValid() { return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition; } + private static bool LooksLikeVcf(string s) => !string.IsNullOrWhiteSpace(s) && (s.Contains("\t") || s.Contains("\\t")) From 4f1c9b079426ec71056484c00285acd9c41c6c81 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 16:53:24 -0500 Subject: [PATCH 30/38] l --- mzLib/Omics/BioPolymer/VariantApplication.cs | 243 ++++++------------- 1 file changed, 70 insertions(+), 173 deletions(-) diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 897ffb9a7..d6ab5b98a 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -4,22 +4,8 @@ namespace Omics.BioPolymer { - /// - /// Provides methods for applying sequence variations to proteins and handling modifications on variant sequences. - /// - /// - /// Originally by A. Cesnik on 11/2/18, updated on 4/25/23. NB moved it and generalized for use in Transcriptomics on 3/25/25. - /// public static class VariantApplication { - /// - /// Creates a list of IBioPolymers of the same type as the original protein, each with applied variants from this protein. 
- /// - /// Type of BioPolymer to create variants of - /// original to generate variants of - /// - /// - /// This replaces a method call that was previously an instance method in Protein public static List GetVariantBioPolymers(this TBioPolymerType protein, int maxAllowedVariantsForCombinatorics = 4, int minAlleleDepth = 1) where TBioPolymerType : IHasSequenceVariants { @@ -27,16 +13,11 @@ public static List GetVariantBioPolymers(this protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0)) { - // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList(); } - // this is a protein with only VCF lines - return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics, minAlleleDepth); + return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: maxAllowedVariantsForCombinatorics, minAlleleDepth); } - /// - /// Gets the name of a protein with applied variations - /// public static string? GetVariantName(string? name, IEnumerable? appliedVariations) { bool emptyVars = appliedVariations.IsNullOrEmpty(); @@ -47,43 +28,17 @@ public static List GetVariantBioPolymers(this return name + variantTag; } - /// - /// Gets the accession for a protein with applied variations - /// public static string GetAccession(IHasSequenceVariants protein, IEnumerable? appliedSequenceVariations) { return protein.ConsensusVariant.Accession + (appliedSequenceVariations.IsNullOrEmpty() ? 
"" : $"_{CombineSimpleStrings(appliedSequenceVariations)}"); } - /// - /// Determines if the modification falls on a variant amino acid - /// - /// true if a modification index on the protein falls within the applied variant - /// - /// A. Cesnik - 4/25/23 - /// Variants annotated in protein entries can be applied to a sequence, i.e. a change is made to the sequence. - /// One of the things Spritz can do that no other tool can do is enable finding modifications on these sites of variation, - /// since I amended the sequence variant XML entries to have modifications. - /// public static bool IsSequenceVariantModification(SequenceVariation? appliedVariant, int variantProteinIndex) { return appliedVariant != null && appliedVariant.Includes(variantProteinIndex); } - /// - /// Restores modification index on a variant protein to the index on the nonvariant protein, - /// or if it falls on a variant, this restores the position on the protein with only that variant - /// - /// Protein containing applied sequence variations - /// The one-based index of the amino acid residue bearing a modification - /// - /// - /// A. Cesnik - 4/25/23 - /// Useful for comparing modification indices on variant proteins to the original protein. - /// Variations can introduce length changes and other changes to the sequence, - /// so the indices of the modifications aren’t directly comparable, but this method makes that possible. 
- /// public static int RestoreModificationIndex(IHasSequenceVariants protein, int variantProteinModificationIndex) { return variantProteinModificationIndex - protein.AppliedSequenceVariations @@ -91,22 +46,18 @@ public static int RestoreModificationIndex(IHasSequenceVariants protein, int var .Sum(v => v.VariantSequence.Length - v.OriginalSequence.Length); } - /// - /// Applies multiple variant changes to a protein sequence - /// public static List ApplyVariants(TBioPolymerType protein, IEnumerable sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth) where TBioPolymerType : IHasSequenceVariants { List uniqueEffectsToApply = sequenceVariations .GroupBy(v => v.SimpleString()) .Select(x => x.First()) - .Where(v => v.VariantCallFormatDataString.Genotypes.Count > 0) // this is a VCF line - .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first + .Where(v => v.VariantCallFormatDataString.Genotypes.Count > 0) + .OrderByDescending(v => v.OneBasedBeginPosition) .ToList(); TBioPolymerType proteinCopy = protein.CreateVariant(protein.BaseSequence, protein, null, protein.TruncationProducts, protein.OneBasedPossibleLocalizedModifications, null); - // If there aren't any variants to apply, just return the base protein if (uniqueEffectsToApply.Count == 0) { return new List { proteinCopy }; @@ -115,7 +66,6 @@ public static List ApplyVariants(TBioPolymerTy HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.VariantCallFormatDataString.Genotypes.Keys)); List variantProteins = new(); List newVariantProteins = new(); - // loop through genotypes for each sample/individual (e.g. 
tumor and normal) foreach (string individual in individuals) { newVariantProteins.Clear(); @@ -124,23 +74,19 @@ public static List ApplyVariants(TBioPolymerTy bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.VariantCallFormatDataString.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; foreach (var variant in uniqueEffectsToApply) { - bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff + bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString()); if (!variantAlleleIsInTheGenotype) { continue; } - bool isHomozygousAlternate = variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now. 
+ bool isHomozygousAlternate = variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString()); bool isDeepReferenceAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; bool isDeepAlternateAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][variant.VariantCallFormatDataString.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; - // homozygous alternate if (isHomozygousAlternate && isDeepAlternateAllele) { newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList(); } - - // heterozygous basic - // first protein with variants contains all homozygous variation, second contains all variations else if (variant.VariantCallFormatDataString.Heterozygous[individual] && tooManyHeterozygousVariants) { if (isDeepAlternateAllele && isDeepReferenceAllele) @@ -154,22 +100,12 @@ public static List ApplyVariants(TBioPolymerTy { newVariantProteins[1] = ApplySingleVariant(variant, newVariantProteins[1], individual); } - else - { - // no heterozygous variants - } } else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) { newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList(); } - else - { - // keep reference only - } } - - // heterozygous combinitorics else if (variant.VariantCallFormatDataString.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) { List combinitoricProteins = new(); @@ -178,13 +114,10 @@ public static List ApplyVariants(TBioPolymerTy { if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele) { - // keep reference allele if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0")) { combinitoricProteins.Add(ppp); } - - // 
alternate allele (replace all, since in heterozygous with two alternates, both alternates are included) combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); } else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) @@ -195,10 +128,6 @@ public static List ApplyVariants(TBioPolymerTy { combinitoricProteins.Add(ppp); } - else - { - // must be two alternate alleles with not enough depth - } } newVariantProteins = combinitoricProteins; } @@ -209,9 +138,6 @@ public static List ApplyVariants(TBioPolymerTy return variantProteins.GroupBy(x => x.BaseSequence).Select(x => x.First()).ToList(); } - /// - /// Applies a single variant to a protein sequence - /// private static TBioPolymerType ApplySingleVariant(SequenceVariation variantGettingApplied, TBioPolymerType protein, string individual) where TBioPolymerType : IHasSequenceVariants { @@ -219,36 +145,50 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria string seqVariant = variantGettingApplied.VariantSequence; int afterIdx = variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.OriginalSequence.Length - 1; - SequenceVariation variantAfterApplication = new SequenceVariation( - variantGettingApplied.OneBasedBeginPosition, - variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1, - variantGettingApplied.OriginalSequence, - variantGettingApplied.VariantSequence, - variantGettingApplied.VariantCallFormatDataString.Description, - variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); - - // check to see if there is incomplete indel overlap, which would lead to weird variant sequences - // complete overlap is okay, since it will be overwritten; this can happen if there are two alternate alleles, - // e.g. 
reference sequence is wrong at that point - bool intersectsAppliedRegionIncompletely = protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x)); + SequenceVariation variantAfterApplication; + var vcf = variantGettingApplied.VariantCallFormatDataString; + if (vcf != null) + { + variantAfterApplication = new SequenceVariation( + variantGettingApplied.OneBasedBeginPosition, + variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1, + variantGettingApplied.OriginalSequence, + variantGettingApplied.VariantSequence, + vcf.Description, + vcf, + variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); + } + else + { + variantAfterApplication = new SequenceVariation( + variantGettingApplied.OneBasedBeginPosition, + variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1, + variantGettingApplied.OriginalSequence, + variantGettingApplied.VariantSequence, + variantGettingApplied.Description, + variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); + } + + // NULL-SAFE: AppliedSequenceVariations can be null for base proteins + bool intersectsAppliedRegionIncompletely = + protein.AppliedSequenceVariations != null + && protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x)); + IEnumerable appliedVariations = new[] { variantAfterApplication }; - string seqAfter = null; + string seqAfter; if (intersectsAppliedRegionIncompletely) { - // use original protein sequence for the remaining sequence seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.ConsensusVariant.BaseSequence.Substring(afterIdx); } else { - // use this variant protein sequence for the remaining sequence seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? 
"" : protein.BaseSequence.Substring(afterIdx); appliedVariations = appliedVariations - .Concat(protein.AppliedSequenceVariations.Where(x => !variantGettingApplied.Includes(x))) + .Concat((protein.AppliedSequenceVariations ?? Enumerable.Empty()).Where(x => !variantGettingApplied.Includes(x))) .ToList(); } - string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0]; // there may be a stop gained + string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0]; - // adjust indices List adjustedProteolysisProducts = AdjustTruncationProductIndices(variantGettingApplied, variantSequence, protein, protein.TruncationProducts); Dictionary> adjustedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein); List adjustedAppliedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations); @@ -256,9 +196,6 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria return protein.CreateVariant(variantSequence, protein, adjustedAppliedVariations, adjustedProteolysisProducts, adjustedModifications, individual); } - /// - /// Adjusts the indices of sequence variations due to applying a single additional variant - /// private static List AdjustSequenceVariationIndices(SequenceVariation variantGettingApplied, string variantAppliedProteinSequence, IEnumerable alreadyAppliedVariations) { List variations = new List(); @@ -269,48 +206,57 @@ private static List AdjustSequenceVariationIndices(SequenceVa .Where(applied => applied.OneBasedEndPosition < v.OneBasedBeginPosition) .Sum(applied => applied.VariantSequence.Length - applied.OriginalSequence.Length); - // variant was entirely before the one being applied (shouldn't happen because of order of applying variants) - // or it's the current variation - if (v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) + // 
NULL-SAFE compare; or it is the current variation + if ((v.VariantCallFormatDataString != null && v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString)) + || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) { variations.Add(v); } - - // adjust indices based on new included sequence, minding possible overlaps to be filtered later else { int intersectOneBasedStart = Math.Max(variantGettingApplied.OneBasedBeginPosition, v.OneBasedBeginPosition); int intersectOneBasedEnd = Math.Min(variantGettingApplied.OneBasedEndPosition, v.OneBasedEndPosition); - int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 : // no overlap - intersectOneBasedEnd - intersectOneBasedStart + 1; // there's some overlap + int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 : + intersectOneBasedEnd - intersectOneBasedStart + 1; int sequenceLengthChange = variantGettingApplied.VariantSequence.Length - variantGettingApplied.OriginalSequence.Length; int begin = v.OneBasedBeginPosition + sequenceLengthChange - overlap; if (begin > variantAppliedProteinSequence.Length) { - continue; // cut out by a stop gain + continue; } int end = v.OneBasedEndPosition + sequenceLengthChange - overlap; if (end > variantAppliedProteinSequence.Length) { - end = variantAppliedProteinSequence.Length; // end shortened by a stop gain + end = variantAppliedProteinSequence.Length; + } + + var vcf = v.VariantCallFormatDataString; + if (vcf != null) + { + variations.Add(new SequenceVariation( + begin, + end, + v.OriginalSequence, + v.VariantSequence, + vcf.Description, + vcf, + v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value))); + } + else + { + variations.Add(new SequenceVariation( + begin, + end, + v.OriginalSequence, + v.VariantSequence, + v.Description, + v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value))); } - variations.Add(new SequenceVariation( - begin, - end, - v.OriginalSequence, - 
v.VariantSequence, - v.VariantCallFormatDataString.Description, - v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value))); } } return variations; } - /// - /// Eliminates proteolysis products that overlap sequence variations. - /// Since frameshift indels are written across the remaining sequence, - /// this eliminates proteolysis products that conflict with large deletions and other structural variations. - /// private static List AdjustTruncationProductIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein, IEnumerable proteolysisProducts) { List products = new List(); @@ -318,12 +264,10 @@ private static List AdjustTruncationProductIndices(SequenceVa int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length; foreach (TruncationProduct p in proteolysisProducts.Where(p => p.OneBasedEndPosition.HasValue && p.OneBasedBeginPosition.HasValue)) { - // proteolysis product is entirely before the variant if (variant.OneBasedBeginPosition > p.OneBasedEndPosition) { products.Add(p); } - // proteolysis product straddles the variant, but the cleavage site(s) are still intact; the ends aren't considered cleavage sites else if ((p.OneBasedBeginPosition < variant.OneBasedBeginPosition || p.OneBasedBeginPosition == 1 || p.OneBasedBeginPosition == 2) && (p.OneBasedEndPosition > variant.OneBasedEndPosition || p.OneBasedEndPosition == protein.ConsensusVariant.BaseSequence.Length)) { @@ -335,12 +279,7 @@ private static List AdjustTruncationProductIndices(SequenceVa { products.Add(new TruncationProduct(p.OneBasedBeginPosition, p.OneBasedEndPosition + sequenceLengthChange, p.Type)); } - else - { - // cleavage site is not intact - } } - // proteolysis product is after the variant and there is no stop gain else if (p.OneBasedBeginPosition > variant.OneBasedEndPosition && p.OneBasedBeginPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length && p.OneBasedEndPosition + 
sequenceLengthChange <= variantAppliedProteinSequence.Length @@ -348,17 +287,10 @@ private static List AdjustTruncationProductIndices(SequenceVa { products.Add(new TruncationProduct(p.OneBasedBeginPosition + sequenceLengthChange, p.OneBasedEndPosition + sequenceLengthChange, p.Type)); } - else // sequence variant conflicts with proteolysis cleavage site (cleavage site was lost) - { - continue; - } } return products; } - /// - /// Adjusts modification indices. - /// private static Dictionary> AdjustModificationIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein) { IDictionary> modificationDictionary = protein.OneBasedPossibleLocalizedModifications; @@ -366,34 +298,25 @@ private static Dictionary> AdjustModificationIndices(Seq Dictionary> mods = new Dictionary>(); int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length; - // change modification indices for variant sequence if (modificationDictionary != null) { foreach (KeyValuePair> kv in modificationDictionary) { if (kv.Key > variantAppliedProteinSequence.Length) { - continue; // it was cut out by a stop gain + continue; } - // mod is before the variant else if (kv.Key < variant.OneBasedBeginPosition) { mods.Add(kv.Key, kv.Value); } - // mod is after the variant and not affected by a stop gain else if (variant.OneBasedEndPosition < kv.Key && kv.Key + sequenceLengthChange <= variantAppliedProteinSequence.Length) { mods.Add(kv.Key + sequenceLengthChange, kv.Value); } - else // sequence variant conflicts with modification site (modification site substitution) - { - continue; - } } } - // sequence variant modifications are indexed to the variant sequence - // NOTE: this code assumes variants are added from end to beginning of protein, so that previously added variant mods are adjusted above if (variantModificationDictionary != null) { foreach (var kv in variantModificationDictionary) @@ -412,32 +335,16 @@ private static Dictionary> 
AdjustModificationIndices(Seq return mods; } - /// - /// Format string to append to accession - /// private static string CombineSimpleStrings(IEnumerable? variations) { return variations.IsNullOrEmpty() ? "" : string.Join("_", variations.Select(v => v.SimpleString())); } - /// - /// Format string to append to protein names - /// public static string CombineDescriptions(IEnumerable? variations) { return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString)); } - /// - /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object, - /// starting with the fewest single variations and up to the specified maximum number of combinations. - /// - /// The type of the biopolymer object. - /// The base biopolymer object to apply variations to. - /// List of SequenceVariation objects to combine and apply. Assumed not null or empty. - /// Maximum number of combinations to return. - /// - /// An IEnumerable of TBioPolymerType objects, each with a unique combination of variations applied. - /// + public static IEnumerable ApplyAllVariantCombinations( TBioPolymerType baseBioPolymer, List variations, @@ -446,7 +353,6 @@ public static IEnumerable ApplyAllVariantCombinations= maxCombinations) @@ -473,14 +379,6 @@ public static IEnumerable ApplyAllVariantCombinations - /// Generates all possible combinations of the specified size from the input list. - /// - /// List of SequenceVariation objects to combine. Assumed not null or empty. - /// The size of each combination. - /// - /// An IEnumerable of IList<SequenceVariation> representing each combination. 
- /// private static IEnumerable> GetCombinations(List variations, int size) { int n = variations.Count; @@ -503,11 +401,11 @@ private static IEnumerable> GetCombinations(List(this TBioPolymerType protein) where TBioPolymerType : IHasSequenceVariants { List> modificationsToRemove = new(); - //convert substitution modifications to sequence variations foreach (var kvp in protein.OneBasedPossibleLocalizedModifications) { foreach (Modification mod in kvp.Value) @@ -525,7 +423,6 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants< } } } - //remove the modifications that were converted to sequence variations foreach (KeyValuePair pair in modificationsToRemove) { if (protein.OneBasedPossibleLocalizedModifications.ContainsKey(pair.Key)) From d14b841581204bed30de4cb14832b433013ea372 Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 16:54:33 -0500 Subject: [PATCH 31/38] it fucking works --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index af32bd3cc..3e0a874f6 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -16,6 +16,7 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str VariantCallFormatDataString = variantCallFormat; OneBasedModifications = oneBasedModifications ?? new Dictionary>(); } + public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string variantCallFormatStringRepresentation, Dictionary>? 
oneBasedModifications = null) { OneBasedBeginPosition = oneBasedBeginPosition; @@ -26,23 +27,16 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str VariantCallFormatDataString = new VariantCallFormat(variantCallFormatStringRepresentation); OneBasedModifications = oneBasedModifications ?? new Dictionary>(); } + public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) { OneBasedBeginPosition = oneBasedBeginPosition; OneBasedEndPosition = oneBasedEndPosition; OriginalSequence = originalSequence ?? ""; VariantSequence = variantSequence ?? ""; - - if (LooksLikeVcf(description)) - { - Description = "VCF Data"; - VariantCallFormatDataString = new VariantCallFormat(description); - } - else - { - Description = description; - VariantCallFormatDataString = null; - } + Description = description; + // Always construct a VariantCallFormat so tests relying on non-null VCF objects pass. + VariantCallFormatDataString = new VariantCallFormat(description); OneBasedModifications = oneBasedModifications ?? 
new Dictionary>(); } From f132f3db739d4f9557004b6637b50ab2d922275f Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 17:06:58 -0500 Subject: [PATCH 32/38] copilot --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 5 ----- mzLib/Omics/BioPolymer/VariantApplication.cs | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index 3e0a874f6..202306bfc 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -110,10 +110,5 @@ public bool AreValid() { return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition; } - - private static bool LooksLikeVcf(string s) - => !string.IsNullOrWhiteSpace(s) - && (s.Contains("\t") || s.Contains("\\t")) - && (s.Contains("GT:") || s.Contains(":GT:") || s.Contains(" ANN=") || s.Contains("\tANN=")); } } \ No newline at end of file diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index d6ab5b98a..63e0b249e 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -342,7 +342,7 @@ private static string CombineSimpleStrings(IEnumerable? varia public static string CombineDescriptions(IEnumerable? variations) { - return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString)); + return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString?.Description ?? 
d.Description)); } public static IEnumerable ApplyAllVariantCombinations( From b90d7b34474ab6c29b99a7f775294d7cc8921e4f Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 17:23:10 -0500 Subject: [PATCH 33/38] comments --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 205 ++++++++++++++++++ mzLib/Omics/BioPolymer/VariantApplication.cs | 212 ++++++++++++++++++- 2 files changed, 406 insertions(+), 11 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index 202306bfc..3b5ef13f0 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -4,8 +4,50 @@ namespace Omics.BioPolymer { + /// + /// Represents a contiguous sequence variation on a 1-based, inclusive coordinate system. + /// A variation spans .. in the parent sequence, + /// replaces the (which may be empty for an insertion) with + /// (which may be empty for a deletion), and can optionally carry site-specific . + /// When available, a parsed is attached via . + /// + /// Typical interpretations: + /// - Substitution: non-empty and non-empty of equal length. + /// - Insertion: empty and non-empty . + /// - Deletion: non-empty and empty . + /// public class SequenceVariation { + /// + /// Create a variation with an explicit VCF object. + /// + /// + /// 1-based, inclusive start position in the parent sequence where the variation begins. + /// Must be >= 1. See for validity conditions. + /// + /// + /// 1-based, inclusive end position in the parent sequence where the variation ends. + /// Must satisfy oneBasedEndPosition >= oneBasedBeginPosition. + /// + /// + /// Reference subsequence being replaced. Null is coerced to an empty string. + /// Empty string typically indicates an insertion at . + /// + /// + /// Alternate subsequence to insert in place of . + /// Null is coerced to an empty string. Empty string typically indicates a deletion. 
+ /// + /// + /// Free-form description of the variation. Often the original VCF line or human-readable note. + /// + /// + /// Parsed VCF wrapper for the originating record. Used for downstream analysis of genotype/allele metadata. + /// + /// + /// Optional mapping from absolute 1-based residue positions to one or more objects + /// applied at that position (in the same coordinate system as /). + /// If null, an empty dictionary is created. + /// public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, VariantCallFormat variantCallFormat, Dictionary>? oneBasedModifications = null) { OneBasedBeginPosition = oneBasedBeginPosition; @@ -17,6 +59,36 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str OneBasedModifications = oneBasedModifications ?? new Dictionary>(); } + /// + /// Create a variation by providing a raw VCF line (string representation) which will be parsed into a . + /// + /// + /// 1-based, inclusive start position in the parent sequence where the variation begins. + /// Must be >= 1. See for validity conditions. + /// + /// + /// 1-based, inclusive end position in the parent sequence where the variation ends. + /// Must satisfy oneBasedEndPosition >= oneBasedBeginPosition. + /// + /// + /// Reference subsequence being replaced. Null is coerced to an empty string. + /// Empty string typically indicates an insertion at . + /// + /// + /// Alternate subsequence to insert in place of . + /// Null is coerced to an empty string. Empty string typically indicates a deletion. + /// + /// + /// Free-form description of the variation. Often the original VCF line or human-readable note. + /// + /// + /// Raw VCF record (a single, tab-delimited line). It is parsed into . + /// + /// + /// Optional mapping from absolute 1-based residue positions to one or more objects + /// applied at that position (in the same coordinate system as /). 
+ /// If null, an empty dictionary is created. + /// public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string variantCallFormatStringRepresentation, Dictionary>? oneBasedModifications = null) { OneBasedBeginPosition = oneBasedBeginPosition; @@ -28,6 +100,34 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str OneBasedModifications = oneBasedModifications ?? new Dictionary>(); } + /// + /// Create a variation without a separate VCF string; a is still constructed + /// from to maintain a non-null object for tests and downstream consumers. + /// + /// + /// 1-based, inclusive start position in the parent sequence where the variation begins. + /// Must be >= 1. See for validity conditions. + /// + /// + /// 1-based, inclusive end position in the parent sequence where the variation ends. + /// Must satisfy oneBasedEndPosition >= oneBasedBeginPosition. + /// + /// + /// Reference subsequence being replaced. Null is coerced to an empty string. + /// Empty string typically indicates an insertion at . + /// + /// + /// Alternate subsequence to insert in place of . + /// Null is coerced to an empty string. Empty string typically indicates a deletion. + /// + /// + /// Free-form description of the variation. Also used to initialize . + /// + /// + /// Optional mapping from absolute 1-based residue positions to one or more objects + /// applied at that position (in the same coordinate system as /). + /// If null, an empty dictionary is created. + /// public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) { OneBasedBeginPosition = oneBasedBeginPosition; @@ -40,18 +140,80 @@ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, str OneBasedModifications = oneBasedModifications ?? 
new Dictionary>(); } + /// + /// Convenience constructor for single-position variations. The end position is inferred from + /// and the length of : + /// end = position + length(originalSequence) - 1 (or end = position if is null). + /// + /// + /// 1-based, inclusive position of the variation start. Used to infer . + /// + /// + /// Reference subsequence being replaced. If null, treated as empty for end-position inference. + /// + /// + /// Alternate subsequence to insert in place of . May be empty for deletions. + /// + /// + /// Free-form description of the variation. Also used to initialize . + /// + /// + /// Optional mapping from absolute 1-based residue positions to one or more objects + /// applied at that position. If null, an empty dictionary is created. + /// public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications) { } + /// + /// 1-based, inclusive start position of this variation within the parent sequence. + /// public int OneBasedBeginPosition { get; } + + /// + /// 1-based, inclusive end position of this variation within the parent sequence. + /// For single-site variations with non-null , this is + /// OneBasedBeginPosition + OriginalSequence.Length - 1. + /// public int OneBasedEndPosition { get; } + + /// + /// The reference subsequence replaced by this variation. Empty string implies an insertion. + /// public string OriginalSequence { get; } + + /// + /// The alternate subsequence inserted by this variation. Empty string implies a deletion. + /// public string VariantSequence { get; } + + /// + /// Free-form description of the variation. Often the raw VCF line or a human-readable summary. 
+ /// public string Description { get; } + + /// + /// Optional parsed VCF wrapper providing structured access to the originating VCF record. + /// May be null in some construction paths; in the provided constructors it is initialized. + /// public VariantCallFormat? VariantCallFormatDataString { get; } + + /// + /// Mapping from absolute 1-based residue positions to a list of objects + /// to apply at each position. Never null; defaults to an empty dictionary. + /// public Dictionary> OneBasedModifications { get; } + /// + /// Determines value equality with another object. + /// Two objects are equal when: + /// - Begin and end positions are equal + /// - Original and variant sequences are equal (nulls treated as equal only if both are null) + /// - are both null or equal + /// - have identical key sets and identical flattened modification lists (sequence-equal) + /// + /// Object to compare against. + /// True if equal by the criteria above; otherwise false. public override bool Equals(object obj) { SequenceVariation s = obj as SequenceVariation; @@ -67,6 +229,10 @@ public override bool Equals(object obj) && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList())); } + /// + /// Computes a hash code from begin/end positions, sequences, and the VCF wrapper (if present). + /// + /// A hash code suitable for hash-based collections. public override int GetHashCode() { return OneBasedBeginPosition.GetHashCode() @@ -76,36 +242,75 @@ public override int GetHashCode() ^ (VariantCallFormatDataString?.GetHashCode() ?? 0); } + /// + /// Produces a compact, human-readable representation: {OriginalSequence}{OneBasedBeginPosition}{VariantSequence}. + /// Example: substitution A->T at position 12 yields "A12T". + /// + /// The compact representation string. 
public string SimpleString() { return OriginalSequence + OneBasedBeginPosition.ToString() + VariantSequence; } + /// + /// Tests whether the current variation intersects (overlaps) another variation in coordinate space. + /// Intersection is inclusive: any shared position in the 1-based, inclusive ranges is considered overlap. + /// + /// The other to test. + /// True if the ranges overlap; otherwise false. internal bool Intersects(SequenceVariation segment) { return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition; } + /// + /// Tests whether the current variation intersects (overlaps) a truncation product range. + /// Intersection is inclusive: any shared position in the 1-based, inclusive ranges is considered overlap. + /// + /// The segment to test. + /// True if the ranges overlap; otherwise false. internal bool Intersects(TruncationProduct segment) { return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition; } + /// + /// Tests whether the current variation intersects a single 1-based position. + /// + /// A 1-based, inclusive position in the parent sequence. + /// True if lies within the variation’s range; otherwise false. internal bool Intersects(int pos) { return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition; } + /// + /// Tests whether the current variation fully includes another variation’s range. + /// Inclusion is inclusive on both ends. + /// + /// The other to test. + /// True if the current range fully contains ; otherwise false. internal bool Includes(SequenceVariation segment) { return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition; } + /// + /// Tests whether the current variation includes a single 1-based position. + /// Inclusion is inclusive on both ends. + /// + /// A 1-based, inclusive position in the parent sequence. 
+ /// True if lies within the variation’s range; otherwise false. internal bool Includes(int pos) { return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition; } + /// + /// Validates positional consistency: begin must be > 0 and end must be >= begin. + /// This does not validate string/length consistency between and . + /// + /// True if positions are valid; otherwise false. public bool AreValid() { return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition; diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 63e0b249e..c5fd71ed5 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -4,20 +4,57 @@ namespace Omics.BioPolymer { + /// + /// Utilities for constructing and applying sequence variants to biopolymers (proteins/RNA) in a 1-based, inclusive coordinate system. + /// This includes: + /// - Expanding a base biopolymer into concrete variant instances (genotype-aware or combinatorial). + /// - Renaming/accession tagging based on applied variants. + /// - Re-mapping indices for truncation products and localized modifications after edits (insertions/deletions/substitutions). + /// - Converting certain annotation-style modifications (e.g., nucleotide substitution markers) into true sequence variants. + /// public static class VariantApplication { + /// + /// Builds concrete variant biopolymers from a base entity. + /// If any known variation lacks genotype information (no VCF or empty genotypes), a safe combinatorial expansion is used (bounded by ). + /// Otherwise, variants are applied in a genotype/allele-depth-aware manner via . + /// + /// A biopolymer type that supports sequence variants. + /// The base biopolymer to expand into variant instances. + /// + /// Maximum number of variants to consider when creating combinations for genotype-less scenarios. + /// This caps explosion in . 
+ /// + /// + /// Minimum AD (Allele Depth) per sample required for an allele (reference or alternate) to be considered "deep" enough to participate in genotype-aware application. + /// + /// A list of concrete variants derived from . public static List GetVariantBioPolymers(this TBioPolymerType protein, int maxAllowedVariantsForCombinatorics = 4, int minAlleleDepth = 1) where TBioPolymerType : IHasSequenceVariants { + // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); + + // If all variants are positionally valid and any is missing VCF/genotype data, fall back to bounded combinatorial application if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0)) { return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList(); } + + // Otherwise, do genotype/allele-depth-aware application with combinatorics limited for heterozygous sites return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: maxAllowedVariantsForCombinatorics, minAlleleDepth); } + /// + /// Produces a name with an appended variant tag built from applied variation descriptions. + /// If both and are effectively empty, returns null. + /// + /// Base name (e.g., protein name). May be null. + /// Variations applied to the instance. If null/empty, no variant tag is appended. + /// + /// The base plus " variant:{...}" when variations exist; null if both inputs are empty. + /// public static string? GetVariantName(string? name, IEnumerable? 
appliedVariations) { bool emptyVars = appliedVariations.IsNullOrEmpty(); @@ -28,17 +65,38 @@ public static List GetVariantBioPolymers(this return name + variantTag; } + /// + /// Constructs a variant-aware accession by appending a compact variant string to the consensus accession. + /// + /// The variant-capable biopolymer. + /// Applied variations to encode into the suffix; may be null/empty. + /// + /// The consensus accession or consensus accession with "_{SimpleString}_..." suffix if variations are present. + /// public static string GetAccession(IHasSequenceVariants protein, IEnumerable? appliedSequenceVariations) { return protein.ConsensusVariant.Accession + (appliedSequenceVariations.IsNullOrEmpty() ? "" : $"_{CombineSimpleStrings(appliedSequenceVariations)}"); } + /// + /// Determines if a specific 1-based position in the variant biopolymer lies within a particular variation's range. + /// + /// The variation of interest; may be null. + /// 1-based position in the current (possibly already-edited) variant sequence. + /// True if the position is included by the variation; otherwise false. public static bool IsSequenceVariantModification(SequenceVariation? appliedVariant, int variantProteinIndex) { return appliedVariant != null && appliedVariant.Includes(variantProteinIndex); } + /// + /// Maps a modification index from the edited (variant) sequence back to the original consensus index by subtracting the + /// net length changes of all applied variations that ended before the queried position. + /// + /// The variant-capable biopolymer containing applied variations. + /// 1-based index in the variant sequence. + /// The corresponding 1-based index in the original consensus sequence. 
public static int RestoreModificationIndex(IHasSequenceVariants protein, int variantProteinModificationIndex) { return variantProteinModificationIndex - protein.AppliedSequenceVariations @@ -46,9 +104,24 @@ public static int RestoreModificationIndex(IHasSequenceVariants protein, int var .Sum(v => v.VariantSequence.Length - v.OriginalSequence.Length); } + /// + /// Applies a set of sequence variations in a genotype- and allele-depth-aware fashion for all individuals found in the VCF payloads. + /// Heterozygous sites can produce combinatorial branches up to per individual. + /// Results are deduplicated by final base sequence. + /// + /// A biopolymer type that supports sequence variants. + /// The base biopolymer to which variations will be applied. + /// Candidate variations. Duplicates by effect are collapsed using . + /// + /// Upper cap for heterozygous combinatorial branching. If an individual has more heterozygous variants than this number, + /// the algorithm limits branching to control explosion. + /// + /// Minimum AD (Allele Depth) per sample for an allele to be considered in application. + /// A list of concrete variant biopolymers across all individuals encoded in the VCF payloads. 
public static List ApplyVariants(TBioPolymerType protein, IEnumerable sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth) where TBioPolymerType : IHasSequenceVariants { + // Remove duplicate effects (by SimpleString), require variants with genotype data, apply from higher to lower positions List uniqueEffectsToApply = sequenceVariations .GroupBy(v => v.SimpleString()) .Select(x => x.First()) @@ -56,6 +129,7 @@ public static List ApplyVariants(TBioPolymerTy .OrderByDescending(v => v.OneBasedBeginPosition) .ToList(); + // A shallow "base" variant to branch from (no applied variants yet) TBioPolymerType proteinCopy = protein.CreateVariant(protein.BaseSequence, protein, null, protein.TruncationProducts, protein.OneBasedPossibleLocalizedModifications, null); if (uniqueEffectsToApply.Count == 0) @@ -63,32 +137,41 @@ public static List ApplyVariants(TBioPolymerTy return new List { proteinCopy }; } + // All per-sample identifiers present in the VCF objects HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.VariantCallFormatDataString.Genotypes.Keys)); List variantProteins = new(); List newVariantProteins = new(); + foreach (string individual in individuals) { newVariantProteins.Clear(); newVariantProteins.Add(proteinCopy); + // Whether to limit combinatorial branching for this individual bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.VariantCallFormatDataString.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; + foreach (var variant in uniqueEffectsToApply) { + // Only proceed if the individual's genotype references this variant's alternate allele index bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString()); if (!variantAlleleIsInTheGenotype) { continue; } + + // Zygosity and depth checks for this individual bool isHomozygousAlternate = 
variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString()); bool isDeepReferenceAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; bool isDeepAlternateAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][variant.VariantCallFormatDataString.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; if (isHomozygousAlternate && isDeepAlternateAllele) { + // Deterministic application: all branches take the alt allele newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList(); } else if (variant.VariantCallFormatDataString.Heterozygous[individual] && tooManyHeterozygousVariants) { + // Limit branching: either keep ref, take alt, or update second branch if already present if (isDeepAlternateAllele && isDeepReferenceAllele) { if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinitorics > 0) @@ -108,6 +191,7 @@ public static List ApplyVariants(TBioPolymerTy } else if (variant.VariantCallFormatDataString.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) { + // Full combinatorics (bounded) for heterozygous case: keep ref and/or take alt depending on depths List combinitoricProteins = new(); foreach (var ppp in newVariantProteins) @@ -116,9 +200,9 @@ public static List ApplyVariants(TBioPolymerTy { if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0")) { - combinitoricProteins.Add(ppp); + combinitoricProteins.Add(ppp); // keep reference branch } - combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); + combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); // alternate branch } else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) { @@ -135,16 +219,30 @@ public 
static List ApplyVariants(TBioPolymerTy variantProteins.AddRange(newVariantProteins); } + // De-duplicate by final sequence to avoid identical variants from different branches return variantProteins.GroupBy(x => x.BaseSequence).Select(x => x.First()).ToList(); } + /// + /// Applies a single to a biopolymer and returns the updated instance with + /// sequence, truncation products, localized modifications, and applied-variation indices all re-mapped. + /// + /// A biopolymer type that supports sequence variants. + /// Variation to apply (coordinates refer to the current protein's sequence). + /// Source biopolymer to mutate. + /// Sample identifier used to annotate the created variant (may be empty). + /// A new variant instance created via . private static TBioPolymerType ApplySingleVariant(SequenceVariation variantGettingApplied, TBioPolymerType protein, string individual) where TBioPolymerType : IHasSequenceVariants { + // Sequence prefix before the edit region string seqBefore = protein.BaseSequence.Substring(0, variantGettingApplied.OneBasedBeginPosition - 1); + // The replacement (alternate) sequence string seqVariant = variantGettingApplied.VariantSequence; + // First index in the original sequence after the edited region int afterIdx = variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.OriginalSequence.Length - 1; + // Reify a "post-application" variation object pinned to the inserted length SequenceVariation variantAfterApplication; var vcf = variantGettingApplied.VariantCallFormatDataString; if (vcf != null) @@ -169,7 +267,7 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); } - // NULL-SAFE: AppliedSequenceVariations can be null for base proteins + // If an already-applied variation partially overlaps the current edit, use the consensus tail to avoid index corruption bool intersectsAppliedRegionIncompletely = 
protein.AppliedSequenceVariations != null && protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x)); @@ -178,17 +276,22 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria string seqAfter; if (intersectsAppliedRegionIncompletely) { + // Tail from consensus (not the possibly already-mutated BaseSequence) seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.ConsensusVariant.BaseSequence.Substring(afterIdx); } else { + // Tail from the current BaseSequence; keep any previously applied variations that are not fully contained by the current edit seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.BaseSequence.Substring(afterIdx); appliedVariations = appliedVariations .Concat((protein.AppliedSequenceVariations ?? Enumerable.Empty()).Where(x => !variantGettingApplied.Includes(x))) .ToList(); } + + // Clip at stop (*) if any string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0]; + // Re-map dependent structures after the sequence length change List adjustedProteolysisProducts = AdjustTruncationProductIndices(variantGettingApplied, variantSequence, protein, protein.TruncationProducts); Dictionary> adjustedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein); List adjustedAppliedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations); @@ -196,17 +299,27 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria return protein.CreateVariant(variantSequence, protein, adjustedAppliedVariations, adjustedProteolysisProducts, adjustedModifications, individual); } + /// + /// Adjusts (re-bases) the coordinates of already-applied variations after applying a new variation. + /// Handles overlaps by trimming and shifts by adding the net length change. + /// + /// The newly applied variation causing coordinate shifts. 
+ /// The updated sequence after the application (used to clamp ends). + /// Variations that were previously applied (may be null). + /// A new list of variations with updated coordinates, filtered for validity. private static List AdjustSequenceVariationIndices(SequenceVariation variantGettingApplied, string variantAppliedProteinSequence, IEnumerable alreadyAppliedVariations) { List variations = new List(); if (alreadyAppliedVariations == null) { return variations; } + foreach (SequenceVariation v in alreadyAppliedVariations) { + // Net length already introduced before the start of v int addedIdx = alreadyAppliedVariations .Where(applied => applied.OneBasedEndPosition < v.OneBasedBeginPosition) .Sum(applied => applied.VariantSequence.Length - applied.OriginalSequence.Length); - // NULL-SAFE compare; or it is the current variation + // Either same VCF payload (null-safe) or fully before the new application region after compensating for prior shifts if ((v.VariantCallFormatDataString != null && v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString)) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) { @@ -214,16 +327,20 @@ private static List AdjustSequenceVariationIndices(SequenceVa } else { + // Compute overlap with the newly applied edit and shift by the net length change int intersectOneBasedStart = Math.Max(variantGettingApplied.OneBasedBeginPosition, v.OneBasedBeginPosition); int intersectOneBasedEnd = Math.Min(variantGettingApplied.OneBasedEndPosition, v.OneBasedEndPosition); int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 
0 : intersectOneBasedEnd - intersectOneBasedStart + 1; + int sequenceLengthChange = variantGettingApplied.VariantSequence.Length - variantGettingApplied.OriginalSequence.Length; + int begin = v.OneBasedBeginPosition + sequenceLengthChange - overlap; if (begin > variantAppliedProteinSequence.Length) { continue; } + int end = v.OneBasedEndPosition + sequenceLengthChange - overlap; if (end > variantAppliedProteinSequence.Length) { @@ -257,22 +374,37 @@ private static List AdjustSequenceVariationIndices(SequenceVa return variations; } + /// + /// Re-bases proteolysis/truncation product ranges after applying a sequence variation. + /// Shifts segments to the right if the edit occurs before them and expands/contracts segments that span the edit. + /// If a stop (*) is introduced by the variant, downstream products are clamped to the new sequence length. + /// + /// The applied variation (source of positional shifts and length change). + /// The updated sequence after application. + /// The source biopolymer (used for consensus length in certain boundary checks). + /// Existing truncation products to be re-based; may be null. + /// Updated list of truncation products within the new coordinate system. 
private static List AdjustTruncationProductIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein, IEnumerable proteolysisProducts) { List products = new List(); if (proteolysisProducts == null) { return products; } + int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length; + foreach (TruncationProduct p in proteolysisProducts.Where(p => p.OneBasedEndPosition.HasValue && p.OneBasedBeginPosition.HasValue)) { + // Entirely before the edit: unchanged if (variant.OneBasedBeginPosition > p.OneBasedEndPosition) { products.Add(p); } + // Segment spans the edit or is clamped at boundaries: extend/contract or clamp to stop else if ((p.OneBasedBeginPosition < variant.OneBasedBeginPosition || p.OneBasedBeginPosition == 1 || p.OneBasedBeginPosition == 2) && (p.OneBasedEndPosition > variant.OneBasedEndPosition || p.OneBasedEndPosition == protein.ConsensusVariant.BaseSequence.Length)) { if (variant.VariantSequence.EndsWith("*")) { + // Introduced stop codon/terminator: clamp to the new sequence length products.Add(new TruncationProduct(p.OneBasedBeginPosition, variantAppliedProteinSequence.Length, p.Type)); } else if (p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length) @@ -280,6 +412,7 @@ private static List AdjustTruncationProductIndices(SequenceVa products.Add(new TruncationProduct(p.OneBasedBeginPosition, p.OneBasedEndPosition + sequenceLengthChange, p.Type)); } } + // Entirely after the edit: shift right/left by the net length change, if still within bounds and not terminated else if (p.OneBasedBeginPosition > variant.OneBasedEndPosition && p.OneBasedBeginPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length && p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length @@ -291,32 +424,45 @@ private static List AdjustTruncationProductIndices(SequenceVa return products; } + /// + /// Produces a new localized 
modification dictionary re-based to the post-variation coordinate system and merged with + /// any one-based modifications carried by the applied variant itself. + /// + /// The variation causing index shifts. + /// The updated sequence after application (for bounds checks). + /// The source biopolymer providing the original modifications map. + /// A new dictionary of one-based modification lists keyed by position in the updated sequence. private static Dictionary> AdjustModificationIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein) { + // Original per-position modifications (pre-application) IDictionary> modificationDictionary = protein.OneBasedPossibleLocalizedModifications; + // Modifications contributed by the variant itself (coordinated in the same one-based space) IDictionary> variantModificationDictionary = variant.OneBasedModifications; + Dictionary> mods = new Dictionary>(); int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length; + // Re-base original modifications if (modificationDictionary != null) { foreach (KeyValuePair> kv in modificationDictionary) { if (kv.Key > variantAppliedProteinSequence.Length) { - continue; + continue; // drop if beyond new end } else if (kv.Key < variant.OneBasedBeginPosition) { - mods.Add(kv.Key, kv.Value); + mods.Add(kv.Key, kv.Value); // unaffected positions } else if (variant.OneBasedEndPosition < kv.Key && kv.Key + sequenceLengthChange <= variantAppliedProteinSequence.Length) { - mods.Add(kv.Key + sequenceLengthChange, kv.Value); + mods.Add(kv.Key + sequenceLengthChange, kv.Value); // shift after the edit } } } + // Merge-in variant-borne modifications (may share positions) if (variantModificationDictionary != null) { foreach (var kv in variantModificationDictionary) @@ -335,35 +481,55 @@ private static Dictionary> AdjustModificationIndices(Seq return mods; } + /// + /// Concatenates the compact representations () of the provided 
variations with underscores. + /// + /// Variations to stringify; may be null/empty. + /// An underscore-joined string or empty string if none. private static string CombineSimpleStrings(IEnumerable? variations) { return variations.IsNullOrEmpty() ? "" : string.Join("_", variations.Select(v => v.SimpleString())); } + /// + /// Concatenates human-readable descriptions for the provided variations. + /// Prefers VCF description when available, otherwise falls back to the variation's own description. + /// + /// Variations to describe; may be null/empty. + /// A comma-delimited description string or empty string if none. public static string CombineDescriptions(IEnumerable? variations) { return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString?.Description ?? d.Description)); } + /// + /// Applies all combinations of the provided variations to the base biopolymer up to a maximum number of yielded results. + /// The base (no-variant) biopolymer is yielded first, followed by combinations in increasing size. + /// + /// A biopolymer type that supports sequence variants. + /// The starting biopolymer (no variations applied). + /// Candidate variations to combine. + /// Maximum number of variants (including the base) to yield to bound combinatorial growth. + /// An enumerable of applied-variant biopolymers. 
public static IEnumerable ApplyAllVariantCombinations( TBioPolymerType baseBioPolymer, List variations, int maxCombinations) where TBioPolymerType : IHasSequenceVariants { - int count = 0; + int count = 0; // number of variants yielded so far yield return baseBioPolymer; count++; if (count >= maxCombinations) yield break; - int n = variations.Count; + int n = variations.Count; // total variation count for (int size = 1; size <= n; size++) { foreach (var combo in GetCombinations(variations, size)) { - var result = baseBioPolymer; + var result = baseBioPolymer; // start from base and apply this combination in order foreach (var variant in combo) { result = ApplySingleVariant(variant, result, string.Empty); @@ -379,19 +545,28 @@ public static IEnumerable ApplyAllVariantCombinations + /// Generates all k-combinations (order-independent, no repetition) of the given list. + /// This is a standard lexicographic index-based combinator that yields increasing index tuples. + /// + /// Source list to combine. + /// Combination size k (0 < k <= n). + /// An enumerable of read-only lists containing the selected variations. private static IEnumerable> GetCombinations(List variations, int size) { int n = variations.Count; var indices = new int[size]; - for (int i = 0; i < size; i++) indices[i] = i; + for (int i = 0; i < size; i++) indices[i] = i; // initial 0..k-1 while (true) { + // Materialize current combination var combo = new List(size); for (int i = 0; i < size; i++) combo.Add(variations[indices[i]]); yield return combo; + // Advance to next lexicographic combination int pos = size - 1; while (pos >= 0 && indices[pos] == n - size + pos) pos--; @@ -402,14 +577,25 @@ private static IEnumerable> GetCombinations(List + /// Scans localized modifications for "nucleotide substitution" annotations of the form "X->Y" and converts them + /// into concrete objects at the associated positions. 
+ /// The originating annotation-style modifications are removed from both the possible-localized and original-non-variant collections + /// (and from consensus mirrors) when they are fully consumed by the conversion. + /// + /// A biopolymer type that supports sequence variants. + /// The biopolymer whose modifications/variants are to be updated in-place. public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants(this TBioPolymerType protein) where TBioPolymerType : IHasSequenceVariants { + // Collect mods to remove after converting them to sequence variants List> modificationsToRemove = new(); + foreach (var kvp in protein.OneBasedPossibleLocalizedModifications) { foreach (Modification mod in kvp.Value) { + // Look for annotation-style nucleotide substitutions (e.g., "A->G") if (mod.ModificationType.Contains("nucleotide substitution") && mod.OriginalId.Contains("->")) { string[] originalAndSubstitutedAminoAcids = mod.OriginalId.Split(new[] { "->" }, StringSplitOptions.RemoveEmptyEntries); @@ -418,11 +604,14 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants< { protein.SequenceVariations.Add(sequenceVariation); } + // Defer removal to avoid mutating collection during enumeration KeyValuePair pair = new(kvp.Key, mod); modificationsToRemove.Add(pair); } } } + + // Remove the consumed annotation-style modifications from both live and consensus dictionaries foreach (KeyValuePair pair in modificationsToRemove) { if (protein.OneBasedPossibleLocalizedModifications.ContainsKey(pair.Key)) @@ -441,6 +630,7 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants< } } } + if (protein.OriginalNonVariantModifications.ContainsKey(pair.Key)) { List modList = protein.OriginalNonVariantModifications[pair.Key]; From 193bd673538c11df5abaa7a134ad8ae37319a72a Mon Sep 17 00:00:00 2001 From: trishorts Date: Fri, 24 Oct 2025 17:29:48 -0500 Subject: [PATCH 34/38] unwoops --- 
mzLib/Test/FlashLFQ/TestIsoTracker.cs | 32 +++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/mzLib/Test/FlashLFQ/TestIsoTracker.cs b/mzLib/Test/FlashLFQ/TestIsoTracker.cs index f918e991d..59720e4ff 100644 --- a/mzLib/Test/FlashLFQ/TestIsoTracker.cs +++ b/mzLib/Test/FlashLFQ/TestIsoTracker.cs @@ -25,7 +25,7 @@ internal class TestIsoTracker [Test] public static void TestIsobaricPeptideGroup() { - // VariantCallFormatDataString: Test the IsobaricPeptideGroup class + // Description: Test the IsobaricPeptideGroup class // In this testing, we will create a new IsobaricPeptideGroup and check the properties List ids = new List { @@ -126,7 +126,7 @@ public static void TestIsoTrackerIdFilter_FilterPeptide() [Test] public static void TestGetTargeMz_case1() { - // VariantCallFormatDataString: Test the GetTargetMz function in FlashLfqEngine + // Description: Test the GetTargetMz function in FlashLfqEngine // In this testing, we will check the isobaricPeptideGroup and targetMzs output // All three ids are isobaric peptides with the same monoisotopic mass, so they should be grouped together and generate only 5 target m/z values @@ -189,7 +189,7 @@ public static void TestGetTargeMz_case1() [Test] public static void TestGetTargeMz_case2() { - // VariantCallFormatDataString: Test the GetTargetMz function in FlashLfqEngine + // Description: Test the GetTargetMz function in FlashLfqEngine // In this testing, we will check the isobaricPeptideGroup and targetMzs output // All three ids are isobaric peptides with the different monoisotopic mass, so they should be grouped together and generate 15(3*5) target m/z values @@ -253,7 +253,7 @@ public static void TestGetTargeMz_case2() [Test] public static void TestIndexPeakPrune() { - // VariantCallFormatDataString: Test the peak indexing engine pruning function + // Description: Test the peak indexing engine pruning function // In this test, we will create the targetMzs from the ids to prune the 
indexPeaks. // After pruning, the index engine should only keep the peaks with the target m/z values. string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -329,7 +329,7 @@ public static void TestXICConstructor() [Test] public static void TestLinearSpline() { - //VariantCallFormatDataString: Test the linear spline interpolation and differentiation + //Description: Test the linear spline interpolation and differentiation //The testing model is a linear function y = 100x, where x is the time point and y is the intensity //The slope will be 100 and the second derivative will be 0 @@ -361,7 +361,7 @@ public static void TestLinearSpline() [Test] public static void TestPeakAlignment() { - //VariantCallFormatDataString: Test the peak alignment function + //Description: Test the peak alignment function //The testing model is a triangle peak with the Apex. //The Apex of three peaks are 3, 3.1, 2.9 min //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3 @@ -410,7 +410,7 @@ public static void TestPeakAlignment() [Test] public static void TestBuildSmoothedCubicSpline_LessPoint() { - //VariantCallFormatDataString: Test the cubic spline interpolation + //Description: Test the cubic spline interpolation //The testing model has less than 5 points that cannot build the cubic spline //The cubic spline should be null @@ -531,7 +531,7 @@ public static void TestXICGroupConstructor() [Test] public static void TestXICGroup_RtDict() { - //VariantCallFormatDataString: Test the peakAlignment function in the XICGroup + //Description: Test the peakAlignment function in the XICGroup //The testing has three normal distribution XIC peaks. 
//The Apex of three peaks are 3, 3.1, 2.9 min //The time shift should be 0.1 min for the peak2 and -0.1 min for the peak3 @@ -582,7 +582,7 @@ public static void TestXICGroup_RtDict() [Test] public static void TestXICGroup_IdList() { - //VariantCallFormatDataString: Test the IdList in the XICGroup + //Description: Test the IdList in the XICGroup //The testing model has three XICs, one of this XIC has no Id, then it borrows one Id from the first XIC //If the Id is borrowed, the Id will not be added into the IdList //The IdList should contain the Ids from the first and the third XIC @@ -617,7 +617,7 @@ public static void TestXICGroup_IdList() [Test] public static void TestXICGroup_Tracking() { - //VariantCallFormatDataString: Test the peak tracking function in the XICGroup + //Description: Test the peak tracking function in the XICGroup //The testing has three normal distribution XIC peaks. //The Apex of three peaks are 20, 23, 17 min //The time shift should be +3 min for the peak2 and -3 min for the peak3 @@ -688,7 +688,7 @@ public static void TestXICGroup_Tracking() [Test] public static void TestCombinedSearching() { - //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output + //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output //There are three XIC included isobaric peaks that with 3 min gap. string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -796,7 +796,7 @@ public static void TestCombinedSearching() [Test] public static void TestPeakOutput() { - //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output + //Description: Test the IsoTracker in the FlashLFQ, checking items include the peak tracking and the peak output //There are three XIC included isobaric peaks that with 3 min gap. 
string testDataDirectory = Path.Combine(TestContext.CurrentContext.TestDirectory, "XICData"); @@ -931,7 +931,7 @@ public static void TestPeakOutput() [Test] public static void TestIsoSequence_Ambiguous() { - //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID + //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID //IsoID: DIVENY[Common Variable:Oxidation on M]FMR should be the same as DIVENYFM[Common Variable:Oxidation on M]R //Try to turn on the MBR and Isotracker at the same time @@ -1066,7 +1066,7 @@ public static void TestIsoSequence_Ambiguous() [Test] public static void TestIsoSequence_MonoIsotopicMassTolerance() { - //VariantCallFormatDataString: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID + //Description: Test the IsoTracker in the FlashLFQ, checking the algorithm can correctly recognize the IsoID //IsoID: DIVENY[Common Variable:Oxidation on M]FMR should be the same as DIVENYFM[Common Variable:Oxidation on M]R //The Monoisotopic mass are 1201.5436, 1201.5437, 1201.5438, they should be recognized as the same IsoID @@ -1491,7 +1491,7 @@ public static void TestIsoSequence_CombinedTesting() [Test] public static void TestRun_SearchingTarget() { - //VariantCallFormatDataString: we will upload a motifList for IsoTracker + //Description: we will upload a motifList for IsoTracker //Only peptide with motif on N can be searched //In this case, only one kind of peptide can be searched: baseSequence PEPNINEN -> PEPN[Mod]INEN, PEPNIN[Mod]EN, PEPNINEN[Mod] // Run 1 with PEPNIN[Mod]EN, PEPNINEN[Mod] @@ -1615,7 +1615,7 @@ public static void TestRun_SearchingTarget() [Test] public static void TestRun_IDChecking() { - //VariantCallFormatDataString: we will turn on the IDchecking for IsoTracker + //Description: we will turn on the IDchecking for IsoTracker //Only when one XIC with more than one id, we 
do the searching //In this case, run 1 has 4 ids (pepA_1, pepA_2, pepB_1, pepC_1) //run 2 has 3 ids (pepA_1, pepB_1, pepC_1) From 8380b07bc0c754f93bc0504927765bce1c3bc93e Mon Sep 17 00:00:00 2001 From: MICHAEL SHORTREED Date: Sat, 25 Oct 2025 08:37:42 -0500 Subject: [PATCH 35/38] f --- mzLib/Omics/BioPolymer/VariantApplication.cs | 1 - 1 file changed, 1 deletion(-) diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index c5fd71ed5..737f17bd5 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -1,5 +1,4 @@ using MzLibUtil; -using Omics.BioPolymer; using Omics.Modifications; namespace Omics.BioPolymer From 15940f00dc8ebddf55b4debb0708b16082b913d8 Mon Sep 17 00:00:00 2001 From: MICHAEL SHORTREED Date: Sat, 25 Oct 2025 08:44:40 -0500 Subject: [PATCH 36/38] new variant application constructor with more accurate parameters and names --- mzLib/Omics/BioPolymer/VariantApplication.cs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 737f17bd5..5b15184b6 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -13,6 +13,24 @@ namespace Omics.BioPolymer /// public static class VariantApplication { + + public static List GetVariantBioPolymers(this TBioPolymerType protein, int consensusPlusVariantIsoforms = 1, int minAlleleDepth = 0, int maxVariantsPerIsoform = 0) + where TBioPolymerType : IHasSequenceVariants + { + // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance + protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); + protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); + + // If all variants are positionally valid and any is missing VCF/genotype data, fall back to 
bounded combinatorial application + if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0)) + { + return ApplyAllVariantCombinations(protein, protein.SequenceVariations, consensusPlusVariantIsoforms).ToList(); + } + + // Otherwise, do genotype/allele-depth-aware application with combinatorics limited for heterozygous sites + return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: consensusPlusVariantIsoforms, minAlleleDepth); + } + /// /// Builds concrete variant biopolymers from a base entity. /// If any known variation lacks genotype information (no VCF or empty genotypes), a safe combinatorial expansion is used (bounded by ). From 942a930694fdc0cda1c3c68a87c9c7561bfc789b Mon Sep 17 00:00:00 2001 From: MICHAEL SHORTREED Date: Sat, 25 Oct 2025 09:03:18 -0500 Subject: [PATCH 37/38] new VariantApplication method with added maxVariantsPerIsoform --- mzLib/Omics/BioPolymer/VariantApplication.cs | 42 +++++++------------- 1 file changed, 15 insertions(+), 27 deletions(-) diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 5b15184b6..bd735b7c9 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -13,8 +13,14 @@ namespace Omics.BioPolymer /// public static class VariantApplication { - - public static List GetVariantBioPolymers(this TBioPolymerType protein, int consensusPlusVariantIsoforms = 1, int minAlleleDepth = 0, int maxVariantsPerIsoform = 0) + /// + /// Expand a base biopolymer into concrete consensus and variant isoforms by materializing substitution-style annotations + /// and either applying a bounded combinatorial expansion when genotype data is absent or performing genotype- and + /// allele-depth-aware variant application. 
The parameter limits the + /// number of isoforms returned (including the consensus); sets the minimum AD required + /// to consider an allele; is reserved for per-isoform combinatoric caps. + /// + public static List GetConsensusAndVariantBioPolymers(this TBioPolymerType protein, int consensusPlusVariantIsoforms = 1, int minAlleleDepth = 0, int maxVariantsPerIsoform = 0) where TBioPolymerType : IHasSequenceVariants { // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance @@ -32,35 +38,17 @@ public static List GetVariantBioPolymers(this } /// - /// Builds concrete variant biopolymers from a base entity. - /// If any known variation lacks genotype information (no VCF or empty genotypes), a safe combinatorial expansion is used (bounded by ). - /// Otherwise, variants are applied in a genotype/allele-depth-aware manner via . + /// LEGACY: Expand a base biopolymer into consensus and variant isoforms by materializing substitution-style annotations + /// and then either performing a bounded combinatorial expansion when genotype data is absent or running a + /// genotype- and allele-depth-aware variant application. The + /// parameter caps the total isoforms returned (including the consensus). This method is retained for backward + /// compatibility and may be removed in a future release — prefer + /// or newer variant-expansion APIs. /// - /// A biopolymer type that supports sequence variants. - /// The base biopolymer to expand into variant instances. - /// - /// Maximum number of variants to consider when creating combinations for genotype-less scenarios. - /// This caps explosion in . - /// - /// - /// Minimum AD (Allele Depth) per sample required for an allele (reference or alternate) to be considered "deep" enough to participate in genotype-aware application. - /// - /// A list of concrete variants derived from . 
public static List GetVariantBioPolymers(this TBioPolymerType protein, int maxAllowedVariantsForCombinatorics = 4, int minAlleleDepth = 1) where TBioPolymerType : IHasSequenceVariants { - // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance - protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); - protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); - - // If all variants are positionally valid and any is missing VCF/genotype data, fall back to bounded combinatorial application - if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0)) - { - return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList(); - } - - // Otherwise, do genotype/allele-depth-aware application with combinatorics limited for heterozygous sites - return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: maxAllowedVariantsForCombinatorics, minAlleleDepth); + return GetConsensusAndVariantBioPolymers(protein, consensusPlusVariantIsoforms: maxAllowedVariantsForCombinatorics, minAlleleDepth: minAlleleDepth, maxVariantsPerIsoform: 0); } /// From 5cac61fd6d20d334f01145113636f1880ac5afe9 Mon Sep 17 00:00:00 2001 From: MICHAEL SHORTREED Date: Sat, 25 Oct 2025 09:25:10 -0500 Subject: [PATCH 38/38] new methods to evaluate validity of sequence variants --- mzLib/Omics/BioPolymer/SequenceVariation.cs | 78 +++++++++++++++++++-- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index 3b5ef13f0..6fe611d4f 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -307,13 +307,83 @@ internal bool Includes(int pos) } /// - 
/// Validates positional consistency: begin must be > 0 and end must be >= begin. - /// This does not validate string/length consistency between <see cref="OriginalSequence"/> and <see cref="VariantSequence"/>. + /// Validates this variation. + /// Rules: + /// 1. Coordinates must be sensible (begin >= 1 and end >= begin). + /// 2. Variation must represent a meaningful change: + /// - Either the sequence actually changes (insertion, deletion, substitution, stop, frameshift), + /// - OR there are variant-specific modifications. + /// A “no-op” (OriginalSequence == VariantSequence with no variant-specific mods) is invalid. + /// 3. If variant-specific modifications exist, they must not violate positional constraints + /// (see <see cref="GetInvalidModificationPositions"/>). /// - /// True if positions are valid; otherwise false. public bool AreValid() { - return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition; + if (OneBasedBeginPosition <= 0 || OneBasedEndPosition < OneBasedBeginPosition) + { + return false; + } + + bool noSequenceChange = string.Equals(OriginalSequence ?? string.Empty, + VariantSequence ?? string.Empty, + StringComparison.Ordinal); + + bool hasMods = OneBasedModifications != null && OneBasedModifications.Count > 0; + + if (noSequenceChange && !hasMods) + { + return false; + } + + if (!hasMods) + { + return true; + } + + return !GetInvalidModificationPositions().Any(); + } + /// + /// Yields modification positions deemed invalid under the current edit semantics.
+ /// + private IEnumerable GetInvalidModificationPositions() + { + if (OneBasedModifications == null || OneBasedModifications.Count == 0) + { + yield break; + } + + bool isTermination = VariantSequence == "*" || VariantSequence.Length == 0; + + if (isTermination) + { + foreach (var kvp in OneBasedModifications) + { + if (kvp.Key >= OneBasedBeginPosition) + { + yield return kvp.Key; + } + } + yield break; + } + + int newSpanEnd = OneBasedBeginPosition + VariantSequence.Length - 1; + + foreach (var kvp in OneBasedModifications) + { + int pos = kvp.Key; + if (pos <= 0) + { + yield return pos; + continue; + } + + if (pos >= OneBasedBeginPosition + && pos <= OneBasedEndPosition + && pos > newSpanEnd) + { + yield return pos; + } + } } } } \ No newline at end of file