From a91d8684bd72c6e178a68138616a771501019fa2 Mon Sep 17 00:00:00 2001 From: trishorts Date: Mon, 27 Oct 2025 14:14:29 -0500 Subject: [PATCH 01/16] test deep homozygous variant with mods --- mzLib/Proteomics/Protein/Protein.cs | 10 +- .../Test/DatabaseTests/TestVariantProtein.cs | 126 +++++++++++++++- .../oneVcfProteinWithModsOnVariant.xml | 141 ++++++++++++++++++ mzLib/Test/Test.csproj | 3 + mzLib/Transcriptomics/RNA.cs | 10 +- 5 files changed, 285 insertions(+), 5 deletions(-) create mode 100644 mzLib/Test/DatabaseTests/oneVcfProteinWithModsOnVariant.xml diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs index e6694b560..1453b6630 100644 --- a/mzLib/Proteomics/Protein/Protein.cs +++ b/mzLib/Proteomics/Protein/Protein.cs @@ -603,8 +603,16 @@ public TBioPolymerType CreateVariant(string variantBaseSequence if (original is not Protein originalProtein) throw new ArgumentException("The original BioPolymer must be Protein to create a protein variant"); - var variantProtein = new Protein(variantBaseSequence, originalProtein, appliedSequenceVariants, + var variantProtein = new Protein(variantBaseSequence, originalProtein, appliedSequenceVariants, applicableProteolysisProducts, oneBasedModifications, sampleNameForVariants); + + // Remove any applied variants from the list of (unapplied) database sequence variations + if (appliedSequenceVariants != null) + { + var appliedSimple = new HashSet(appliedSequenceVariants.Select(v => v.SimpleString())); + variantProtein.SequenceVariations.RemoveAll(sv => appliedSimple.Contains(sv.SimpleString())); + } + return (TBioPolymerType)(IHasSequenceVariants)variantProtein; } #endregion diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 8ced7b208..51f3f62e2 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -576,6 +576,127 @@ public void SequenceVariationIsValidTest() Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid()); //This is valid because it is an insertion Assert.IsTrue(svValidVariantSequenceLenthIsZero.AreValid()); // This is valid because it is a deletion } + // + // In this unit test, you will not get the base protein because the variant in your VCF is homozygous ALT (GT=1/1) with sufficient depth, + // and the loader always expands VCF-only entries via ApplyVariants. The maxHeterozygousVariants parameter only limits heterozygous combinatorics; it does not prevent applying homozygous variants. + // What happens in your path + // • LoadProteinXML always expands proteins by calling GetVariantBioPolymers(maxHeterozygousVariants, minAlleleDepth). + // • If a protein has only VCF-style sequence variations, GetVariantBioPolymers routes to ApplyVariants. + // • In ApplyVariants, a homozygous ALT with depth ≥ minAlleleDepth results in replacing the base with the variant: + // • isHomozygousAlternate == true and isDeepAlternateAllele == true → newVariantProteins = ApplySingleVariant(...), i.e., base is not retained. + // • maxHeterozygousVariants only affects heterozygous logic.It does nothing for homozygous ALT variants. + // Why your specific test applies the variant Your vcfstring is GT= 1 / 1 with AD = 0,15 (ALT depth 15). With the default minAlleleDepth=1 (you didn’t override it), + // ApplyVariants treats this as homozygous ALT with sufficient depth and applies it, returning only the variant protein. + // + [Test] + public void VcfProteinWithModOnDeepHomozygousVariant() + { + string vcfstring = + "8\t98089650\t.\tG\tT\t488.77\t.\tANN=T|missense_variant|MODERATE|ERICH5|ENSG00000177459|transcript|ENST00000318528.7|protein_coding|2/3|c.633G>T|p.Lys211Asn|992/1761|633/1125|211/374||\tGT:AD:DP:GQ:PL\t1/1:0,15:15:45:517,45,0"; + + // Example VCF line with snpEff annotation: + // 8 98089650 . G T 488.77 . ANN=T|missense_variant|MODERATE|ERICH5|ENSG00000177459|transcript|ENST00000318528.7|protein_coding|2/3|c.633G>T|p.Lys211Asn|992/1761|633/1125|211/374|| GT:AD:DP:GQ:PL 1/1:0,15:15:45:517,45,0 + // + // --- VCF Standard Columns --- + // CHROM (8) → Chromosome 8 + // POS (98089650) → 1-based position of the variant (98,089,650) + // ID (.) → No variant identifier + // REF (G) → Reference allele is G + // ALT (T) → Alternate allele is T + // QUAL (488.77) → Variant call quality score + // FILTER (.) → No filter applied + // + // --- INFO Column --- + // INFO (ANN=...) holds snpEff annotation data. + // ANN format is: + // Allele | Effect | Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | + // Transcript_Biotype | Rank | HGVS.c | HGVS.p | cDNA_pos/cDNA_len | + // CDS_pos/CDS_len | AA_pos/AA_len | Distance | Errors/Warnings + // + // In this case: ANN=T|missense_variant|MODERATE|ERICH5|ENSG00000177459|transcript|ENST00000318528.7|protein_coding|2/3|c.633G>T|p.Lys211Asn|992/1761|633/1125|211/374|| + // - Allele = T + // - Effect = missense_variant (amino acid change) + // - Impact = MODERATE + // - Gene_Name = ERICH5 + // - Gene_ID = ENSG00000177459 + // - Feature_Type = transcript + // - Feature_ID = ENST00000318528.7 + // - Transcript_Biotype = protein_coding + // - Rank = 2/3 + // - HGVS.c = c.633G>T (cDNA change) + // - HGVS.p = p.Lys211Asn (protein change: Lysine 211 to Asparagine) + // - cDNA_pos/cDNA_len = 992/1761 + // - CDS_pos/CDS_len = 633/1125 + // - AA_pos/AA_len = 211/374 + // - Distance, Errors/Warnings = empty + // + // --- FORMAT Column --- + // FORMAT (GT:AD:DP:GQ:PL) defines how to read the sample column(s): + // GT → Genotype (1/1 = homozygous alternate) + // AD → Allele depth (0 reads for REF, 15 for ALT) + // DP → Read depth (15 reads at this site) + // GQ → Genotype quality (45) + // PL → Phred-scaled genotype likelihoods (517,45,0) + // + // --- SAMPLE Column --- + // Sample entry: 1/1:0,15:15:45:517,45,0 + // GT = 1/1 → Homozygous ALT genotype (both alleles = T) + // AD = 0,15 → 0 reads for REF (G), 15 reads for ALT (T) + // DP = 15 → Total coverage at this site = 15 reads + // GQ = 45 → Genotype quality + // PL = 517,45,0 → Genotype likelihoods + // + // --- Overall Summary --- + // Variant at chr8:98089650 changes G → T. + // The sample is homozygous for the ALT allele (T). + // The variant causes a missense mutation in ERICH5 (Lys211Asn). + // Variant passed filters, with moderate predicted impact. + + string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "oneVcfProteinWithModsOnVariant.xml"); + List proteinsNoDecoys = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un, + maxHeterozygousVariants: 0); + // What: Ensure only one proteoform was created. + // Expect: 1. + // Why: The VCF contains a single homozygous ALT (GT=1/1) with sufficient depth, so the loader applies the variant and does not retain a separate base proteoform (DecoyType.None, maxHeterozygousVariants=0). + Assert.AreEqual(1, proteinsNoDecoys.Count); + + // What: Verify the returned proteoform is the variant with the expected accession suffix. + // Expect: At least one protein has accession "Q6P6B1_K211N". + // Why: Variant accessions are formed by appending the applied variant’s SimpleString (K211N) to the base accession (VariantApplication.GetAccession). + Assert.That(proteinsNoDecoys.Any(a => a.Accession == "Q6P6B1_K211N")); + + // What: Confirm the unmodified base accession is not present as its own proteoform. + // Expect: No protein has accession "Q6P6B1". + // Why: For homozygous ALT with depth >= minAlleleDepth, only the variant proteoform is emitted; the base (reference) proteoform is not included. + Assert.That(proteinsNoDecoys.Any(a => a.Accession == "Q6P6B1"), Is.False); + + // What: The ConsensusVariant (reference) is still tracked for the proteoform. + // Expect: At least one protein references a ConsensusVariant with accession "Q6P6B1". + // Why: ConsensusVariant points to the original/reference protein, even when the returned proteoform is a variant. + Assert.That(proteinsNoDecoys.Any(a => a.ConsensusVariant.Accession == "Q6P6B1")); + + // What: Check the number of distinct positions with possible localized modifications on the variant proteoform. + // Expect: 7. + // Why: OneBasedPossibleLocalizedModifications is a dictionary keyed by 1-based residue index; Values.Count equals the number of distinct modified positions after variant application and index adjustments. + Assert.AreEqual(7, proteinsNoDecoys.First().OneBasedPossibleLocalizedModifications.Values.Count); + + // What: Verify that, on the variant proteoform, there are no remaining unapplied database sequence variants. + // Expect: 0. + // Why: Applied variants are removed from SequenceVariations during variant construction, and this case has only the single VCF-driven variant applied. + Assert.AreEqual(0, proteinsNoDecoys.First().SequenceVariations.Count); + + // What: Confirm exactly one sequence variant was applied to produce this proteoform. + // Expect: 1. + // Why: The input specifies a single homozygous variant (K211N), so exactly one applied variation is recorded. + Assert.AreEqual(1, proteinsNoDecoys.First().AppliedSequenceVariations.Count); + + // What: Verify the total number of modifications annotated on the applied variant region. + // Expect: 2. + // Why: AppliedSequenceVariations.First().OneBasedModifications is a dictionary of position -> list of modifications on the variant region. + // In this case there is one key (position) with two modifications; summing the list counts across all keys yields 2. + Assert.AreEqual(2, proteinsNoDecoys.First().AppliedSequenceVariations.First().OneBasedModifications.Values.Sum(v => v.Count)); + } + [Test] public void VariantModificationTest() { @@ -585,9 +706,9 @@ public void VariantModificationTest() List variantTargets = targets.Where(p => p.AppliedSequenceVariations.Count >= 1).ToList(); List decoys = variantProteins.Where(p => p.IsDecoy == true).ToList(); List variantDecoys = decoys.Where(p => p.AppliedSequenceVariations.Count >= 1).ToList(); - bool homozygousVariant = targets.Select(p => p.Accession).Contains("Q6P6B1"); + bool homozygousVariant = targets.Select(p => p.Accession).Contains("Q6P6B1"); - var variantMods = targets.SelectMany(p => p.AppliedSequenceVariations.Where(x=>x.OneBasedModifications.Count>= 1)).ToList(); + var variantMods = targets.SelectMany(p => p.AppliedSequenceVariations.Where(x => x.OneBasedModifications.Count >= 1)).ToList(); var decoyMods = decoys.SelectMany(p => p.AppliedSequenceVariations.Where(x => x.OneBasedModifications.Count >= 1)).ToList(); var negativeResidues = decoyMods.SelectMany(x => x.OneBasedModifications.Where(w => w.Key < 0)).ToList(); bool namingWrong = targets.Select(p => p.Accession).Contains("Q8N865_H300R_A158T_H300R"); @@ -603,7 +724,6 @@ public void VariantModificationTest() Assert.AreEqual(2, variantMods.Count); Assert.AreEqual(2, decoyMods.Count); Assert.AreEqual(0, negativeResidues.Count); - } [Test] public void WriteProteinXmlWithVariantsDiscoveredAsModifications2() diff --git a/mzLib/Test/DatabaseTests/oneVcfProteinWithModsOnVariant.xml b/mzLib/Test/DatabaseTests/oneVcfProteinWithModsOnVariant.xml new file mode 100644 index 000000000..d4721125a --- /dev/null +++ b/mzLib/Test/DatabaseTests/oneVcfProteinWithModsOnVariant.xml @@ -0,0 +1,141 @@ + + + ID Ammonia loss on N +MT Common Artifact +TG N +PP Anywhere. +CF H-3N-1 +MM -17.026549101 + +// + ID Hydroxylation on K +MT Common Biological +TG K +PP Anywhere. +CF O +MM 15.99491462 + +// + ID Hydroxylation on N +MT Common Biological +TG N +PP Anywhere. +CF O +MM 15.99491462 + +// + ID Hydroxylation on P +MT Common Biological +TG P +PP Anywhere. +CF O +MM 15.99491462 +DI HCD:170.060123533 + +// + ID Phosphoserine on S +AC PTM-0253 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00046 +DR RESID; AA0037 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +TR Viruses; taxId:10239 (Viruses) +KW Phosphoprotein +// + + Q6P6B1 + ENST00000318528 + + + Glutamate-rich protein 5 + + + + ERICH5 + C8orf47 + ERICH5 + ENSG00000177459 + + + Homo sapiens + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + K + N + + + + + + + + + + + + + + + + + + + + + + + + + MGCSSSALNKAGDSSRFPSVTSNEHFSTAEESESCFAQPKPHALGRESTVDGNVQRESRPPLQKLKVSAEPTANGVKPLQEQPLAKDVAPGRDATDQSGSTEKTQPGEGLEESGPPQPGGKEDAPAAEGKKKDAGAGTEAESLKGNAEAQPLGPEAKGQPLQAAVEKDSLRAVEVTENPQTAAEMKPLGTTENVLTLQIAGELQPQGTVGKDEQAPLLETISKENESPEILEGSQFVETAEEQQLQATLGKEEQPQLLERIPKENVTPEVLDRSQLVEKPVMNDPFHKTPEGPGNMEQIQPEGIVGSMEHPARNVEAGAYVEMIRNIHTNEEDQRIEGETGEKVETDMENEKVSEGAETKEEETGEVVDLSAAT + + + + diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index 59c393b39..c2e14a6c2 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -43,6 +43,9 @@ Always + + Always + Always diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs index bfdefdf87..67b350297 100644 --- a/mzLib/Transcriptomics/RNA.cs +++ b/mzLib/Transcriptomics/RNA.cs @@ -73,8 +73,16 @@ public override TBioPolymerType CreateVariant(string variantBas if (original is not RNA rna) throw new ArgumentException("The original nucleic acid must be RNA to create an RNA variant."); - var variantRNA = new RNA(variantBaseSequence, rna, appliedSequenceVariants, + var variantRNA = new RNA(variantBaseSequence, rna, appliedSequenceVariants, applicableTruncationProducts, oneBasedModifications, sampleNameForVariants); + + // Remove any applied variants from the list of (unapplied) database sequence variations + if (appliedSequenceVariants != null) + { + var appliedSimple = new HashSet(appliedSequenceVariants.Select(v => v.SimpleString())); + variantRNA.SequenceVariations.RemoveAll(sv => appliedSimple.Contains(sv.SimpleString())); + } + return (TBioPolymerType)(IHasSequenceVariants)variantRNA; } From 787fb98ad5ef195116558096f59753783a5d54e6 Mon Sep 17 00:00:00 2001 From: trishorts Date: Mon, 27 Oct 2025 14:27:02 -0500 Subject: [PATCH 02/16] protein variants fixed --- .../Test/DatabaseTests/TestVariantProtein.cs | 64 +++++++++++++------ 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 51f3f62e2..c540d90e2 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -133,19 +133,19 @@ public static void LoadSeqVarModifications(string databaseName, int modIdx, int Assert.AreEqual(modIdx, target.OneBasedPossibleLocalizedModifications.Single().Key); Assert.AreEqual(1, target.AppliedSequenceVariations.Count()); Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, target.SequenceVariations.Count()); - Assert.AreEqual(modIdx, target.SequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, target.SequenceVariations.Single().OneBasedModifications.Count); - Assert.AreEqual(modIdx, target.SequenceVariations.Single().OneBasedModifications.Single().Key); //PEP[mod]TID, MEP[mod]TID + Assert.AreEqual(0, target.SequenceVariations.Count()); + Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedBeginPosition); + Assert.AreEqual(1, target.AppliedSequenceVariations.Single().OneBasedModifications.Count); + Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key); //PEP[mod]TID, MEP[mod]TID var decoy = proteins[1]; Assert.AreEqual(1, decoy.OneBasedPossibleLocalizedModifications.Count); Assert.AreEqual(reversedModIdx, decoy.OneBasedPossibleLocalizedModifications.Single().Key); //DITP[mod]EP, MDITP[mod]E Assert.AreEqual(1, decoy.AppliedSequenceVariations.Count()); Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, decoy.SequenceVariations.Count()); - Assert.AreEqual(reversedModIdx, decoy.SequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, decoy.SequenceVariations.Single().OneBasedModifications.Count); - Assert.AreEqual(reversedModIdx, decoy.SequenceVariations.Single().OneBasedModifications.Single().Key); + Assert.AreEqual(0, decoy.SequenceVariations.Count()); + Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition); + Assert.AreEqual(1, decoy.AppliedSequenceVariations.Single().OneBasedModifications.Count); + Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key); string rewriteDbName = $"{Path.GetFileNameWithoutExtension(databaseName)}rewrite.xml"; ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins.Where(p => !p.IsDecoy).ToList(), Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", rewriteDbName)); @@ -156,19 +156,19 @@ public static void LoadSeqVarModifications(string databaseName, int modIdx, int Assert.AreEqual(modIdx, target.OneBasedPossibleLocalizedModifications.Single().Key); Assert.AreEqual(1, target.AppliedSequenceVariations.Count()); Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, target.SequenceVariations.Count()); - Assert.AreEqual(modIdx, target.SequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, target.SequenceVariations.Single().OneBasedModifications.Count); - Assert.AreEqual(modIdx, target.SequenceVariations.Single().OneBasedModifications.Single().Key); + Assert.AreEqual(0, target.SequenceVariations.Count()); + Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedBeginPosition); + Assert.AreEqual(1, target.AppliedSequenceVariations.Single().OneBasedModifications.Count); + Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key); decoy = proteins[1]; Assert.AreEqual(1, decoy.OneBasedPossibleLocalizedModifications.Count); Assert.AreEqual(reversedModIdx, decoy.OneBasedPossibleLocalizedModifications.Single().Key); Assert.AreEqual(1, decoy.AppliedSequenceVariations.Count()); Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, decoy.SequenceVariations.Count()); - Assert.AreEqual(reversedModIdx, decoy.SequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, decoy.SequenceVariations.Single().OneBasedModifications.Count); - Assert.AreEqual(reversedModIdx, decoy.SequenceVariations.Single().OneBasedModifications.Single().Key); + Assert.AreEqual(0, decoy.SequenceVariations.Count()); + Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition); + Assert.AreEqual(1, decoy.AppliedSequenceVariations.Single().OneBasedModifications.Count); + Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key); } [TestCase("ranges1.xml", 1, 2, 5, 6)] // without starting methionine @@ -269,15 +269,37 @@ public static void ReverseDecoySpliceSites(string databaseName, int beginIdx, in } [Test] - [TestCase("HomozygousHLA.xml", 1, 18)] - [TestCase("HomozygousHLA.xml", 10, 17)] - public static void HomozygousVariantsAtVariedDepths(string filename, int minVariantDepth, int appliedCount) + public static void HomozygousVariantsDepthOne() + { + string filename = "HomozygousHLA.xml"; + int minVariantDepth = 1; + int appliedCount = 18; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", filename), true, + DecoyType.None, null, false, null, out var unknownModifications, minAlleleDepth: minVariantDepth); + Assert.AreEqual(1, proteins.Count); + //all sequence variants are applied and therefore removed from the SequenceVariations list + Assert.AreEqual(0, proteins[0].SequenceVariations.Count()); // some redundant + + Assert.AreEqual(appliedCount, proteins[0].AppliedSequenceVariations.Count()); // some redundant + Assert.AreEqual(appliedCount, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes + Assert.AreEqual(1, proteins[0].GetVariantBioPolymers().Count); + var variantProteins = proteins[0].GetVariantBioPolymers(); + List peptides = proteins.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList(); + } + + [Test] + + public static void HomozygousVariantsDepth17() { + string filename = "HomozygousHLA.xml"; + int minVariantDepth = 10; + int appliedCount = 17; var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", filename), true, DecoyType.None, null, false, null, out var unknownModifications, minAlleleDepth: minVariantDepth); Assert.AreEqual(1, proteins.Count); - Assert.AreEqual(18, proteins[0].SequenceVariations.Count()); // some redundant - Assert.AreEqual(18, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes + // 17 of 18 sequence variants are applied and therefore removed from the SequenceVariations list leaving 1 behind + Assert.AreEqual(1, proteins[0].SequenceVariations.Count()); // some redundant + Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes Assert.AreEqual(appliedCount, proteins[0].AppliedSequenceVariations.Count()); // some redundant Assert.AreEqual(appliedCount, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes Assert.AreEqual(1, proteins[0].GetVariantBioPolymers().Count); From 5feda66a912c89e6a9231451e68030f38d366977 Mon Sep 17 00:00:00 2001 From: trishorts Date: Mon, 27 Oct 2025 14:56:24 -0500 Subject: [PATCH 03/16] unit test fix --- .../Test/Transcriptomics/TestVariantOligo.cs | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/mzLib/Test/Transcriptomics/TestVariantOligo.cs b/mzLib/Test/Transcriptomics/TestVariantOligo.cs index aa643dec5..2554d9482 100644 --- a/mzLib/Test/Transcriptomics/TestVariantOligo.cs +++ b/mzLib/Test/Transcriptomics/TestVariantOligo.cs @@ -35,23 +35,38 @@ public static void VariantRna() RNA v = new RNA("CAUA", p, new[] { new SequenceVariation(3, "A", "U", "desc", null) }, null, null, null); Assert.That(v.ConsensusVariant, Is.EqualTo(p)); } - + /// + /// This is an interesting test case. The RNA XML contains one entry. + /// The entry has 5 sequence variations, but they are perfectly identical + /// I guess the correct behavior is to only apply the variant once, since applying it multiple times does nothing + /// [Test] public void VariantXml() { string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", "SeqVar.xml"); - List variantProteins = RnaDbLoader.LoadRnaXML(file, true, DecoyType.None, false, AllKnownMods, [], out _); + List variantProteins = RnaDbLoader.LoadRnaXML(file, true, DecoyType.None, false, AllKnownMods, [], out _, + minAlleleDepth: 1000); + + Assert.That(1, Is.EqualTo(variantProteins.Count)); + Assert.That(0, Is.EqualTo(variantProteins.Count(a=>a.AppliedSequenceVariations.Count > 0))); + + variantProteins = RnaDbLoader.LoadRnaXML(file, true, DecoyType.None, false, AllKnownMods, [], out _, + maxHeterozygousVariants: 10, + minAlleleDepth: 0); + + Assert.That(1, Is.EqualTo(variantProteins.Count)); + Assert.That(1, Is.EqualTo(variantProteins.Count(a => a.AppliedSequenceVariations.Count > 0))); - Assert.That(variantProteins.First().ConsensusVariant.SequenceVariations.Count(), Is.EqualTo(5)); - Assert.That(variantProteins.Count, Is.EqualTo(1)); // there is only one unique amino acid change Assert.That(variantProteins.First().ConsensusVariant.BaseSequence, Is.Not.EqualTo(variantProteins.First().BaseSequence)); Assert.That(variantProteins.First().ConsensusVariant.BaseSequence[116], Is.EqualTo('C')); Assert.That(variantProteins.First().BaseSequence[116], Is.EqualTo('G')); + Assert.That(variantProteins.First().ConsensusVariant.Name, Is.Not.EqualTo(variantProteins.First().Name)); Assert.That(variantProteins.First().ConsensusVariant.FullName, Is.Not.EqualTo(variantProteins.First().FullName)); Assert.That(variantProteins.First().ConsensusVariant.Accession, Is.Not.EqualTo(variantProteins.First().Accession)); List oligos = variantProteins.SelectMany(vp => vp.Digest(new RnaDigestionParams(), null, null)).ToList(); + Assert.That(44, Is.EqualTo(oligos.Count)); } [Test] From 104f153d2ecacd5f0b53c2ef473474d6aa33569a Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 28 Oct 2025 10:13:37 -0500 Subject: [PATCH 04/16] seems like progress is being made --- .../Omics/BioPolymer/IHasSequenceVariants.cs | 18 +++-- mzLib/Omics/BioPolymer/VariantApplication.cs | 28 +++++-- mzLib/Proteomics/Protein/Protein.cs | 35 ++++++--- .../Test/DatabaseTests/TestVariantProtein.cs | 22 +++++- .../Test/Transcriptomics/TestVariantOligo.cs | 17 ++++- mzLib/Transcriptomics/NucleicAcid.cs | 10 ++- mzLib/Transcriptomics/RNA.cs | 75 +++++++++++++------ 7 files changed, 157 insertions(+), 48 deletions(-) diff --git a/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs b/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs index b9bb73015..98f68131d 100644 --- a/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs +++ b/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs @@ -47,13 +47,19 @@ public interface IHasSequenceVariants /// List TruncationProducts { get; } - + /// - /// Used to construct a new variant of the same type as the original and is called in + /// Constructs a new variant biopolymer instance from the original, applying the specified sequence variants and modifications. + /// The method sets both the full set of database sequence variants and the subset of applied variants, ensuring that only unapplied variants remain in the SequenceVariations list, + /// while applied variants are tracked in AppliedSequenceVariations. This enables accurate representation of both the original and variant-specific annotations. /// - /// The generic structure enables proteins to produce proteins and RNA to produce RNA - /// - TBioPolymerType CreateVariant(string variantBaseSequence, TBioPolymerType original, IEnumerable appliedSequenceVariants, - IEnumerable applicableProteolysisProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) + TBioPolymerType CreateVariant( + string variantBaseSequence, + TBioPolymerType original, + IEnumerable? sequenceVariants, // NEW: pass DB variants explicitly + IEnumerable? appliedSequenceVariants, + IEnumerable applicableProteolysisProducts, + IDictionary> oneBasedModifications, + string sampleNameForVariants) where TBioPolymerType : IHasSequenceVariants; } \ No newline at end of file diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 1669c3afe..1a78a5873 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -104,14 +104,24 @@ public static List ApplyVariants(TBioPolymerTy .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first .ToList(); - TBioPolymerType proteinCopy = protein.CreateVariant(protein.BaseSequence, protein, null, protein.TruncationProducts, protein.OneBasedPossibleLocalizedModifications, null); - - // If there aren't any variants to apply, just return the base protein + // If there aren't any variants to apply, just return the original as-is if (uniqueEffectsToApply.Count == 0) { - return new List { proteinCopy }; + return new List { protein }; } + // Start from a normalized copy whose “unapplied” list is the DB list. + TBioPolymerType proteinCopy = protein.CreateVariant( + protein.BaseSequence, + protein, + protein.SequenceVariations, // pass DB variants + null, // none applied yet + protein.TruncationProducts, + protein.OneBasedPossibleLocalizedModifications, + null); + + + HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys)); List variantProteins = new(); List newVariantProteins = new(); @@ -253,7 +263,15 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria Dictionary> adjustedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein); List adjustedAppliedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations); - return protein.CreateVariant(variantSequence, protein, adjustedAppliedVariations, adjustedProteolysisProducts, adjustedModifications, individual); + // Pass BOTH lists: current DB list (possibly already filtered on the interim proteoform) + newly applied + return protein.CreateVariant( + variantSequence, + protein, + protein.SequenceVariations, // carry forward the current “unapplied” list + adjustedAppliedVariations, + adjustedProteolysisProducts, + adjustedModifications, + individual); } /// diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs index 1453b6630..01cc6c40e 100644 --- a/mzLib/Proteomics/Protein/Protein.cs +++ b/mzLib/Proteomics/Protein/Protein.cs @@ -133,8 +133,13 @@ public Protein(Protein originalProtein, string newBaseSequence) /// /// /// - public Protein(string variantBaseSequence, Protein protein, IEnumerable appliedSequenceVariations, - IEnumerable applicableProteolysisProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) + public Protein(string variantBaseSequence, + Protein protein, + IEnumerable? sequenceVariations, + IEnumerable? appliedSequenceVariations, + IEnumerable applicableProteolysisProducts, + IDictionary> oneBasedModifications, + string sampleNameForVariants) : this( variantBaseSequence, VariantApplication.GetAccession(protein, appliedSequenceVariations), @@ -143,11 +148,11 @@ public Protein(string variantBaseSequence, Protein protein, IEnumerable x.Key, x => x.Value) : new Dictionary>(), proteolysisProducts: new List(applicableProteolysisProducts ?? new List()), name: VariantApplication.GetVariantName(protein.Name, appliedSequenceVariations), - fullName: VariantApplication.GetVariantName(protein.FullName, appliedSequenceVariations), + fullName: VariantApplication.GetVariantName(protein.FullName, appliedSequenceVariations), isDecoy: protein.IsDecoy, isContaminant: protein.IsContaminant, databaseReferences: new List(protein.DatabaseReferences), - sequenceVariations: new List(protein.SequenceVariations), + sequenceVariations: new List(sequenceVariations ?? protein.SequenceVariations), disulfideBonds: new List(protein.DisulfideBonds), spliceSites: new List(protein.SpliceSites), databaseFilePath: protein.DatabaseFilePath, @@ -596,18 +601,30 @@ private Protein GenerateFullyLabeledSilacProtein(SilacLabel label) /// public IDictionary> OriginalNonVariantModifications { get; set; } - public TBioPolymerType CreateVariant(string variantBaseSequence, TBioPolymerType original, IEnumerable appliedSequenceVariants, - IEnumerable applicableProteolysisProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) + public TBioPolymerType CreateVariant( + string variantBaseSequence, + TBioPolymerType original, + IEnumerable? sequenceVariants, + IEnumerable? appliedSequenceVariants, + IEnumerable applicableProteolysisProducts, + IDictionary> oneBasedModifications, + string sampleNameForVariants) where TBioPolymerType : IHasSequenceVariants { if (original is not Protein originalProtein) throw new ArgumentException("The original BioPolymer must be Protein to create a protein variant"); - var variantProtein = new Protein(variantBaseSequence, originalProtein, appliedSequenceVariants, - applicableProteolysisProducts, oneBasedModifications, sampleNameForVariants); + var variantProtein = new Protein( + variantBaseSequence, + originalProtein, + sequenceVariants ?? originalProtein.SequenceVariations, + appliedSequenceVariants, + applicableProteolysisProducts, + oneBasedModifications, + sampleNameForVariants); // Remove any applied variants from the list of (unapplied) database sequence variations - if (appliedSequenceVariants != null) + if (appliedSequenceVariants != null && appliedSequenceVariants.Any()) { var appliedSimple = new HashSet(appliedSequenceVariants.Select(v => v.SimpleString())); variantProtein.SequenceVariations.RemoveAll(sv => appliedSimple.Contains(sv.SimpleString())); diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index c540d90e2..06a909bb1 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -50,7 +50,17 @@ public static void TearDown() public static void VariantProtein() { Protein p = new Protein("MAAA", "accession"); - Protein v = new Protein("MAVA", p, new[] { new SequenceVariation(3, "A", "V", "desc", null) }, null, null, null); + + // New ctor: (variantBaseSequence, protein, sequenceVariants, appliedSequenceVariants, proteolysisProducts, mods, sampleName) + Protein v = new Protein( + "MAVA", + p, + p.SequenceVariations, // DB variants (empty in this test) + new[] { new SequenceVariation(3, "A", "V", "desc", null) }, // applied + null, + null, + null); + Assert.AreEqual(p, v.ConsensusVariant); } @@ -420,7 +430,15 @@ public static void CrashOnCreateVariantFromRNA() var rna = new RNA("GUACUGACU"); NUnit.Framework.Assert.Throws(() => { - proteins[0].CreateVariant(proteins[0].BaseSequence, rna, [], [], new Dictionary>(), ""); + // New signature requires sequenceVariants and applicableTruncationProducts + proteins[0].CreateVariant( + proteins[0].BaseSequence, + rna, + [], // sequenceVariants + [], // appliedSequenceVariants + [], // applicableTruncationProducts + new Dictionary>(), + ""); }); } diff --git a/mzLib/Test/Transcriptomics/TestVariantOligo.cs b/mzLib/Test/Transcriptomics/TestVariantOligo.cs index 2554d9482..c02f7e092 100644 --- a/mzLib/Test/Transcriptomics/TestVariantOligo.cs +++ b/mzLib/Test/Transcriptomics/TestVariantOligo.cs @@ -325,7 +325,14 @@ public static void CrashOnCreateVariantFromProtein() var protein = new Protein("PEPTIDE", "accession"); NUnit.Framework.Assert.Throws(() => { - rnas[0].CreateVariant(rnas[0].BaseSequence, protein, [], [], new Dictionary>(), ""); + rnas[0].CreateVariant( + rnas[0].BaseSequence, + protein, + [], // sequenceVariants + [], // appliedSequenceVariants + [], // applicableTruncationProducts + new Dictionary>(), // mods + ""); // sampleNameForVariants }); } @@ -402,7 +409,13 @@ public void VariantModificationTest() // One of the mod residues is removed by the variant. string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", "VariantModsGPTMD.xml"); List rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.Reverse, false, AllKnownMods, [], out var unknownModifications); - Assert.That(rna.All(p => p.SequenceVariations.Count == 1)); + + List rnaWithAppliedVariants = rna.Where(p => p.AppliedSequenceVariations.Count > 0).ToList(); + + List rnaWithoutAppliedVariants = rna.Where(p => p.AppliedSequenceVariations.Count == 0).ToList(); + + Assert.That(rnaWithoutAppliedVariants.All(p => p.SequenceVariations.Count == 1)); + Assert.That(rnaWithAppliedVariants.All(p => p.SequenceVariations.Count == 0)); List targets = rna.Where(p => p.IsDecoy == false).ToList(); RNA variantTarget = targets.First(p => p.AppliedSequenceVariations.Count >= 1); diff --git a/mzLib/Transcriptomics/NucleicAcid.cs b/mzLib/Transcriptomics/NucleicAcid.cs index 6f11d8de1..12d046723 100644 --- a/mzLib/Transcriptomics/NucleicAcid.cs +++ b/mzLib/Transcriptomics/NucleicAcid.cs @@ -268,8 +268,14 @@ public IEnumerable Digest(RnaDigestionParams digestionParamete public IDictionary> OriginalNonVariantModifications { get; set; } // Abstract so we can do this construction in the appropriate derived class - public abstract TBioPolymerType CreateVariant(string variantBaseSequence, TBioPolymerType original, IEnumerable appliedSequenceVariants, - IEnumerable applicableProteolysisProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) + public abstract TBioPolymerType CreateVariant( + string variantBaseSequence, + TBioPolymerType original, + IEnumerable? sequenceVariants, + IEnumerable? appliedSequenceVariants, + IEnumerable applicableProteolysisProducts, + IDictionary> oneBasedModifications, + string sampleNameForVariants) where TBioPolymerType : IHasSequenceVariants; #endregion diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs index 67b350297..7a5d9921a 100644 --- a/mzLib/Transcriptomics/RNA.cs +++ b/mzLib/Transcriptomics/RNA.cs @@ -38,28 +38,47 @@ public RNA(string sequence, string accession, truncationProducts, sequenceVariations, appliedSequenceVariations, sampleNameForVariants, fullName) { } - + /// /// For creating a variant of an existing nucleic acid. Filters out modifications that do not match their nucleotide target site. /// - public RNA(string variantBaseSequence, NucleicAcid original, IEnumerable? appliedSequenceVariants, - IEnumerable? applicableTruncationProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) - : this(variantBaseSequence, VariantApplication.GetAccession(original, appliedSequenceVariants), - oneBasedModifications, - original.FivePrimeTerminus, original.ThreePrimeTerminus, - VariantApplication.GetVariantName(original.Name, appliedSequenceVariants), - original.Organism, original.DatabaseFilePath, original.IsContaminant, - original.IsDecoy, original.GeneNames, original.AdditionalDatabaseFields, - [..applicableTruncationProducts ?? new List()], original.SequenceVariations, - [..appliedSequenceVariants ?? new List()], sampleNameForVariants, - VariantApplication.GetVariantName(original.FullName, appliedSequenceVariants)) + // NEW: canonical constructor that accepts sequenceVariants explicitly + public RNA(string variantBaseSequence, + NucleicAcid original, + IEnumerable? sequenceVariants, + IEnumerable? appliedSequenceVariants, + IEnumerable? applicableTruncationProducts, + IDictionary> oneBasedModifications, + string sampleNameForVariants) + : this(variantBaseSequence, + VariantApplication.GetAccession(original, appliedSequenceVariants), + oneBasedModifications, + original.FivePrimeTerminus, original.ThreePrimeTerminus, + VariantApplication.GetVariantName(original.Name, appliedSequenceVariants), + original.Organism, original.DatabaseFilePath, original.IsContaminant, + original.IsDecoy, original.GeneNames, original.AdditionalDatabaseFields, + [.. (applicableTruncationProducts ?? new List())], + // COPY: do not alias the DB list + [.. (sequenceVariants ?? original.SequenceVariations ?? new List())], + [.. (appliedSequenceVariants ?? new List())], + sampleNameForVariants, + VariantApplication.GetVariantName(original.FullName, appliedSequenceVariants)) { ConsensusVariant = original.ConsensusVariant; OriginalNonVariantModifications = ConsensusVariant.OriginalNonVariantModifications; AppliedSequenceVariations = (appliedSequenceVariants ?? new List()).ToList(); SampleNameForVariants = sampleNameForVariants; } - + // Back-compat overload (kept for existing call sites) + public RNA(string variantBaseSequence, + NucleicAcid original, + IEnumerable? appliedSequenceVariants, + IEnumerable? applicableTruncationProducts, + IDictionary> oneBasedModifications, + string sampleNameForVariants) + : this(variantBaseSequence, original, original.SequenceVariations, appliedSequenceVariants, applicableTruncationProducts, oneBasedModifications, sampleNameForVariants) + { + } public override IBioPolymer CloneWithNewSequenceAndMods(string newBaseSequence, IDictionary>? newMods = null) { // Create a new rna with the new base sequence and modifications @@ -67,23 +86,35 @@ public override IBioPolymer CloneWithNewSequenceAndMods(string newBaseSequence, return newRna; } - public override TBioPolymerType CreateVariant(string variantBaseSequence, TBioPolymerType original, IEnumerable appliedSequenceVariants, - IEnumerable applicableTruncationProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) + public override TBioPolymerType CreateVariant( + string variantBaseSequence, + TBioPolymerType original, + IEnumerable? sequenceVariants, + IEnumerable? appliedSequenceVariants, + IEnumerable applicableTruncationProducts, + IDictionary> oneBasedModifications, + string sampleNameForVariants) { - if (original is not RNA rna) + if (original is not RNA originalRna) throw new ArgumentException("The original nucleic acid must be RNA to create an RNA variant."); - var variantRNA = new RNA(variantBaseSequence, rna, appliedSequenceVariants, - applicableTruncationProducts, oneBasedModifications, sampleNameForVariants); + var variant = new RNA( + variantBaseSequence, + originalRna, + sequenceVariants, + appliedSequenceVariants, + applicableTruncationProducts, + oneBasedModifications, + sampleNameForVariants); - // Remove any applied variants from the list of (unapplied) database sequence variations - if (appliedSequenceVariants != null) + // Remove only actually-applied variants from the VARIANT’s copy + if (appliedSequenceVariants != null && appliedSequenceVariants.Any()) { var appliedSimple = new HashSet(appliedSequenceVariants.Select(v => v.SimpleString())); - variantRNA.SequenceVariations.RemoveAll(sv => appliedSimple.Contains(sv.SimpleString())); + variant.SequenceVariations.RemoveAll(sv => appliedSimple.Contains(sv.SimpleString())); } - return (TBioPolymerType)(IHasSequenceVariants)variantRNA; + return (TBioPolymerType)(IHasSequenceVariants)variant; } public bool Equals(RNA? other) From 5f3fe860af2df02db8559a3bf08bbf53331c5215 Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 28 Oct 2025 10:22:34 -0500 Subject: [PATCH 05/16] move sv to av --- .../Test/Transcriptomics/TestVariantOligo.cs | 80 +++++++++++++------ 1 file changed, 57 insertions(+), 23 deletions(-) diff --git a/mzLib/Test/Transcriptomics/TestVariantOligo.cs b/mzLib/Test/Transcriptomics/TestVariantOligo.cs index c02f7e092..214516887 100644 --- a/mzLib/Test/Transcriptomics/TestVariantOligo.cs +++ b/mzLib/Test/Transcriptomics/TestVariantOligo.cs @@ -82,19 +82,19 @@ public static void LoadSeqVarModifications(string databaseName, int modIdx, int Assert.That(target.OneBasedPossibleLocalizedModifications.Single().Key, Is.EqualTo(modIdx)); Assert.That(target.AppliedSequenceVariations.Count(), Is.EqualTo(1)); Assert.That(target.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); - Assert.That(target.SequenceVariations.Count(), Is.EqualTo(1)); - Assert.That(target.SequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); - Assert.That(target.SequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); - Assert.That(target.SequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(modIdx)); //PEP[mod]TID, MEP[mod]TID + Assert.That(target.SequenceVariations.Count(), Is.EqualTo(0)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(modIdx)); //PEP[mod]TID, MEP[mod]TID var decoy = rna[1]; Assert.That(decoy.OneBasedPossibleLocalizedModifications.Count, Is.EqualTo(1)); Assert.That(decoy.OneBasedPossibleLocalizedModifications.Single().Key, Is.EqualTo(reversedModIdx)); //DITP[mod]EP, MDITP[mod]E Assert.That(decoy.AppliedSequenceVariations.Count(), Is.EqualTo(1)); Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); - Assert.That(decoy.SequenceVariations.Count(), Is.EqualTo(1)); - Assert.That(decoy.SequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); - Assert.That(decoy.SequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); - Assert.That(decoy.SequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(reversedModIdx)); + Assert.That(decoy.SequenceVariations.Count(), Is.EqualTo(0)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(reversedModIdx)); string rewriteDbName = $"{Path.GetFileNameWithoutExtension(databaseName)}rewrite.xml"; ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), rna.Where(p => !p.IsDecoy).ToList(), Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", rewriteDbName)); @@ -105,19 +105,19 @@ public static void LoadSeqVarModifications(string databaseName, int modIdx, int Assert.That(target.OneBasedPossibleLocalizedModifications.Single().Key, Is.EqualTo(modIdx)); Assert.That(target.AppliedSequenceVariations.Count(), Is.EqualTo(1)); Assert.That(target.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); - Assert.That(target.SequenceVariations.Count(), Is.EqualTo(1)); - Assert.That(target.SequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); - Assert.That(target.SequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); - Assert.That(target.SequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(modIdx)); + Assert.That(target.SequenceVariations.Count(), Is.EqualTo(0)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(modIdx)); decoy = rna[1]; Assert.That(decoy.OneBasedPossibleLocalizedModifications.Count, Is.EqualTo(1)); Assert.That(decoy.OneBasedPossibleLocalizedModifications.Single().Key, Is.EqualTo(reversedModIdx)); Assert.That(decoy.AppliedSequenceVariations.Count(), Is.EqualTo(1)); Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); - Assert.That(decoy.SequenceVariations.Count(), Is.EqualTo(1)); - Assert.That(decoy.SequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); - Assert.That(decoy.SequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); - Assert.That(decoy.SequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(reversedModIdx)); + Assert.That(decoy.SequenceVariations.Count(), Is.EqualTo(0)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(reversedModIdx)); } [TestCase("ranges1.xml", 1, 2, 5, 6)] // trunc excludes natural 3' @@ -150,17 +150,51 @@ public static void ReverseDecoyProteolysisProducts(string databaseName, int begi } [Test] - [TestCase("HomozygousHLA.xml", 1, 18)] - [TestCase("HomozygousHLA.xml", 10, 17)] - public static void HomozygousVariantsAtVariedDepths(string filename, int minVariantDepth, int appliedCount) + public static void HomozygousVariantsAtDepth1() + { + string filename = "HomozygousHLA.xml"; + int minVariantDepth = 1; + int appliedCount = 18; + + string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", filename); + var rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.None, false, AllKnownMods, [], out var unknownModifications, minAlleleDepth: minVariantDepth); + + Assert.That(rna.Count, Is.EqualTo(1)); + + // All variants applied => none left in SequenceVariations on the variant + Assert.That(rna[0].SequenceVariations.Count(), Is.EqualTo(0)); + Assert.That(rna[0].AppliedSequenceVariations.Count(), Is.EqualTo(appliedCount)); + Assert.That(rna[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), Is.EqualTo(appliedCount)); + + // Optional: DB annotations still live on the consensus/reference + Assert.That(rna[0].ConsensusVariant.SequenceVariations.Count, Is.EqualTo(18)); + + Assert.That(rna[0].GetVariantBioPolymers().Count, Is.EqualTo(1)); + var variantProteins = rna[0].GetVariantBioPolymers(); + List peptides = rna.SelectMany(vp => vp.Digest(new RnaDigestionParams(), null, null)).ToList(); + } + + [Test] + public static void HomozygousVariantsAtDepth10() { + string filename = "HomozygousHLA.xml"; + int minVariantDepth = 10; + int appliedCount = 17; + string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", filename); var rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.None, false, AllKnownMods, [], out var unknownModifications, minAlleleDepth: minVariantDepth); + Assert.That(rna.Count, Is.EqualTo(1)); - Assert.That(rna[0].SequenceVariations.Count(), Is.EqualTo(18)); // some redundant - Assert.That(rna[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), Is.EqualTo(18)); // unique changes - Assert.That(rna[0].AppliedSequenceVariations.Count(), Is.EqualTo(appliedCount)); // some redundant - Assert.That(rna[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), Is.EqualTo(appliedCount)); // unique changes + + // 17 applied => 1 unapplied remains in SequenceVariations on the variant + Assert.That(rna[0].SequenceVariations.Count(), Is.EqualTo(1)); + Assert.That(rna[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), Is.EqualTo(1)); + Assert.That(rna[0].AppliedSequenceVariations.Count(), Is.EqualTo(appliedCount)); + Assert.That(rna[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), Is.EqualTo(appliedCount)); + + // Optional: consensus still has all DB annotations + Assert.That(rna[0].ConsensusVariant.SequenceVariations.Count, Is.EqualTo(18)); + Assert.That(rna[0].GetVariantBioPolymers().Count, Is.EqualTo(1)); var variantProteins = rna[0].GetVariantBioPolymers(); List peptides = rna.SelectMany(vp => vp.Digest(new RnaDigestionParams(), null, null)).ToList(); From 7add84e7f141bdb486506cd20eaabdecf9077761 Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 28 Oct 2025 10:40:20 -0500 Subject: [PATCH 06/16] yay --- mzLib/Test/Transcriptomics/TestVariantOligo.cs | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mzLib/Test/Transcriptomics/TestVariantOligo.cs b/mzLib/Test/Transcriptomics/TestVariantOligo.cs index 214516887..9d876d710 100644 --- a/mzLib/Test/Transcriptomics/TestVariantOligo.cs +++ b/mzLib/Test/Transcriptomics/TestVariantOligo.cs @@ -494,13 +494,17 @@ public void VariantModificationTest() } } - [Test] public void TwoTruncationsAndSequenceVariant_DbLoading() { string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", "TruncationAndVariantMods.xml"); List rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.Reverse, false, AllKnownMods, [], out var unknownModifications); - Assert.That(rna.All(p => p.SequenceVariations.Count == 1)); + // With new behavior: + // - RNAs with applied variants should have 0 unapplied SequenceVariations + // - RNAs without applied variants should retain the 1 DB SequenceVariation + Assert.That(rna.Where(p => p.AppliedSequenceVariations.Count > 0).All(p => p.SequenceVariations.Count == 0)); + Assert.That(rna.Where(p => p.AppliedSequenceVariations.Count == 0).All(p => p.SequenceVariations.Count == 1)); + Assert.That(rna.All(p => p.OriginalNonVariantModifications.Count == 2)); List targets = rna.Where(p => p.IsDecoy == false).ToList(); @@ -533,7 +537,10 @@ public void TwoTruncationsAndSequenceVariant_Digestion(string testCase, string b List rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.Reverse, false, AllKnownMods, [], out var unknownModifications); RnaDigestionParams digestionParams = new RnaDigestionParams("RNase T1", missedCleavages, 2); - Assert.That(rna.All(p => p.SequenceVariations.Count == 1)); + // New behavior consistent with DbLoading test above + Assert.That(rna.Where(p => p.AppliedSequenceVariations.Count > 0).All(p => p.SequenceVariations.Count == 0)); + Assert.That(rna.Where(p => p.AppliedSequenceVariations.Count == 0).All(p => p.SequenceVariations.Count == 1)); + Assert.That(rna.All(p => p.OriginalNonVariantModifications.Count == 2)); Assert.That(rna.All(p => p.TruncationProducts.Count == 2)); From d9b82b6cda556eef20a88af197970f837772f945 Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 28 Oct 2025 10:53:18 -0500 Subject: [PATCH 07/16] Update mzLib/Test/DatabaseTests/TestVariantProtein.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../Test/DatabaseTests/TestVariantProtein.cs | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 06a909bb1..99cd4fca6 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -616,18 +616,18 @@ public void SequenceVariationIsValidTest() Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid()); //This is valid because it is an insertion Assert.IsTrue(svValidVariantSequenceLenthIsZero.AreValid()); // This is valid because it is a deletion } - // - // In this unit test, you will not get the base protein because the variant in your VCF is homozygous ALT (GT=1/1) with sufficient depth, - // and the loader always expands VCF-only entries via ApplyVariants. The maxHeterozygousVariants parameter only limits heterozygous combinatorics; it does not prevent applying homozygous variants. - // What happens in your path - // • LoadProteinXML always expands proteins by calling GetVariantBioPolymers(maxHeterozygousVariants, minAlleleDepth). - // • If a protein has only VCF-style sequence variations, GetVariantBioPolymers routes to ApplyVariants. - // • In ApplyVariants, a homozygous ALT with depth ≥ minAlleleDepth results in replacing the base with the variant: - // • isHomozygousAlternate == true and isDeepAlternateAllele == true → newVariantProteins = ApplySingleVariant(...), i.e., base is not retained. - // • maxHeterozygousVariants only affects heterozygous logic.It does nothing for homozygous ALT variants. - // Why your specific test applies the variant Your vcfstring is GT= 1 / 1 with AD = 0,15 (ALT depth 15). With the default minAlleleDepth=1 (you didn’t override it), - // ApplyVariants treats this as homozygous ALT with sufficient depth and applies it, returning only the variant protein. - // + /// + /// In this unit test, you will not get the base protein because the variant in your VCF is homozygous ALT (GT=1/1) with sufficient depth, + /// and the loader always expands VCF-only entries via ApplyVariants. The maxHeterozygousVariants parameter only limits heterozygous combinatorics; it does not prevent applying homozygous variants. + /// What happens in your path + /// • LoadProteinXML always expands proteins by calling GetVariantBioPolymers(maxHeterozygousVariants, minAlleleDepth). + /// • If a protein has only VCF-style sequence variations, GetVariantBioPolymers routes to ApplyVariants. + /// • In ApplyVariants, a homozygous ALT with depth ≥ minAlleleDepth results in replacing the base with the variant: + /// • isHomozygousAlternate == true and isDeepAlternateAllele == true → newVariantProteins = ApplySingleVariant(...), i.e., base is not retained. + /// • maxHeterozygousVariants only affects heterozygous logic.It does nothing for homozygous ALT variants. + /// Why your specific test applies the variant Your vcfstring is GT= 1 / 1 with AD = 0,15 (ALT depth 15). With the default minAlleleDepth=1 (you didn’t override it), + /// ApplyVariants treats this as homozygous ALT with sufficient depth and applies it, returning only the variant protein. + /// [Test] public void VcfProteinWithModOnDeepHomozygousVariant() { From 8c67a7ce440bd2939df3d984a22be07bccb3fb32 Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 28 Oct 2025 10:53:56 -0500 Subject: [PATCH 08/16] Update mzLib/Test/Transcriptomics/TestVariantOligo.cs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- mzLib/Test/Transcriptomics/TestVariantOligo.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mzLib/Test/Transcriptomics/TestVariantOligo.cs b/mzLib/Test/Transcriptomics/TestVariantOligo.cs index 9d876d710..1e55b2eed 100644 --- a/mzLib/Test/Transcriptomics/TestVariantOligo.cs +++ b/mzLib/Test/Transcriptomics/TestVariantOligo.cs @@ -48,7 +48,7 @@ public void VariantXml() minAlleleDepth: 1000); Assert.That(1, Is.EqualTo(variantProteins.Count)); - Assert.That(0, Is.EqualTo(variantProteins.Count(a=>a.AppliedSequenceVariations.Count > 0))); + Assert.That(0, Is.EqualTo(variantProteins.Count(a => a.AppliedSequenceVariations.Count > 0))); variantProteins = RnaDbLoader.LoadRnaXML(file, true, DecoyType.None, false, AllKnownMods, [], out _, maxHeterozygousVariants: 10, From 572adaa2728d82e739713152ebc5b2ee91036f19 Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 28 Oct 2025 12:11:16 -0500 Subject: [PATCH 09/16] checks on the consensus --- mzLib/Test/DatabaseTests/TestVariantProtein.cs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 06a909bb1..58024d62c 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -735,6 +735,16 @@ public void VcfProteinWithModOnDeepHomozygousVariant() // Why: AppliedSequenceVariations.First().OneBasedModifications is a dictionary of position -> list of modifications on the variant region. // In this case there is one key (position) with two modifications; summing the list counts across all keys yields 2. Assert.AreEqual(2, proteinsNoDecoys.First().AppliedSequenceVariations.First().OneBasedModifications.Values.Sum(v => v.Count)); + + // Additional checks on the ConsensusVariant for completeness + var consensusProtein = proteinsNoDecoys.First().ConsensusVariant; + var consensusSequenceVariants = consensusProtein.SequenceVariations; + Assert.That(1, Is.EqualTo(consensusSequenceVariants.Count)); + Assert.That(211, Is.EqualTo(consensusSequenceVariants.First().OneBasedBeginPosition)); + var consensusAppliedVariants = consensusProtein.AppliedSequenceVariations; + Assert.That(0, Is.EqualTo(consensusAppliedVariants.Count)); + var consensusModifications = consensusProtein.OneBasedPossibleLocalizedModifications; + Assert.That(6, Is.EqualTo(consensusModifications.Count)); } [Test] From be0d4833e354fc5b2f8cc1869209b458731cc39b Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 28 Oct 2025 12:51:08 -0500 Subject: [PATCH 10/16] what happens when you add mods to proteins with variants --- .../Test/DatabaseTests/TestVariantProtein.cs | 312 ++++++++++++++++++ 1 file changed, 312 insertions(+) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 58024d62c..2d3be1691 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -940,5 +940,317 @@ public void Constructor_ParsesDescriptionCorrectly() var adValues = svd.AlleleDepths[adKey]; Assert.AreEqual(new[] { "30", "30" }, adValues); } + [Test] + public static void TestProteinVariantsRoundTrip() + { + string databaseName = "humanGAPDH.xml"; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, + DecoyType.None, null, false, null, out var unknownModifications, 1, 99); + Assert.AreEqual(4, proteins.Count); // 4 target + var consensusProtein = proteins[0]; + Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var A22G_Protein = proteins[1]; + Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_Protein = proteins[2]; + Assert.That(1, Is.EqualTo(K251N_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(K251N_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_A22G_Protein = proteins[3]; + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.SequenceVariations.Count)); + Assert.That(2, Is.EqualTo(K251N_A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Write the non-decoy proteins to a temporary XML database + string tempDir = Path.Combine(Path.GetTempPath(), $"mzLib_roundtrip_{Guid.NewGuid():N}"); + Directory.CreateDirectory(tempDir); + string tempFile = Path.Combine(tempDir, "roundtrip_proteins.xml"); + + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), // no fixed mods to embed + proteins, + tempFile); + + // Count ProteinXmlEntry nodes directly from the XML (namespace- and case-insensitive) + var xdoc = System.Xml.Linq.XDocument.Load(tempFile); + int proteinXmlEntryCount = xdoc + .Descendants() + .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); + + // Expect the XML to contain one entry per written protein + // Mysteriously there is only one written protein..... + Assert.That(proteinXmlEntryCount, Is.EqualTo(1)); + + // Read the database back in + var roundTripped = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + out var unknownMods2, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + try + { + // Basic shape checks + Assert.That(roundTripped.Count, Is.EqualTo(proteins.Count)); + + // Ensure accessions match 1:1 + var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); + var rtAccs = roundTripped.Select(p => p.Accession).OrderBy(a => a).ToList(); + Assert.That(rtAccs, Is.EqualTo(originalAccs)); + + // Validate per-proteoform annotations survived round-trip + foreach (var rt in roundTripped) + { + var orig = proteins.Single(p => p.Accession == rt.Accession); + + Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); + Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(orig.OneBasedPossibleLocalizedModifications.Keys.Count), $"Mods count mismatch for {rt.Accession}"); + + // Spot-check variant identity if applied variants exist + if (orig.AppliedSequenceVariations.Count > 0) + { + var origSimple = orig.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + var rtSimple = rt.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + Assert.That(rtSimple, Is.EqualTo(origSimple), $"Applied variant set mismatch for {rt.Accession}"); + } + } + } + finally + { + // Cleanup + if (Directory.Exists(tempDir)) + { + try { Directory.Delete(tempDir, recursive: true); } catch { /* ignore */ } + } + } + + } + [Test] + public static void TestProteinVariantsAddModsRoundTrip() + { + ModificationMotif.TryGetMotif("M", out ModificationMotif motifM); + Modification mm = new Modification("mod", null, "type", null, motifM, "Anywhere.", null, 16, new Dictionary>(), null, null, null, null, null); + + + + + string databaseName = "humanGAPDH.xml"; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, + DecoyType.None, null, false, null, out var unknownModifications, 1, 99); + Assert.AreEqual(4, proteins.Count); // 4 target + var consensusProtein = proteins[0]; + Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + consensusProtein.OneBasedPossibleLocalizedModifications.Add(1, new List(){mm}); + Assert.That(1, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + + var A22G_Protein = proteins[1]; + Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_Protein = proteins[2]; + Assert.That(1, Is.EqualTo(K251N_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(K251N_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_A22G_Protein = proteins[3]; + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.SequenceVariations.Count)); + Assert.That(2, Is.EqualTo(K251N_A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Write the non-decoy proteins to a temporary XML database + string tempDir = Path.Combine(Path.GetTempPath(), $"mzLib_roundtrip_{Guid.NewGuid():N}"); + Directory.CreateDirectory(tempDir); + string tempFile = Path.Combine(tempDir, "roundtrip_proteins.xml"); + + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), // no fixed mods to embed + proteins, + tempFile); + + // Count ProteinXmlEntry nodes directly from the XML (namespace- and case-insensitive) + var xdoc = System.Xml.Linq.XDocument.Load(tempFile); + int proteinXmlEntryCount = xdoc + .Descendants() + .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); + + // Expect the XML to contain one entry per written protein + // Mysteriously there is only one written protein..... + Assert.That(proteinXmlEntryCount, Is.EqualTo(1)); + + // Read the database back in + var roundTripped = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + out var unknownMods2, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + try + { + // Basic shape checks + Assert.That(roundTripped.Count, Is.EqualTo(proteins.Count)); + + // Ensure accessions match 1:1 + var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); + var rtAccs = roundTripped.Select(p => p.Accession).OrderBy(a => a).ToList(); + Assert.That(rtAccs, Is.EqualTo(originalAccs)); + + // Validate per-proteoform annotations survived round-trip + foreach (var rt in roundTripped) + { + var orig = proteins.Single(p => p.Accession == rt.Accession); + + Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); + Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(orig.OneBasedPossibleLocalizedModifications.Keys.Count), $"Mods count mismatch for {rt.Accession}"); + + // Spot-check variant identity if applied variants exist + if (orig.AppliedSequenceVariations.Count > 0) + { + var origSimple = orig.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + var rtSimple = rt.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + Assert.That(rtSimple, Is.EqualTo(origSimple), $"Applied variant set mismatch for {rt.Accession}"); + } + } + } + finally + { + // Cleanup + if (Directory.Exists(tempDir)) + { + try { Directory.Delete(tempDir, recursive: true); } catch { /* ignore */ } + } + } + + } + [Test] + public static void TestProteinVariantsAddModsToVariantRoundTrip() + { + ModificationMotif.TryGetMotif("G", out ModificationMotif motifM); + Modification mg = new Modification("mod", null, "type", null, motifM, "Anywhere.", null, 16, new Dictionary>(), null, null, null, null, null); + + + + + string databaseName = "humanGAPDH.xml"; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, + DecoyType.None, null, false, null, out var unknownModifications, 1, 99); + Assert.AreEqual(4, proteins.Count); // 4 target + var consensusProtein = proteins[0]; + Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + + + var A22G_Protein = proteins[1]; + Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + A22G_Protein.OneBasedPossibleLocalizedModifications.Add(22, new List() { mg }); + Assert.That(1, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + + var K251N_Protein = proteins[2]; + Assert.That(1, Is.EqualTo(K251N_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(K251N_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_A22G_Protein = proteins[3]; + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.SequenceVariations.Count)); + Assert.That(2, Is.EqualTo(K251N_A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Write the non-decoy proteins to a temporary XML database + string tempDir = Path.Combine(Path.GetTempPath(), $"mzLib_roundtrip_{Guid.NewGuid():N}"); + Directory.CreateDirectory(tempDir); + string tempFile = Path.Combine(tempDir, "roundtrip_proteins.xml"); + + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), // no fixed mods to embed + proteins, + tempFile); + + // Count ProteinXmlEntry nodes directly from the XML (namespace- and case-insensitive) + var xdoc = System.Xml.Linq.XDocument.Load(tempFile); + int proteinXmlEntryCount = xdoc + .Descendants() + .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); + + // Expect the XML to contain one entry per written protein + // Mysteriously there is only one written protein..... + Assert.That(proteinXmlEntryCount, Is.EqualTo(1)); + + // Read the database back in + var roundTripped = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + out var unknownMods2, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + try + { + // Basic shape checks + Assert.That(roundTripped.Count, Is.EqualTo(proteins.Count)); + + // Ensure accessions match 1:1 + var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); + var rtAccs = roundTripped.Select(p => p.Accession).OrderBy(a => a).ToList(); + Assert.That(rtAccs, Is.EqualTo(originalAccs)); + + // Validate per-proteoform annotations survived round-trip + foreach (var rt in roundTripped) + { + var orig = proteins.Single(p => p.Accession == rt.Accession); + + Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); + Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(orig.OneBasedPossibleLocalizedModifications.Keys.Count), $"Mods count mismatch for {rt.Accession}"); + + // Spot-check variant identity if applied variants exist + if (orig.AppliedSequenceVariations.Count > 0) + { + var origSimple = orig.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + var rtSimple = rt.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + Assert.That(rtSimple, Is.EqualTo(origSimple), $"Applied variant set mismatch for {rt.Accession}"); + } + } + } + finally + { + // Cleanup + if (Directory.Exists(tempDir)) + { + try { Directory.Delete(tempDir, recursive: true); } catch { /* ignore */ } + } + } + + } } } \ No newline at end of file From bd88b747f50e0a6891b5caa8aef8c7868171644d Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 28 Oct 2025 13:09:53 -0500 Subject: [PATCH 11/16] create deletion variant --- .../Test/DatabaseTests/TestVariantProtein.cs | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index a65cbd116..87a7e7bf7 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -1162,6 +1162,119 @@ public static void TestProteinVariantsAddModsToVariantRoundTrip() + var A22G_Protein = proteins[1]; + Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + A22G_Protein.OneBasedPossibleLocalizedModifications.Add(22, new List() { mg }); + Assert.That(1, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + + var K251N_Protein = proteins[2]; + Assert.That(1, Is.EqualTo(K251N_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(K251N_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_A22G_Protein = proteins[3]; + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.SequenceVariations.Count)); + Assert.That(2, Is.EqualTo(K251N_A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Write the non-decoy proteins to a temporary XML database + string tempDir = Path.Combine(Path.GetTempPath(), $"mzLib_roundtrip_{Guid.NewGuid():N}"); + Directory.CreateDirectory(tempDir); + string tempFile = Path.Combine(tempDir, "roundtrip_proteins.xml"); + + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), // no fixed mods to embed + proteins, + tempFile); + + // Count ProteinXmlEntry nodes directly from the XML (namespace- and case-insensitive) + var xdoc = System.Xml.Linq.XDocument.Load(tempFile); + int proteinXmlEntryCount = xdoc + .Descendants() + .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); + + // Expect the XML to contain one entry per written protein + // Mysteriously there is only one written protein..... + Assert.That(proteinXmlEntryCount, Is.EqualTo(1)); + + // Read the database back in + var roundTripped = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + out var unknownMods2, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + try + { + // Basic shape checks + Assert.That(roundTripped.Count, Is.EqualTo(proteins.Count)); + + // Ensure accessions match 1:1 + var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); + var rtAccs = roundTripped.Select(p => p.Accession).OrderBy(a => a).ToList(); + Assert.That(rtAccs, Is.EqualTo(originalAccs)); + + // Validate per-proteoform annotations survived round-trip + foreach (var rt in roundTripped) + { + var orig = proteins.Single(p => p.Accession == rt.Accession); + + Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); + Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(orig.OneBasedPossibleLocalizedModifications.Keys.Count), $"Mods count mismatch for {rt.Accession}"); + + // Spot-check variant identity if applied variants exist + if (orig.AppliedSequenceVariations.Count > 0) + { + var origSimple = orig.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + var rtSimple = rt.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + Assert.That(rtSimple, Is.EqualTo(origSimple), $"Applied variant set mismatch for {rt.Accession}"); + } + } + } + finally + { + // Cleanup + if (Directory.Exists(tempDir)) + { + try { Directory.Delete(tempDir, recursive: true); } catch { /* ignore */ } + } + } + + } + [Test] + public static void TestProteinVariantsAddModsToDeletionVariantRoundTrip() + { + ModificationMotif.TryGetMotif("G", out ModificationMotif motifM); + Modification mg = new Modification("mod", null, "type", null, motifM, "Anywhere.", null, 16, new Dictionary>(), null, null, null, null, null); + + + + + string databaseName = "humanGAPDH.xml"; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, + DecoyType.None, null, false, null, out var unknownModifications, 1, 99); + Assert.AreEqual(4, proteins.Count); // 4 target + var consensusProtein = proteins[0]; + Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + SequenceVariation del = new SequenceVariation(1,1,"M","", "delete M",null); + consensusProtein.SequenceVariations.Add(del); + Assert.That(3, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + + var A22G_Protein = proteins[1]; Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); From 73b41f2bf0bdca2774bfc6a59924394130f51f65 Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 28 Oct 2025 15:00:53 -0500 Subject: [PATCH 12/16] read write read one variant protein current behavior --- mzLib/Test/DatabaseTests/TestVariantProtein.cs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 87a7e7bf7..43de5bb0a 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -940,6 +940,15 @@ public void Constructor_ParsesDescriptionCorrectly() var adValues = svd.AlleleDepths[adKey]; Assert.AreEqual(new[] { "30", "30" }, adValues); } + /// + /// The humanGAPDH.xml test file contains one protein with two separate amino acid substitution variants. + /// When loaded with ProteinDbLoader.LoadProteinXML with variant expansion enabled, + /// the resulting protein entries reflect these variants appropriately. + /// There are four total proteins: the consensus (no variants), each single variant applied separately, and both variants applied. + /// Writing this list of four proteins back to XML results in an XML file that contains one Protein entry that is exactly the same as the original. + /// There are no entries for the variant proteins because the writer does not currently serialize applied variants as separate proteins. + /// Loading in this XML again results in the same four proteins as the original load. One consensus and three variant-applied proteoforms. + /// [Test] public static void TestProteinVariantsRoundTrip() { From 311f9b65a99ef9e1a127c896009629589382c828 Mon Sep 17 00:00:00 2001 From: trishorts Date: Tue, 28 Oct 2025 15:09:53 -0500 Subject: [PATCH 13/16] test that adds mod to a shared amino acid on a variant protein fails to write that mod to the database --- .../Test/DatabaseTests/TestVariantProtein.cs | 138 +++++++++++++++++- 1 file changed, 135 insertions(+), 3 deletions(-) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 43de5bb0a..2efe03712 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -1045,6 +1045,16 @@ public static void TestProteinVariantsRoundTrip() } } + /// + /// The humanGAPDH.xml test file contains one protein with two separate amino acid substitution variants. + /// When loaded with ProteinDbLoader.LoadProteinXML with variant expansion enabled, + /// the resulting protein entries reflect these variants appropriately. + /// There are four total proteins: the consensus (no variants), each single variant applied separately, and both variants applied. + /// In this test we add a modification to the consensus protein to ensure that modifications survive the round trip as well. + /// Writing this list of four proteins back to XML results in an XML file that contains one Protein entry that is exactly the same as the original. + /// There are no entries for the variant proteins because the writer does not currently serialize applied variants as separate proteins. + /// Loading in this XML again results in the same four proteins as the original load. One consensus and three variant-applied proteoforms. + /// [Test] public static void TestProteinVariantsAddModsRoundTrip() { @@ -1052,8 +1062,6 @@ public static void TestProteinVariantsAddModsRoundTrip() Modification mm = new Modification("mod", null, "type", null, motifM, "Anywhere.", null, 16, new Dictionary>(), null, null, null, null, null); - - string databaseName = "humanGAPDH.xml"; var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, DecoyType.None, null, false, null, out var unknownModifications, 1, 99); @@ -1062,6 +1070,8 @@ public static void TestProteinVariantsAddModsRoundTrip() Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Add a modification to the consensus protein to test that mods survive the round trip consensusProtein.OneBasedPossibleLocalizedModifications.Add(1, new List(){mm}); Assert.That(1, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); @@ -1130,7 +1140,128 @@ public static void TestProteinVariantsAddModsRoundTrip() Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); - Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(orig.OneBasedPossibleLocalizedModifications.Keys.Count), $"Mods count mismatch for {rt.Accession}"); + + // We added a mod to the consensus protein in a position that will be contained in all variant proteins as well + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(1), $"Mods count mismatch for {rt.Accession}"); + + // Spot-check variant identity if applied variants exist + if (orig.AppliedSequenceVariations.Count > 0) + { + var origSimple = orig.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + var rtSimple = rt.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + Assert.That(rtSimple, Is.EqualTo(origSimple), $"Applied variant set mismatch for {rt.Accession}"); + } + } + } + finally + { + // Cleanup + if (Directory.Exists(tempDir)) + { + try { Directory.Delete(tempDir, recursive: true); } catch { /* ignore */ } + } + } + + } + /// + /// The humanGAPDH.xml test file contains one protein with two separate amino acid substitution variants. + /// When loaded with ProteinDbLoader.LoadProteinXML with variant expansion enabled, + /// the resulting protein entries reflect these variants appropriately. + /// There are four total proteins: the consensus (no variants), each single variant applied separately, and both variants applied. + /// In this test we add a modification to a variant protein but on a position shared by all proteins + /// Writing this list of four proteins back to XML results in an XML file that contains one Protein entry that is exactly the same as the original. + /// There are no entries for the variant proteins because the writer does not currently serialize applied variants as separate proteins. + /// Loading in this XML again results in the same four proteins as the original load. One consensus and three variant-applied proteoforms. + /// + [Test] + public static void TestProteinVariantsAddModsToVariantOnSharedAminoAcidRoundTrip() + { + ModificationMotif.TryGetMotif("M", out ModificationMotif motifM); + Modification mm = new Modification("mod", null, "type", null, motifM, "Anywhere.", null, 16, new Dictionary>(), null, null, null, null, null); + + + string databaseName = "humanGAPDH.xml"; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, + DecoyType.None, null, false, null, out var unknownModifications, 1, 99); + Assert.AreEqual(4, proteins.Count); // 4 target + var consensusProtein = proteins[0]; + Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + + var A22G_Protein = proteins[1]; + Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Add a modification to the consensus protein to test that mods survive the round trip + A22G_Protein.OneBasedPossibleLocalizedModifications.Add(1, new List() { mm }); + Assert.That(1, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + var K251N_Protein = proteins[2]; + Assert.That(1, Is.EqualTo(K251N_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(K251N_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_A22G_Protein = proteins[3]; + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.SequenceVariations.Count)); + Assert.That(2, Is.EqualTo(K251N_A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Write the non-decoy proteins to a temporary XML database + string tempDir = Path.Combine(Path.GetTempPath(), $"mzLib_roundtrip_{Guid.NewGuid():N}"); + Directory.CreateDirectory(tempDir); + string tempFile = Path.Combine(tempDir, "roundtrip_proteins.xml"); + + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), // no fixed mods to embed + proteins, + tempFile); + + // Count ProteinXmlEntry nodes directly from the XML (namespace- and case-insensitive) + var xdoc = System.Xml.Linq.XDocument.Load(tempFile); + int proteinXmlEntryCount = xdoc + .Descendants() + .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); + + // Expect the XML to contain one entry per written protein + // Mysteriously there is only one written protein..... + Assert.That(proteinXmlEntryCount, Is.EqualTo(1)); + + // Read the database back in + var roundTripped = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + out var unknownMods2, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + try + { + // Basic shape checks + Assert.That(roundTripped.Count, Is.EqualTo(proteins.Count)); + + // Ensure accessions match 1:1 + var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); + var rtAccs = roundTripped.Select(p => p.Accession).OrderBy(a => a).ToList(); + Assert.That(rtAccs, Is.EqualTo(originalAccs)); + + // Validate per-proteoform annotations survived round-trip + foreach (var rt in roundTripped) + { + var orig = proteins.Single(p => p.Accession == rt.Accession); + + Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); + Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); + + // We added a mod to the consensus protein in a position that will be contained in all variant proteins as well + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(1), $"Mods count mismatch for {rt.Accession}"); // Spot-check variant identity if applied variants exist if (orig.AppliedSequenceVariations.Count > 0) @@ -1151,6 +1282,7 @@ public static void TestProteinVariantsAddModsRoundTrip() } } + [Test] public static void TestProteinVariantsAddModsToVariantRoundTrip() { From 3ec9c40a8cb7ac848c0248d088832a74a25f45d3 Mon Sep 17 00:00:00 2001 From: trishorts Date: Wed, 29 Oct 2025 08:02:21 -0500 Subject: [PATCH 14/16] fix bug in protein xml writer that only wrote consensus --- .../Test/DatabaseTests/TestVariantProtein.cs | 10 +++++++--- .../ProteinDbWriter.cs | 20 +++++++++++++------ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 2efe03712..84e304160 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -988,11 +988,13 @@ public static void TestProteinVariantsRoundTrip() int proteinXmlEntryCount = xdoc .Descendants() .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + int countOfConsensusProteins = proteins.Select(p => p.ConsensusVariant).Distinct().Count(); + int countOfProteinsWithAppliedVariants = proteins.Where(p => p.AppliedSequenceVariations.Count > 0).Distinct().Count(); TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); // Expect the XML to contain one entry per written protein // Mysteriously there is only one written protein..... - Assert.That(proteinXmlEntryCount, Is.EqualTo(1)); + Assert.That(proteinXmlEntryCount, Is.EqualTo(countOfConsensusProteins + countOfProteinsWithAppliedVariants)); // Read the database back in var roundTripped = ProteinDbLoader.LoadProteinXML( @@ -1008,8 +1010,10 @@ public static void TestProteinVariantsRoundTrip() try { - // Basic shape checks - Assert.That(roundTripped.Count, Is.EqualTo(proteins.Count)); + // Even though we are applying variants all over again, we should end up with the same total number of proteins + // This is because the variants generated by the consensus protein with sequence variations produce the same set of proteins + // as those with applied variants. In other words, the duplicates get collapsed. + Assert.That(roundTripped.Count, Is.EqualTo(countOfConsensusProteins + countOfProteinsWithAppliedVariants)); // Ensure accessions match 1:1 var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 24ff44dd4..9a0155b54 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -297,8 +297,16 @@ public static Dictionary WriteXmlDatabase(Dictionary>>(); - // write nonvariant proteins (for cases where variants aren't applied, this just gets the rna itself) - var nonVariantProteins = proteinList.Select(p => p.ConsensusVariant).Distinct().ToList(); + // create the list of distinct consensus proteins. + var proteinsToWrite = proteinList + .Select(p => p.ConsensusVariant) + .Distinct() + .ToList(); + // add any proteins with applied variants + proteinsToWrite = proteinsToWrite + .Concat(proteinList.Where(p => p.AppliedSequenceVariations.Count > 0)) + .Distinct() + .ToList(); var xmlWriterSettings = new XmlWriterSettings { @@ -314,7 +322,7 @@ public static Dictionary WriteXmlDatabase(Dictionary myModificationList = new List(); - foreach (Protein p in nonVariantProteins) + foreach (Protein p in proteinsToWrite) { foreach (KeyValuePair> entry in p.OneBasedPossibleLocalizedModifications) { @@ -323,13 +331,13 @@ public static Dictionary WriteXmlDatabase(Dictionary allRelevantModifications = new HashSet( - nonVariantProteins + proteinsToWrite .SelectMany(p => p.SequenceVariations .SelectMany(sv => sv.OneBasedModifications) .Concat(p.OneBasedPossibleLocalizedModifications) .SelectMany(kv => kv.Value)) .Concat(additionalModsToAddToProteins - .Where(kv => nonVariantProteins + .Where(kv => proteinsToWrite .SelectMany(p => p.SequenceVariations .Select(sv => VariantApplication.GetAccession(p, new[] { sv })).Concat(new[] { p.Accession })) .Contains(kv.Key)) @@ -342,7 +350,7 @@ public static Dictionary WriteXmlDatabase(Dictionary Date: Wed, 29 Oct 2025 08:23:57 -0500 Subject: [PATCH 15/16] k --- mzLib/Omics/BioPolymer/VariantApplication.cs | 31 ++++++++++++++------ 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 1a78a5873..8a3a667da 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -94,7 +94,7 @@ public static int RestoreModificationIndex(IHasSequenceVariants protein, int var /// /// Applies multiple variant changes to a protein sequence /// - public static List ApplyVariants(TBioPolymerType protein, IEnumerable sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth) + public static List ApplyVariants(TBioPolymerType protein, IEnumerable sequenceVariations, int maxAllowedVariantsForCombinatorics, int minAlleleDepth) where TBioPolymerType : IHasSequenceVariants { List uniqueEffectsToApply = sequenceVariations @@ -131,7 +131,7 @@ public static List ApplyVariants(TBioPolymerTy newVariantProteins.Clear(); newVariantProteins.Add(proteinCopy); - bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; + bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinatorics; foreach (var variant in uniqueEffectsToApply) { bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff @@ -155,12 +155,12 @@ public static List ApplyVariants(TBioPolymerTy { if (isDeepAlternateAllele && isDeepReferenceAllele) { - if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinitorics > 0) + if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinatorics > 0) { TBioPolymerType variantProtein = ApplySingleVariant(variant, newVariantProteins[0], individual); newVariantProteins.Add(variantProtein); } - else if (maxAllowedVariantsForCombinitorics > 0) + else if (maxAllowedVariantsForCombinatorics > 0) { newVariantProteins[1] = ApplySingleVariant(variant, newVariantProteins[1], individual); } @@ -169,7 +169,7 @@ public static List ApplyVariants(TBioPolymerTy // no heterozygous variants } } - else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) + else if (isDeepAlternateAllele && maxAllowedVariantsForCombinatorics > 0) { newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList(); } @@ -186,7 +186,7 @@ public static List ApplyVariants(TBioPolymerTy foreach (var ppp in newVariantProteins) { - if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele) + if (isDeepAlternateAllele && maxAllowedVariantsForCombinatorics > 0 && isDeepReferenceAllele) { // keep reference allele if (variant.Description.Genotypes[individual].Contains("0")) @@ -197,7 +197,7 @@ public static List ApplyVariants(TBioPolymerTy // alternate allele (replace all, since in heterozygous with two alternates, both alternates are included) combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); } - else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) + else if (isDeepAlternateAllele && maxAllowedVariantsForCombinatorics > 0) { combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); } @@ -430,12 +430,23 @@ private static Dictionary> AdjustModificationIndices(Seq return mods; } + /// + /// Provides a deterministic ordering for variant-based naming/IDs. + /// + private static IEnumerable OrderForNaming(IEnumerable variations) => + variations + .OrderBy(v => v.OneBasedBeginPosition) + .ThenBy(v => v.OneBasedEndPosition) + .ThenBy(v => v.SimpleString()); // tie-breaker for identical coordinates + /// /// Format string to append to accession /// private static string CombineSimpleStrings(IEnumerable? variations) { - return variations.IsNullOrEmpty() ? "" : string.Join("_", variations.Select(v => v.SimpleString())); + return variations.IsNullOrEmpty() + ? "" + : string.Join("_", OrderForNaming(variations!).Select(v => v.SimpleString())); } /// @@ -443,7 +454,9 @@ private static string CombineSimpleStrings(IEnumerable? varia /// public static string CombineDescriptions(IEnumerable? variations) { - return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.Description)); + return variations.IsNullOrEmpty() + ? "" + : string.Join(", variant:", OrderForNaming(variations!).Select(d => d.Description)); } /// /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object, From c50381508751fe15f79e1d31aa1bf738227a8883 Mon Sep 17 00:00:00 2001 From: trishorts Date: Wed, 29 Oct 2025 09:29:41 -0500 Subject: [PATCH 16/16] undo add sequence variants --- .../Omics/BioPolymer/IHasSequenceVariants.cs | 8 +- mzLib/Omics/BioPolymer/VariantApplication.cs | 25 ++++-- mzLib/Proteomics/Protein/Protein.cs | 81 +++++++------------ .../Test/DatabaseTests/TestVariantProtein.cs | 2 - .../Test/Transcriptomics/TestVariantOligo.cs | 1 - mzLib/Transcriptomics/NucleicAcid.cs | 10 +-- mzLib/Transcriptomics/RNA.cs | 64 +++------------ .../ProteinDbLoader.cs | 6 ++ 8 files changed, 73 insertions(+), 124 deletions(-) diff --git a/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs b/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs index 98f68131d..384d098ae 100644 --- a/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs +++ b/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs @@ -56,10 +56,10 @@ public interface IHasSequenceVariants TBioPolymerType CreateVariant( string variantBaseSequence, TBioPolymerType original, - IEnumerable? sequenceVariants, // NEW: pass DB variants explicitly - IEnumerable? appliedSequenceVariants, + IEnumerable appliedSequenceVariants, IEnumerable applicableProteolysisProducts, - IDictionary> oneBasedModifications, + IDictionary> oneBasedModifications, string sampleNameForVariants) - where TBioPolymerType : IHasSequenceVariants; + where TBioPolymerType : IHasSequenceVariants; } \ No newline at end of file diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 8a3a667da..b280f2662 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -1,4 +1,5 @@ -using MzLibUtil; +using System.Linq; +using MzLibUtil; using Omics.BioPolymer; using Omics.Modifications; @@ -101,7 +102,10 @@ public static List ApplyVariants(TBioPolymerTy .GroupBy(v => v.SimpleString()) .Select(x => x.First()) .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line - .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first + .OrderBy(v => v.OneBasedBeginPosition) + .ThenBy(v => v.OneBasedEndPosition) + .ThenBy(v => v.OriginalSequence ?? string.Empty) + .ThenBy(v => v.VariantSequence ?? string.Empty) // apply variants at the end of the protein sequence first .ToList(); // If there aren't any variants to apply, just return the original as-is @@ -114,7 +118,6 @@ public static List ApplyVariants(TBioPolymerTy TBioPolymerType proteinCopy = protein.CreateVariant( protein.BaseSequence, protein, - protein.SequenceVariations, // pass DB variants null, // none applied yet protein.TruncationProducts, protein.OneBasedPossibleLocalizedModifications, @@ -267,7 +270,6 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria return protein.CreateVariant( variantSequence, protein, - protein.SequenceVariations, // carry forward the current “unapplied” list adjustedAppliedVariations, adjustedProteolysisProducts, adjustedModifications, @@ -431,13 +433,14 @@ private static Dictionary> AdjustModificationIndices(Seq } /// - /// Provides a deterministic ordering for variant-based naming/IDs. + /// Deterministic ordering for variant-based naming/IDs to avoid order-of-application artifacts. /// private static IEnumerable OrderForNaming(IEnumerable variations) => variations .OrderBy(v => v.OneBasedBeginPosition) .ThenBy(v => v.OneBasedEndPosition) - .ThenBy(v => v.SimpleString()); // tie-breaker for identical coordinates + .ThenBy(v => v.OriginalSequence ?? string.Empty) + .ThenBy(v => v.VariantSequence ?? string.Empty); /// /// Format string to append to accession @@ -456,7 +459,7 @@ public static string CombineDescriptions(IEnumerable? variati { return variations.IsNullOrEmpty() ? "" - : string.Join(", variant:", OrderForNaming(variations!).Select(d => d.Description)); + : string.Join(", variant:", OrderForNaming(variations!).Select(v => v.Description)); } /// /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object, @@ -482,7 +485,13 @@ public static IEnumerable ApplyAllVariantCombinations= maxCombinations) yield break; - + // Sort variations to ensure deterministic combination order + variations = variations + .OrderBy(v => v.OneBasedBeginPosition) + .ThenBy(v => v.OneBasedEndPosition) + .ThenBy(v => v.OriginalSequence ?? string.Empty) + .ThenBy(v => v.VariantSequence ?? string.Empty) + .ToList(); int n = variations.Count; for (int size = 1; size <= n; size++) { diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs index 01cc6c40e..c9b0c8115 100644 --- a/mzLib/Proteomics/Protein/Protein.cs +++ b/mzLib/Proteomics/Protein/Protein.cs @@ -133,35 +133,35 @@ public Protein(Protein originalProtein, string newBaseSequence) /// /// /// - public Protein(string variantBaseSequence, - Protein protein, - IEnumerable? sequenceVariations, - IEnumerable? appliedSequenceVariations, - IEnumerable applicableProteolysisProducts, - IDictionary> oneBasedModifications, + public Protein( + string variantBaseSequence, + Protein protein, + IEnumerable appliedSequenceVariations, + IEnumerable applicableProteolysisProducts, + IDictionary> oneBasedModifications, string sampleNameForVariants) : this( - variantBaseSequence, - VariantApplication.GetAccession(protein, appliedSequenceVariations), - organism: protein.Organism, - geneNames: new List>(protein.GeneNames), - oneBasedModifications: oneBasedModifications != null ? oneBasedModifications.ToDictionary(x => x.Key, x => x.Value) : new Dictionary>(), - proteolysisProducts: new List(applicableProteolysisProducts ?? new List()), - name: VariantApplication.GetVariantName(protein.Name, appliedSequenceVariations), - fullName: VariantApplication.GetVariantName(protein.FullName, appliedSequenceVariations), - isDecoy: protein.IsDecoy, - isContaminant: protein.IsContaminant, - databaseReferences: new List(protein.DatabaseReferences), - sequenceVariations: new List(sequenceVariations ?? protein.SequenceVariations), - disulfideBonds: new List(protein.DisulfideBonds), - spliceSites: new List(protein.SpliceSites), - databaseFilePath: protein.DatabaseFilePath, - dataset: protein.DatasetEntryTag, - created: protein.CreatedEntryTag, - modified: protein.ModifiedEntryTag, - version: protein.VersionEntryTag, - xmlns: protein.XmlnsEntryTag, - uniProtSequenceAttributes: protein.UniProtSequenceAttributes) + variantBaseSequence, + VariantApplication.GetAccession(protein, appliedSequenceVariations), + organism: protein.Organism, + geneNames: new List>(protein.GeneNames), + oneBasedModifications: oneBasedModifications != null ? oneBasedModifications.ToDictionary(x => x.Key, x => x.Value) : new Dictionary>(), + proteolysisProducts: new List(applicableProteolysisProducts ?? new List()), + name: VariantApplication.GetVariantName(protein.Name, appliedSequenceVariations), + fullName: VariantApplication.GetVariantName(protein.FullName, appliedSequenceVariations), + isDecoy: protein.IsDecoy, + isContaminant: protein.IsContaminant, + databaseReferences: new List(protein.DatabaseReferences), + sequenceVariations: new List(protein.SequenceVariations), + disulfideBonds: new List(protein.DisulfideBonds), + spliceSites: new List(protein.SpliceSites), + databaseFilePath: protein.DatabaseFilePath, + dataset: protein.DatasetEntryTag, + created: protein.CreatedEntryTag, + modified: protein.ModifiedEntryTag, + version: protein.VersionEntryTag, + xmlns: protein.XmlnsEntryTag, + uniProtSequenceAttributes: protein.UniProtSequenceAttributes) { NonVariantProtein = protein.ConsensusVariant as Protein; OriginalNonVariantModifications = ConsensusVariant.OriginalNonVariantModifications; @@ -601,34 +601,15 @@ private Protein GenerateFullyLabeledSilacProtein(SilacLabel label) /// public IDictionary> OriginalNonVariantModifications { get; set; } - public TBioPolymerType CreateVariant( - string variantBaseSequence, - TBioPolymerType original, - IEnumerable? sequenceVariants, - IEnumerable? appliedSequenceVariants, - IEnumerable applicableProteolysisProducts, - IDictionary> oneBasedModifications, - string sampleNameForVariants) + public TBioPolymerType CreateVariant(string variantBaseSequence, TBioPolymerType original, IEnumerable appliedSequenceVariants, + IEnumerable applicableProteolysisProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) where TBioPolymerType : IHasSequenceVariants { if (original is not Protein originalProtein) throw new ArgumentException("The original BioPolymer must be Protein to create a protein variant"); - var variantProtein = new Protein( - variantBaseSequence, - originalProtein, - sequenceVariants ?? originalProtein.SequenceVariations, - appliedSequenceVariants, - applicableProteolysisProducts, - oneBasedModifications, - sampleNameForVariants); - - // Remove any applied variants from the list of (unapplied) database sequence variations - if (appliedSequenceVariants != null && appliedSequenceVariants.Any()) - { - var appliedSimple = new HashSet(appliedSequenceVariants.Select(v => v.SimpleString())); - variantProtein.SequenceVariations.RemoveAll(sv => appliedSimple.Contains(sv.SimpleString())); - } + var variantProtein = new Protein(variantBaseSequence, originalProtein, appliedSequenceVariants, + applicableProteolysisProducts, oneBasedModifications, sampleNameForVariants); return (TBioPolymerType)(IHasSequenceVariants)variantProtein; } diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 84e304160..43a4e58e8 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -55,7 +55,6 @@ public static void VariantProtein() Protein v = new Protein( "MAVA", p, - p.SequenceVariations, // DB variants (empty in this test) new[] { new SequenceVariation(3, "A", "V", "desc", null) }, // applied null, null, @@ -434,7 +433,6 @@ public static void CrashOnCreateVariantFromRNA() proteins[0].CreateVariant( proteins[0].BaseSequence, rna, - [], // sequenceVariants [], // appliedSequenceVariants [], // applicableTruncationProducts new Dictionary>(), diff --git a/mzLib/Test/Transcriptomics/TestVariantOligo.cs b/mzLib/Test/Transcriptomics/TestVariantOligo.cs index 1e55b2eed..96486982c 100644 --- a/mzLib/Test/Transcriptomics/TestVariantOligo.cs +++ b/mzLib/Test/Transcriptomics/TestVariantOligo.cs @@ -362,7 +362,6 @@ public static void CrashOnCreateVariantFromProtein() rnas[0].CreateVariant( rnas[0].BaseSequence, protein, - [], // sequenceVariants [], // appliedSequenceVariants [], // applicableTruncationProducts new Dictionary>(), // mods diff --git a/mzLib/Transcriptomics/NucleicAcid.cs b/mzLib/Transcriptomics/NucleicAcid.cs index 12d046723..6f11d8de1 100644 --- a/mzLib/Transcriptomics/NucleicAcid.cs +++ b/mzLib/Transcriptomics/NucleicAcid.cs @@ -268,14 +268,8 @@ public IEnumerable Digest(RnaDigestionParams digestionParamete public IDictionary> OriginalNonVariantModifications { get; set; } // Abstract so we can do this construction in the appropriate derived class - public abstract TBioPolymerType CreateVariant( - string variantBaseSequence, - TBioPolymerType original, - IEnumerable? sequenceVariants, - IEnumerable? appliedSequenceVariants, - IEnumerable applicableProteolysisProducts, - IDictionary> oneBasedModifications, - string sampleNameForVariants) + public abstract TBioPolymerType CreateVariant(string variantBaseSequence, TBioPolymerType original, IEnumerable appliedSequenceVariants, + IEnumerable applicableProteolysisProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) where TBioPolymerType : IHasSequenceVariants; #endregion diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs index 7a5d9921a..e68bfe00b 100644 --- a/mzLib/Transcriptomics/RNA.cs +++ b/mzLib/Transcriptomics/RNA.cs @@ -5,6 +5,7 @@ using Omics; using Omics.BioPolymer; using Omics.Modifications; +using System.Security.Cryptography; namespace Transcriptomics { @@ -42,26 +43,16 @@ public RNA(string sequence, string accession, /// /// For creating a variant of an existing nucleic acid. Filters out modifications that do not match their nucleotide target site. /// - // NEW: canonical constructor that accepts sequenceVariants explicitly - public RNA(string variantBaseSequence, - NucleicAcid original, - IEnumerable? sequenceVariants, - IEnumerable? appliedSequenceVariants, - IEnumerable? applicableTruncationProducts, - IDictionary> oneBasedModifications, - string sampleNameForVariants) - : this(variantBaseSequence, - VariantApplication.GetAccession(original, appliedSequenceVariants), + public RNA(string variantBaseSequence, NucleicAcid original, IEnumerable? appliedSequenceVariants, + IEnumerable? applicableTruncationProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) + : this(variantBaseSequence, VariantApplication.GetAccession(original, appliedSequenceVariants), oneBasedModifications, original.FivePrimeTerminus, original.ThreePrimeTerminus, VariantApplication.GetVariantName(original.Name, appliedSequenceVariants), original.Organism, original.DatabaseFilePath, original.IsContaminant, original.IsDecoy, original.GeneNames, original.AdditionalDatabaseFields, - [.. (applicableTruncationProducts ?? new List())], - // COPY: do not alias the DB list - [.. (sequenceVariants ?? original.SequenceVariations ?? new List())], - [.. (appliedSequenceVariants ?? new List())], - sampleNameForVariants, + [.. applicableTruncationProducts ?? new List()], original.SequenceVariations, + [.. appliedSequenceVariants ?? new List()], sampleNameForVariants, VariantApplication.GetVariantName(original.FullName, appliedSequenceVariants)) { ConsensusVariant = original.ConsensusVariant; @@ -69,16 +60,7 @@ public RNA(string variantBaseSequence, AppliedSequenceVariations = (appliedSequenceVariants ?? new List()).ToList(); SampleNameForVariants = sampleNameForVariants; } - // Back-compat overload (kept for existing call sites) - public RNA(string variantBaseSequence, - NucleicAcid original, - IEnumerable? appliedSequenceVariants, - IEnumerable? applicableTruncationProducts, - IDictionary> oneBasedModifications, - string sampleNameForVariants) - : this(variantBaseSequence, original, original.SequenceVariations, appliedSequenceVariants, applicableTruncationProducts, oneBasedModifications, sampleNameForVariants) - { - } + public override IBioPolymer CloneWithNewSequenceAndMods(string newBaseSequence, IDictionary>? newMods = null) { // Create a new rna with the new base sequence and modifications @@ -86,35 +68,15 @@ public override IBioPolymer CloneWithNewSequenceAndMods(string newBaseSequence, return newRna; } - public override TBioPolymerType CreateVariant( - string variantBaseSequence, - TBioPolymerType original, - IEnumerable? sequenceVariants, - IEnumerable? appliedSequenceVariants, - IEnumerable applicableTruncationProducts, - IDictionary> oneBasedModifications, - string sampleNameForVariants) + public override TBioPolymerType CreateVariant(string variantBaseSequence, TBioPolymerType original, IEnumerable appliedSequenceVariants, + IEnumerable applicableTruncationProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) { - if (original is not RNA originalRna) + if (original is not RNA rna) throw new ArgumentException("The original nucleic acid must be RNA to create an RNA variant."); - var variant = new RNA( - variantBaseSequence, - originalRna, - sequenceVariants, - appliedSequenceVariants, - applicableTruncationProducts, - oneBasedModifications, - sampleNameForVariants); - - // Remove only actually-applied variants from the VARIANT’s copy - if (appliedSequenceVariants != null && appliedSequenceVariants.Any()) - { - var appliedSimple = new HashSet(appliedSequenceVariants.Select(v => v.SimpleString())); - variant.SequenceVariations.RemoveAll(sv => appliedSimple.Contains(sv.SimpleString())); - } - - return (TBioPolymerType)(IHasSequenceVariants)variant; + var variantRNA = new RNA(variantBaseSequence, rna, appliedSequenceVariants, + applicableTruncationProducts, oneBasedModifications, sampleNameForVariants); + return (TBioPolymerType)(IHasSequenceVariants)variantRNA; } public bool Equals(RNA? other) diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs index 252179982..96ea653ac 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs @@ -138,7 +138,13 @@ public static List LoadProteinXML(string proteinDbLocation, bool genera decoys.AddRange(DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads, decoyIdentifier)); IEnumerable proteinsToExpand = generateTargets ? targets.Concat(decoys) : decoys; + var bubba = proteinsToExpand + .SelectMany(p => p.GetVariantBioPolymers(maxHeterozygousVariants, minAlleleDepth)).ToList(); + var t = targets.ToList(); + var e = t.SelectMany(p => p.GetVariantBioPolymers(maxHeterozygousVariants, minAlleleDepth)).ToList(); + var toReturn = proteinsToExpand.SelectMany(p => p.GetVariantBioPolymers(maxHeterozygousVariants, minAlleleDepth)); + return Merge(toReturn).ToList(); }