diff --git a/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs b/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs index b9bb73015..384d098ae 100644 --- a/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs +++ b/mzLib/Omics/BioPolymer/IHasSequenceVariants.cs @@ -47,13 +47,19 @@ public interface IHasSequenceVariants /// List TruncationProducts { get; } - + /// - /// Used to construct a new variant of the same type as the original and is called in + /// Constructs a new variant biopolymer instance from the original, applying the specified sequence variants and modifications. + /// The method sets both the full set of database sequence variants and the subset of applied variants, ensuring that only unapplied variants remain in the SequenceVariations list, + /// while applied variants are tracked in AppliedSequenceVariations. This enables accurate representation of both the original and variant-specific annotations. /// - /// The generic structure enables proteins to produce proteins and RNA to produce RNA - /// - TBioPolymerType CreateVariant(string variantBaseSequence, TBioPolymerType original, IEnumerable appliedSequenceVariants, - IEnumerable applicableProteolysisProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) - where TBioPolymerType : IHasSequenceVariants; + TBioPolymerType CreateVariant( + string variantBaseSequence, + TBioPolymerType original, + IEnumerable appliedSequenceVariants, + IEnumerable applicableProteolysisProducts, + IDictionary> oneBasedModifications, + string sampleNameForVariants) + where TBioPolymerType : IHasSequenceVariants; } \ No newline at end of file diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 1669c3afe..b280f2662 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -1,4 +1,5 @@ -using MzLibUtil; +using System.Linq; +using MzLibUtil; using Omics.BioPolymer; using Omics.Modifications; @@ -94,24 +95,36 @@ public static int RestoreModificationIndex(IHasSequenceVariants protein, int var /// /// Applies multiple variant changes to a protein sequence /// - public static List ApplyVariants(TBioPolymerType protein, IEnumerable sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth) + public static List ApplyVariants(TBioPolymerType protein, IEnumerable sequenceVariations, int maxAllowedVariantsForCombinatorics, int minAlleleDepth) where TBioPolymerType : IHasSequenceVariants { List uniqueEffectsToApply = sequenceVariations .GroupBy(v => v.SimpleString()) .Select(x => x.First()) .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line - .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first + .OrderBy(v => v.OneBasedBeginPosition) + .ThenBy(v => v.OneBasedEndPosition) + .ThenBy(v => v.OriginalSequence ?? string.Empty) + .ThenBy(v => v.VariantSequence ?? string.Empty) // apply variants at the end of the protein sequence first .ToList(); - TBioPolymerType proteinCopy = protein.CreateVariant(protein.BaseSequence, protein, null, protein.TruncationProducts, protein.OneBasedPossibleLocalizedModifications, null); - - // If there aren't any variants to apply, just return the base protein + // If there aren't any variants to apply, just return the original as-is if (uniqueEffectsToApply.Count == 0) { - return new List { proteinCopy }; + return new List { protein }; } + // Start from a normalized copy whose “unapplied” list is the DB list. + TBioPolymerType proteinCopy = protein.CreateVariant( + protein.BaseSequence, + protein, + null, // none applied yet + protein.TruncationProducts, + protein.OneBasedPossibleLocalizedModifications, + null); + + + HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys)); List variantProteins = new(); List newVariantProteins = new(); @@ -121,7 +134,7 @@ public static List ApplyVariants(TBioPolymerTy newVariantProteins.Clear(); newVariantProteins.Add(proteinCopy); - bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; + bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinatorics; foreach (var variant in uniqueEffectsToApply) { bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff @@ -145,12 +158,12 @@ public static List ApplyVariants(TBioPolymerTy { if (isDeepAlternateAllele && isDeepReferenceAllele) { - if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinitorics > 0) + if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinatorics > 0) { TBioPolymerType variantProtein = ApplySingleVariant(variant, newVariantProteins[0], individual); newVariantProteins.Add(variantProtein); } - else if (maxAllowedVariantsForCombinitorics > 0) + else if (maxAllowedVariantsForCombinatorics > 0) { newVariantProteins[1] = ApplySingleVariant(variant, newVariantProteins[1], individual); } @@ -159,7 +172,7 @@ public static List ApplyVariants(TBioPolymerTy // no heterozygous variants } } - else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) + else if (isDeepAlternateAllele && maxAllowedVariantsForCombinatorics > 0) { newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList(); } @@ -176,7 +189,7 @@ public static List ApplyVariants(TBioPolymerTy foreach (var ppp in newVariantProteins) { - if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele) + if (isDeepAlternateAllele && maxAllowedVariantsForCombinatorics > 0 && isDeepReferenceAllele) { // keep reference allele if (variant.Description.Genotypes[individual].Contains("0")) @@ -187,7 +200,7 @@ public static List ApplyVariants(TBioPolymerTy // alternate allele (replace all, since in heterozygous with two alternates, both alternates are included) combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); } - else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) + else if (isDeepAlternateAllele && maxAllowedVariantsForCombinatorics > 0) { combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); } @@ -253,7 +266,14 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria Dictionary> adjustedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein); List adjustedAppliedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations); - return protein.CreateVariant(variantSequence, protein, adjustedAppliedVariations, adjustedProteolysisProducts, adjustedModifications, individual); + // Pass BOTH lists: current DB list (possibly already filtered on the interim proteoform) + newly applied + return protein.CreateVariant( + variantSequence, + protein, + adjustedAppliedVariations, + adjustedProteolysisProducts, + adjustedModifications, + individual); } /// @@ -412,12 +432,24 @@ private static Dictionary> AdjustModificationIndices(Seq return mods; } + /// + /// Deterministic ordering for variant-based naming/IDs to avoid order-of-application artifacts. + /// + private static IEnumerable OrderForNaming(IEnumerable variations) => + variations + .OrderBy(v => v.OneBasedBeginPosition) + .ThenBy(v => v.OneBasedEndPosition) + .ThenBy(v => v.OriginalSequence ?? string.Empty) + .ThenBy(v => v.VariantSequence ?? string.Empty); + /// /// Format string to append to accession /// private static string CombineSimpleStrings(IEnumerable? variations) { - return variations.IsNullOrEmpty() ? "" : string.Join("_", variations.Select(v => v.SimpleString())); + return variations.IsNullOrEmpty() + ? "" + : string.Join("_", OrderForNaming(variations!).Select(v => v.SimpleString())); } /// @@ -425,7 +457,9 @@ private static string CombineSimpleStrings(IEnumerable? varia /// public static string CombineDescriptions(IEnumerable? variations) { - return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.Description)); + return variations.IsNullOrEmpty() + ? "" + : string.Join(", variant:", OrderForNaming(variations!).Select(v => v.Description)); } /// /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object, @@ -451,7 +485,13 @@ public static IEnumerable ApplyAllVariantCombinations= maxCombinations) yield break; - + // Sort variations to ensure deterministic combination order + variations = variations + .OrderBy(v => v.OneBasedBeginPosition) + .ThenBy(v => v.OneBasedEndPosition) + .ThenBy(v => v.OriginalSequence ?? string.Empty) + .ThenBy(v => v.VariantSequence ?? string.Empty) + .ToList(); int n = variations.Count; for (int size = 1; size <= n; size++) { diff --git a/mzLib/Proteomics/Protein/Protein.cs b/mzLib/Proteomics/Protein/Protein.cs index e6694b560..c9b0c8115 100644 --- a/mzLib/Proteomics/Protein/Protein.cs +++ b/mzLib/Proteomics/Protein/Protein.cs @@ -133,30 +133,35 @@ public Protein(Protein originalProtein, string newBaseSequence) /// /// /// - public Protein(string variantBaseSequence, Protein protein, IEnumerable appliedSequenceVariations, - IEnumerable applicableProteolysisProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) + public Protein( + string variantBaseSequence, + Protein protein, + IEnumerable appliedSequenceVariations, + IEnumerable applicableProteolysisProducts, + IDictionary> oneBasedModifications, + string sampleNameForVariants) : this( - variantBaseSequence, - VariantApplication.GetAccession(protein, appliedSequenceVariations), - organism: protein.Organism, - geneNames: new List>(protein.GeneNames), - oneBasedModifications: oneBasedModifications != null ? oneBasedModifications.ToDictionary(x => x.Key, x => x.Value) : new Dictionary>(), - proteolysisProducts: new List(applicableProteolysisProducts ?? new List()), - name: VariantApplication.GetVariantName(protein.Name, appliedSequenceVariations), - fullName: VariantApplication.GetVariantName(protein.FullName, appliedSequenceVariations), - isDecoy: protein.IsDecoy, - isContaminant: protein.IsContaminant, - databaseReferences: new List(protein.DatabaseReferences), - sequenceVariations: new List(protein.SequenceVariations), - disulfideBonds: new List(protein.DisulfideBonds), - spliceSites: new List(protein.SpliceSites), - databaseFilePath: protein.DatabaseFilePath, - dataset: protein.DatasetEntryTag, - created: protein.CreatedEntryTag, - modified: protein.ModifiedEntryTag, - version: protein.VersionEntryTag, - xmlns: protein.XmlnsEntryTag, - uniProtSequenceAttributes: protein.UniProtSequenceAttributes) + variantBaseSequence, + VariantApplication.GetAccession(protein, appliedSequenceVariations), + organism: protein.Organism, + geneNames: new List>(protein.GeneNames), + oneBasedModifications: oneBasedModifications != null ? oneBasedModifications.ToDictionary(x => x.Key, x => x.Value) : new Dictionary>(), + proteolysisProducts: new List(applicableProteolysisProducts ?? new List()), + name: VariantApplication.GetVariantName(protein.Name, appliedSequenceVariations), + fullName: VariantApplication.GetVariantName(protein.FullName, appliedSequenceVariations), + isDecoy: protein.IsDecoy, + isContaminant: protein.IsContaminant, + databaseReferences: new List(protein.DatabaseReferences), + sequenceVariations: new List(protein.SequenceVariations), + disulfideBonds: new List(protein.DisulfideBonds), + spliceSites: new List(protein.SpliceSites), + databaseFilePath: protein.DatabaseFilePath, + dataset: protein.DatasetEntryTag, + created: protein.CreatedEntryTag, + modified: protein.ModifiedEntryTag, + version: protein.VersionEntryTag, + xmlns: protein.XmlnsEntryTag, + uniProtSequenceAttributes: protein.UniProtSequenceAttributes) { NonVariantProtein = protein.ConsensusVariant as Protein; OriginalNonVariantModifications = ConsensusVariant.OriginalNonVariantModifications; @@ -603,8 +608,9 @@ public TBioPolymerType CreateVariant(string variantBaseSequence if (original is not Protein originalProtein) throw new ArgumentException("The original BioPolymer must be Protein to create a protein variant"); - var variantProtein = new Protein(variantBaseSequence, originalProtein, appliedSequenceVariants, + var variantProtein = new Protein(variantBaseSequence, originalProtein, appliedSequenceVariants, applicableProteolysisProducts, oneBasedModifications, sampleNameForVariants); + return (TBioPolymerType)(IHasSequenceVariants)variantProtein; } #endregion diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 8ced7b208..43a4e58e8 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -50,7 +50,16 @@ public static void TearDown() public static void VariantProtein() { Protein p = new Protein("MAAA", "accession"); - Protein v = new Protein("MAVA", p, new[] { new SequenceVariation(3, "A", "V", "desc", null) }, null, null, null); + + // New ctor: (variantBaseSequence, protein, sequenceVariants, appliedSequenceVariants, proteolysisProducts, mods, sampleName) + Protein v = new Protein( + "MAVA", + p, + new[] { new SequenceVariation(3, "A", "V", "desc", null) }, // applied + null, + null, + null); + Assert.AreEqual(p, v.ConsensusVariant); } @@ -133,19 +142,19 @@ public static void LoadSeqVarModifications(string databaseName, int modIdx, int Assert.AreEqual(modIdx, target.OneBasedPossibleLocalizedModifications.Single().Key); Assert.AreEqual(1, target.AppliedSequenceVariations.Count()); Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, target.SequenceVariations.Count()); - Assert.AreEqual(modIdx, target.SequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, target.SequenceVariations.Single().OneBasedModifications.Count); - Assert.AreEqual(modIdx, target.SequenceVariations.Single().OneBasedModifications.Single().Key); //PEP[mod]TID, MEP[mod]TID + Assert.AreEqual(0, target.SequenceVariations.Count()); + Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedBeginPosition); + Assert.AreEqual(1, target.AppliedSequenceVariations.Single().OneBasedModifications.Count); + Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key); //PEP[mod]TID, MEP[mod]TID var decoy = proteins[1]; Assert.AreEqual(1, decoy.OneBasedPossibleLocalizedModifications.Count); Assert.AreEqual(reversedModIdx, decoy.OneBasedPossibleLocalizedModifications.Single().Key); //DITP[mod]EP, MDITP[mod]E Assert.AreEqual(1, decoy.AppliedSequenceVariations.Count()); Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, decoy.SequenceVariations.Count()); - Assert.AreEqual(reversedModIdx, decoy.SequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, decoy.SequenceVariations.Single().OneBasedModifications.Count); - Assert.AreEqual(reversedModIdx, decoy.SequenceVariations.Single().OneBasedModifications.Single().Key); + Assert.AreEqual(0, decoy.SequenceVariations.Count()); + Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition); + Assert.AreEqual(1, decoy.AppliedSequenceVariations.Single().OneBasedModifications.Count); + Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key); string rewriteDbName = $"{Path.GetFileNameWithoutExtension(databaseName)}rewrite.xml"; ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins.Where(p => !p.IsDecoy).ToList(), Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", rewriteDbName)); @@ -156,19 +165,19 @@ public static void LoadSeqVarModifications(string databaseName, int modIdx, int Assert.AreEqual(modIdx, target.OneBasedPossibleLocalizedModifications.Single().Key); Assert.AreEqual(1, target.AppliedSequenceVariations.Count()); Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, target.SequenceVariations.Count()); - Assert.AreEqual(modIdx, target.SequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, target.SequenceVariations.Single().OneBasedModifications.Count); - Assert.AreEqual(modIdx, target.SequenceVariations.Single().OneBasedModifications.Single().Key); + Assert.AreEqual(0, target.SequenceVariations.Count()); + Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedBeginPosition); + Assert.AreEqual(1, target.AppliedSequenceVariations.Single().OneBasedModifications.Count); + Assert.AreEqual(modIdx, target.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key); decoy = proteins[1]; Assert.AreEqual(1, decoy.OneBasedPossibleLocalizedModifications.Count); Assert.AreEqual(reversedModIdx, decoy.OneBasedPossibleLocalizedModifications.Single().Key); Assert.AreEqual(1, decoy.AppliedSequenceVariations.Count()); Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, decoy.SequenceVariations.Count()); - Assert.AreEqual(reversedModIdx, decoy.SequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(1, decoy.SequenceVariations.Single().OneBasedModifications.Count); - Assert.AreEqual(reversedModIdx, decoy.SequenceVariations.Single().OneBasedModifications.Single().Key); + Assert.AreEqual(0, decoy.SequenceVariations.Count()); + Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition); + Assert.AreEqual(1, decoy.AppliedSequenceVariations.Single().OneBasedModifications.Count); + Assert.AreEqual(reversedModIdx, decoy.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key); } [TestCase("ranges1.xml", 1, 2, 5, 6)] // without starting methionine @@ -269,15 +278,37 @@ public static void ReverseDecoySpliceSites(string databaseName, int beginIdx, in } [Test] - [TestCase("HomozygousHLA.xml", 1, 18)] - [TestCase("HomozygousHLA.xml", 10, 17)] - public static void HomozygousVariantsAtVariedDepths(string filename, int minVariantDepth, int appliedCount) + public static void HomozygousVariantsDepthOne() + { + string filename = "HomozygousHLA.xml"; + int minVariantDepth = 1; + int appliedCount = 18; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", filename), true, + DecoyType.None, null, false, null, out var unknownModifications, minAlleleDepth: minVariantDepth); + Assert.AreEqual(1, proteins.Count); + //all sequence variants are applied and therefore removed from the SequenceVariations list + Assert.AreEqual(0, proteins[0].SequenceVariations.Count()); // some redundant + + Assert.AreEqual(appliedCount, proteins[0].AppliedSequenceVariations.Count()); // some redundant + Assert.AreEqual(appliedCount, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes + Assert.AreEqual(1, proteins[0].GetVariantBioPolymers().Count); + var variantProteins = proteins[0].GetVariantBioPolymers(); + List peptides = proteins.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList(); + } + + [Test] + + public static void HomozygousVariantsDepth17() { + string filename = "HomozygousHLA.xml"; + int minVariantDepth = 10; + int appliedCount = 17; var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", filename), true, DecoyType.None, null, false, null, out var unknownModifications, minAlleleDepth: minVariantDepth); Assert.AreEqual(1, proteins.Count); - Assert.AreEqual(18, proteins[0].SequenceVariations.Count()); // some redundant - Assert.AreEqual(18, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes + // 17 of 18 sequence variants are applied and therefore removed from the SequenceVariations list leaving 1 behind + Assert.AreEqual(1, proteins[0].SequenceVariations.Count()); // some redundant + Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes Assert.AreEqual(appliedCount, proteins[0].AppliedSequenceVariations.Count()); // some redundant Assert.AreEqual(appliedCount, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes Assert.AreEqual(1, proteins[0].GetVariantBioPolymers().Count); @@ -398,7 +429,14 @@ public static void CrashOnCreateVariantFromRNA() var rna = new RNA("GUACUGACU"); NUnit.Framework.Assert.Throws(() => { - proteins[0].CreateVariant(proteins[0].BaseSequence, rna, [], [], new Dictionary>(), ""); + // New signature requires sequenceVariants and applicableTruncationProducts + proteins[0].CreateVariant( + proteins[0].BaseSequence, + rna, + [], // appliedSequenceVariants + [], // applicableTruncationProducts + new Dictionary>(), + ""); }); } @@ -576,6 +614,137 @@ public void SequenceVariationIsValidTest() Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid()); //This is valid because it is an insertion Assert.IsTrue(svValidVariantSequenceLenthIsZero.AreValid()); // This is valid because it is a deletion } + /// + /// In this unit test, you will not get the base protein because the variant in your VCF is homozygous ALT (GT=1/1) with sufficient depth, + /// and the loader always expands VCF-only entries via ApplyVariants. The maxHeterozygousVariants parameter only limits heterozygous combinatorics; it does not prevent applying homozygous variants. + /// What happens in your path + /// • LoadProteinXML always expands proteins by calling GetVariantBioPolymers(maxHeterozygousVariants, minAlleleDepth). + /// • If a protein has only VCF-style sequence variations, GetVariantBioPolymers routes to ApplyVariants. + /// • In ApplyVariants, a homozygous ALT with depth ≥ minAlleleDepth results in replacing the base with the variant: + /// • isHomozygousAlternate == true and isDeepAlternateAllele == true → newVariantProteins = ApplySingleVariant(...), i.e., base is not retained. + /// • maxHeterozygousVariants only affects heterozygous logic.It does nothing for homozygous ALT variants. + /// Why your specific test applies the variant Your vcfstring is GT= 1 / 1 with AD = 0,15 (ALT depth 15). With the default minAlleleDepth=1 (you didn’t override it), + /// ApplyVariants treats this as homozygous ALT with sufficient depth and applies it, returning only the variant protein. + /// + [Test] + public void VcfProteinWithModOnDeepHomozygousVariant() + { + string vcfstring = + "8\t98089650\t.\tG\tT\t488.77\t.\tANN=T|missense_variant|MODERATE|ERICH5|ENSG00000177459|transcript|ENST00000318528.7|protein_coding|2/3|c.633G>T|p.Lys211Asn|992/1761|633/1125|211/374||\tGT:AD:DP:GQ:PL\t1/1:0,15:15:45:517,45,0"; + + // Example VCF line with snpEff annotation: + // 8 98089650 . G T 488.77 . ANN=T|missense_variant|MODERATE|ERICH5|ENSG00000177459|transcript|ENST00000318528.7|protein_coding|2/3|c.633G>T|p.Lys211Asn|992/1761|633/1125|211/374|| GT:AD:DP:GQ:PL 1/1:0,15:15:45:517,45,0 + // + // --- VCF Standard Columns --- + // CHROM (8) → Chromosome 8 + // POS (98089650) → 1-based position of the variant (98,089,650) + // ID (.) → No variant identifier + // REF (G) → Reference allele is G + // ALT (T) → Alternate allele is T + // QUAL (488.77) → Variant call quality score + // FILTER (.) → No filter applied + // + // --- INFO Column --- + // INFO (ANN=...) holds snpEff annotation data. + // ANN format is: + // Allele | Effect | Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | + // Transcript_Biotype | Rank | HGVS.c | HGVS.p | cDNA_pos/cDNA_len | + // CDS_pos/CDS_len | AA_pos/AA_len | Distance | Errors/Warnings + // + // In this case: ANN=T|missense_variant|MODERATE|ERICH5|ENSG00000177459|transcript|ENST00000318528.7|protein_coding|2/3|c.633G>T|p.Lys211Asn|992/1761|633/1125|211/374|| + // - Allele = T + // - Effect = missense_variant (amino acid change) + // - Impact = MODERATE + // - Gene_Name = ERICH5 + // - Gene_ID = ENSG00000177459 + // - Feature_Type = transcript + // - Feature_ID = ENST00000318528.7 + // - Transcript_Biotype = protein_coding + // - Rank = 2/3 + // - HGVS.c = c.633G>T (cDNA change) + // - HGVS.p = p.Lys211Asn (protein change: Lysine 211 to Asparagine) + // - cDNA_pos/cDNA_len = 992/1761 + // - CDS_pos/CDS_len = 633/1125 + // - AA_pos/AA_len = 211/374 + // - Distance, Errors/Warnings = empty + // + // --- FORMAT Column --- + // FORMAT (GT:AD:DP:GQ:PL) defines how to read the sample column(s): + // GT → Genotype (1/1 = homozygous alternate) + // AD → Allele depth (0 reads for REF, 15 for ALT) + // DP → Read depth (15 reads at this site) + // GQ → Genotype quality (45) + // PL → Phred-scaled genotype likelihoods (517,45,0) + // + // --- SAMPLE Column --- + // Sample entry: 1/1:0,15:15:45:517,45,0 + // GT = 1/1 → Homozygous ALT genotype (both alleles = T) + // AD = 0,15 → 0 reads for REF (G), 15 reads for ALT (T) + // DP = 15 → Total coverage at this site = 15 reads + // GQ = 45 → Genotype quality + // PL = 517,45,0 → Genotype likelihoods + // + // --- Overall Summary --- + // Variant at chr8:98089650 changes G → T. + // The sample is homozygous for the ALT allele (T). + // The variant causes a missense mutation in ERICH5 (Lys211Asn). + // Variant passed filters, with moderate predicted impact. + + string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "oneVcfProteinWithModsOnVariant.xml"); + List proteinsNoDecoys = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un, + maxHeterozygousVariants: 0); + // What: Ensure only one proteoform was created. + // Expect: 1. + // Why: The VCF contains a single homozygous ALT (GT=1/1) with sufficient depth, so the loader applies the variant and does not retain a separate base proteoform (DecoyType.None, maxHeterozygousVariants=0). + Assert.AreEqual(1, proteinsNoDecoys.Count); + + // What: Verify the returned proteoform is the variant with the expected accession suffix. + // Expect: At least one protein has accession "Q6P6B1_K211N". + // Why: Variant accessions are formed by appending the applied variant’s SimpleString (K211N) to the base accession (VariantApplication.GetAccession). + Assert.That(proteinsNoDecoys.Any(a => a.Accession == "Q6P6B1_K211N")); + + // What: Confirm the unmodified base accession is not present as its own proteoform. + // Expect: No protein has accession "Q6P6B1". + // Why: For homozygous ALT with depth >= minAlleleDepth, only the variant proteoform is emitted; the base (reference) proteoform is not included. + Assert.That(proteinsNoDecoys.Any(a => a.Accession == "Q6P6B1"), Is.False); + + // What: The ConsensusVariant (reference) is still tracked for the proteoform. + // Expect: At least one protein references a ConsensusVariant with accession "Q6P6B1". + // Why: ConsensusVariant points to the original/reference protein, even when the returned proteoform is a variant. + Assert.That(proteinsNoDecoys.Any(a => a.ConsensusVariant.Accession == "Q6P6B1")); + + // What: Check the number of distinct positions with possible localized modifications on the variant proteoform. + // Expect: 7. + // Why: OneBasedPossibleLocalizedModifications is a dictionary keyed by 1-based residue index; Values.Count equals the number of distinct modified positions after variant application and index adjustments. + Assert.AreEqual(7, proteinsNoDecoys.First().OneBasedPossibleLocalizedModifications.Values.Count); + + // What: Verify that, on the variant proteoform, there are no remaining unapplied database sequence variants. + // Expect: 0. + // Why: Applied variants are removed from SequenceVariations during variant construction, and this case has only the single VCF-driven variant applied. + Assert.AreEqual(0, proteinsNoDecoys.First().SequenceVariations.Count); + + // What: Confirm exactly one sequence variant was applied to produce this proteoform. + // Expect: 1. + // Why: The input specifies a single homozygous variant (K211N), so exactly one applied variation is recorded. + Assert.AreEqual(1, proteinsNoDecoys.First().AppliedSequenceVariations.Count); + + // What: Verify the total number of modifications annotated on the applied variant region. + // Expect: 2. + // Why: AppliedSequenceVariations.First().OneBasedModifications is a dictionary of position -> list of modifications on the variant region. + // In this case there is one key (position) with two modifications; summing the list counts across all keys yields 2. + Assert.AreEqual(2, proteinsNoDecoys.First().AppliedSequenceVariations.First().OneBasedModifications.Values.Sum(v => v.Count)); + + // Additional checks on the ConsensusVariant for completeness + var consensusProtein = proteinsNoDecoys.First().ConsensusVariant; + var consensusSequenceVariants = consensusProtein.SequenceVariations; + Assert.That(1, Is.EqualTo(consensusSequenceVariants.Count)); + Assert.That(211, Is.EqualTo(consensusSequenceVariants.First().OneBasedBeginPosition)); + var consensusAppliedVariants = consensusProtein.AppliedSequenceVariations; + Assert.That(0, Is.EqualTo(consensusAppliedVariants.Count)); + var consensusModifications = consensusProtein.OneBasedPossibleLocalizedModifications; + Assert.That(6, Is.EqualTo(consensusModifications.Count)); + } + [Test] public void VariantModificationTest() { @@ -585,9 +754,9 @@ public void VariantModificationTest() List variantTargets = targets.Where(p => p.AppliedSequenceVariations.Count >= 1).ToList(); List decoys = variantProteins.Where(p => p.IsDecoy == true).ToList(); List variantDecoys = decoys.Where(p => p.AppliedSequenceVariations.Count >= 1).ToList(); - bool homozygousVariant = targets.Select(p => p.Accession).Contains("Q6P6B1"); + bool homozygousVariant = targets.Select(p => p.Accession).Contains("Q6P6B1"); - var variantMods = targets.SelectMany(p => p.AppliedSequenceVariations.Where(x=>x.OneBasedModifications.Count>= 1)).ToList(); + var variantMods = targets.SelectMany(p => p.AppliedSequenceVariations.Where(x => x.OneBasedModifications.Count >= 1)).ToList(); var decoyMods = decoys.SelectMany(p => p.AppliedSequenceVariations.Where(x => x.OneBasedModifications.Count >= 1)).ToList(); var negativeResidues = decoyMods.SelectMany(x => x.OneBasedModifications.Where(w => w.Key < 0)).ToList(); bool namingWrong = targets.Select(p => p.Accession).Contains("Q8N865_H300R_A158T_H300R"); @@ -603,7 +772,6 @@ public void VariantModificationTest() Assert.AreEqual(2, variantMods.Count); Assert.AreEqual(2, decoyMods.Count); Assert.AreEqual(0, negativeResidues.Count); - } [Test] public void WriteProteinXmlWithVariantsDiscoveredAsModifications2() @@ -770,5 +938,575 @@ public void Constructor_ParsesDescriptionCorrectly() var adValues = svd.AlleleDepths[adKey]; Assert.AreEqual(new[] { "30", "30" }, adValues); } + /// + /// The humanGAPDH.xml test file contains one protein with two separate amino acid substitution variants. + /// When loaded with ProteinDbLoader.LoadProteinXML with variant expansion enabled, + /// the resulting protein entries reflect these variants appropriately. + /// There are four total proteins: the consensus (no variants), each single variant applied separately, and both variants applied. + /// Writing this list of four proteins back to XML results in an XML file that contains one Protein entry that is exactly the same as the original. + /// There are no entries for the variant proteins because the writer does not currently serialize applied variants as separate proteins. + /// Loading in this XML again results in the same four proteins as the original load. One consensus and three variant-applied proteoforms. + /// + [Test] + public static void TestProteinVariantsRoundTrip() + { + string databaseName = "humanGAPDH.xml"; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, + DecoyType.None, null, false, null, out var unknownModifications, 1, 99); + Assert.AreEqual(4, proteins.Count); // 4 target + var consensusProtein = proteins[0]; + Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var A22G_Protein = proteins[1]; + Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_Protein = proteins[2]; + Assert.That(1, Is.EqualTo(K251N_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(K251N_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_A22G_Protein = proteins[3]; + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.SequenceVariations.Count)); + Assert.That(2, Is.EqualTo(K251N_A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Write the non-decoy proteins to a temporary XML database + string tempDir = Path.Combine(Path.GetTempPath(), $"mzLib_roundtrip_{Guid.NewGuid():N}"); + Directory.CreateDirectory(tempDir); + string tempFile = Path.Combine(tempDir, "roundtrip_proteins.xml"); + + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), // no fixed mods to embed + proteins, + tempFile); + + // Count ProteinXmlEntry nodes directly from the XML (namespace- and case-insensitive) + var xdoc = System.Xml.Linq.XDocument.Load(tempFile); + int proteinXmlEntryCount = xdoc + .Descendants() + .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + int countOfConsensusProteins = proteins.Select(p => p.ConsensusVariant).Distinct().Count(); + int countOfProteinsWithAppliedVariants = proteins.Where(p => p.AppliedSequenceVariations.Count > 0).Distinct().Count(); + TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); + + // Expect the XML to contain one entry per written protein + // Mysteriously there is only one written protein..... + Assert.That(proteinXmlEntryCount, Is.EqualTo(countOfConsensusProteins + countOfProteinsWithAppliedVariants)); + + // Read the database back in + var roundTripped = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + out var unknownMods2, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + try + { + // Even though we are applying variants all over again, we should end up with the same total number of proteins + // This is because the variants generated by the consensus protein with sequence variations produce the same set of proteins + // as those with applied variants. In other words, the duplicates get collapsed. + Assert.That(roundTripped.Count, Is.EqualTo(countOfConsensusProteins + countOfProteinsWithAppliedVariants)); + + // Ensure accessions match 1:1 + var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); + var rtAccs = roundTripped.Select(p => p.Accession).OrderBy(a => a).ToList(); + Assert.That(rtAccs, Is.EqualTo(originalAccs)); + + // Validate per-proteoform annotations survived round-trip + foreach (var rt in roundTripped) + { + var orig = proteins.Single(p => p.Accession == rt.Accession); + + Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); + Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(orig.OneBasedPossibleLocalizedModifications.Keys.Count), $"Mods count mismatch for {rt.Accession}"); + + // Spot-check variant identity if applied variants exist + if (orig.AppliedSequenceVariations.Count > 0) + { + var origSimple = orig.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + var rtSimple = rt.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + Assert.That(rtSimple, Is.EqualTo(origSimple), $"Applied variant set mismatch for {rt.Accession}"); + } + } + } + finally + { + // Cleanup + if (Directory.Exists(tempDir)) + { + try { Directory.Delete(tempDir, recursive: true); } catch { /* ignore */ } + } + } + + } + /// + /// The humanGAPDH.xml test file contains one protein with two separate amino acid substitution variants. + /// When loaded with ProteinDbLoader.LoadProteinXML with variant expansion enabled, + /// the resulting protein entries reflect these variants appropriately. + /// There are four total proteins: the consensus (no variants), each single variant applied separately, and both variants applied. + /// In this test we add a modification to the consensus protein to ensure that modifications survive the round trip as well. + /// Writing this list of four proteins back to XML results in an XML file that contains one Protein entry that is exactly the same as the original. + /// There are no entries for the variant proteins because the writer does not currently serialize applied variants as separate proteins. + /// Loading in this XML again results in the same four proteins as the original load. One consensus and three variant-applied proteoforms. + /// + [Test] + public static void TestProteinVariantsAddModsRoundTrip() + { + ModificationMotif.TryGetMotif("M", out ModificationMotif motifM); + Modification mm = new Modification("mod", null, "type", null, motifM, "Anywhere.", null, 16, new Dictionary>(), null, null, null, null, null); + + + string databaseName = "humanGAPDH.xml"; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, + DecoyType.None, null, false, null, out var unknownModifications, 1, 99); + Assert.AreEqual(4, proteins.Count); // 4 target + var consensusProtein = proteins[0]; + Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Add a modification to the consensus protein to test that mods survive the round trip + consensusProtein.OneBasedPossibleLocalizedModifications.Add(1, new List(){mm}); + Assert.That(1, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + + var A22G_Protein = proteins[1]; + Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_Protein = proteins[2]; + Assert.That(1, Is.EqualTo(K251N_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(K251N_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_A22G_Protein = proteins[3]; + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.SequenceVariations.Count)); + Assert.That(2, Is.EqualTo(K251N_A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Write the non-decoy proteins to a temporary XML database + string tempDir = Path.Combine(Path.GetTempPath(), $"mzLib_roundtrip_{Guid.NewGuid():N}"); + Directory.CreateDirectory(tempDir); + string tempFile = Path.Combine(tempDir, "roundtrip_proteins.xml"); + + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), // no fixed mods to embed + proteins, + tempFile); + + // Count ProteinXmlEntry nodes directly from the XML (namespace- and case-insensitive) + var xdoc = System.Xml.Linq.XDocument.Load(tempFile); + int proteinXmlEntryCount = xdoc + .Descendants() + .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); + + // Expect the XML to contain one entry per written protein + // Mysteriously there is only one written protein..... + Assert.That(proteinXmlEntryCount, Is.EqualTo(1)); + + // Read the database back in + var roundTripped = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + out var unknownMods2, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + try + { + // Basic shape checks + Assert.That(roundTripped.Count, Is.EqualTo(proteins.Count)); + + // Ensure accessions match 1:1 + var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); + var rtAccs = roundTripped.Select(p => p.Accession).OrderBy(a => a).ToList(); + Assert.That(rtAccs, Is.EqualTo(originalAccs)); + + // Validate per-proteoform annotations survived round-trip + foreach (var rt in roundTripped) + { + var orig = proteins.Single(p => p.Accession == rt.Accession); + + Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); + Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); + + // We added a mod to the consensus protein in a position that will be contained in all variant proteins as well + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(1), $"Mods count mismatch for {rt.Accession}"); + + // Spot-check variant identity if applied variants exist + if (orig.AppliedSequenceVariations.Count > 0) + { + var origSimple = orig.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + var rtSimple = rt.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + Assert.That(rtSimple, Is.EqualTo(origSimple), $"Applied variant set mismatch for {rt.Accession}"); + } + } + } + finally + { + // Cleanup + if (Directory.Exists(tempDir)) + { + try { Directory.Delete(tempDir, recursive: true); } catch { /* ignore */ } + } + } + + } + /// + /// The humanGAPDH.xml test file contains one protein with two separate amino acid substitution variants. + /// When loaded with ProteinDbLoader.LoadProteinXML with variant expansion enabled, + /// the resulting protein entries reflect these variants appropriately. + /// There are four total proteins: the consensus (no variants), each single variant applied separately, and both variants applied. + /// In this test we add a modification to a variant protein but on a position shared by all proteins + /// Writing this list of four proteins back to XML results in an XML file that contains one Protein entry that is exactly the same as the original. + /// There are no entries for the variant proteins because the writer does not currently serialize applied variants as separate proteins. + /// Loading in this XML again results in the same four proteins as the original load. One consensus and three variant-applied proteoforms. + /// + [Test] + public static void TestProteinVariantsAddModsToVariantOnSharedAminoAcidRoundTrip() + { + ModificationMotif.TryGetMotif("M", out ModificationMotif motifM); + Modification mm = new Modification("mod", null, "type", null, motifM, "Anywhere.", null, 16, new Dictionary>(), null, null, null, null, null); + + + string databaseName = "humanGAPDH.xml"; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, + DecoyType.None, null, false, null, out var unknownModifications, 1, 99); + Assert.AreEqual(4, proteins.Count); // 4 target + var consensusProtein = proteins[0]; + Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + + var A22G_Protein = proteins[1]; + Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Add a modification to the consensus protein to test that mods survive the round trip + A22G_Protein.OneBasedPossibleLocalizedModifications.Add(1, new List() { mm }); + Assert.That(1, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + var K251N_Protein = proteins[2]; + Assert.That(1, Is.EqualTo(K251N_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(K251N_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_A22G_Protein = proteins[3]; + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.SequenceVariations.Count)); + Assert.That(2, Is.EqualTo(K251N_A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Write the non-decoy proteins to a temporary XML database + string tempDir = Path.Combine(Path.GetTempPath(), $"mzLib_roundtrip_{Guid.NewGuid():N}"); + Directory.CreateDirectory(tempDir); + string tempFile = Path.Combine(tempDir, "roundtrip_proteins.xml"); + + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), // no fixed mods to embed + proteins, + tempFile); + + // Count ProteinXmlEntry nodes directly from the XML (namespace- and case-insensitive) + var xdoc = System.Xml.Linq.XDocument.Load(tempFile); + int proteinXmlEntryCount = xdoc + .Descendants() + .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); + + // Expect the XML to contain one entry per written protein + // Mysteriously there is only one written protein..... + Assert.That(proteinXmlEntryCount, Is.EqualTo(1)); + + // Read the database back in + var roundTripped = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + out var unknownMods2, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + try + { + // Basic shape checks + Assert.That(roundTripped.Count, Is.EqualTo(proteins.Count)); + + // Ensure accessions match 1:1 + var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); + var rtAccs = roundTripped.Select(p => p.Accession).OrderBy(a => a).ToList(); + Assert.That(rtAccs, Is.EqualTo(originalAccs)); + + // Validate per-proteoform annotations survived round-trip + foreach (var rt in roundTripped) + { + var orig = proteins.Single(p => p.Accession == rt.Accession); + + Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); + Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); + + // We added a mod to the consensus protein in a position that will be contained in all variant proteins as well + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(1), $"Mods count mismatch for {rt.Accession}"); + + // Spot-check variant identity if applied variants exist + if (orig.AppliedSequenceVariations.Count > 0) + { + var origSimple = orig.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + var rtSimple = rt.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + Assert.That(rtSimple, Is.EqualTo(origSimple), $"Applied variant set mismatch for {rt.Accession}"); + } + } + } + finally + { + // Cleanup + if (Directory.Exists(tempDir)) + { + try { Directory.Delete(tempDir, recursive: true); } catch { /* ignore */ } + } + } + + } + + [Test] + public static void TestProteinVariantsAddModsToVariantRoundTrip() + { + ModificationMotif.TryGetMotif("G", out ModificationMotif motifM); + Modification mg = new Modification("mod", null, "type", null, motifM, "Anywhere.", null, 16, new Dictionary>(), null, null, null, null, null); + + + + + string databaseName = "humanGAPDH.xml"; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, + DecoyType.None, null, false, null, out var unknownModifications, 1, 99); + Assert.AreEqual(4, proteins.Count); // 4 target + var consensusProtein = proteins[0]; + Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + + + var A22G_Protein = proteins[1]; + Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + A22G_Protein.OneBasedPossibleLocalizedModifications.Add(22, new List() { mg }); + Assert.That(1, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + + var K251N_Protein = proteins[2]; + Assert.That(1, Is.EqualTo(K251N_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(K251N_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_A22G_Protein = proteins[3]; + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.SequenceVariations.Count)); + Assert.That(2, Is.EqualTo(K251N_A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Write the non-decoy proteins to a temporary XML database + string tempDir = Path.Combine(Path.GetTempPath(), $"mzLib_roundtrip_{Guid.NewGuid():N}"); + Directory.CreateDirectory(tempDir); + string tempFile = Path.Combine(tempDir, "roundtrip_proteins.xml"); + + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), // no fixed mods to embed + proteins, + tempFile); + + // Count ProteinXmlEntry nodes directly from the XML (namespace- and case-insensitive) + var xdoc = System.Xml.Linq.XDocument.Load(tempFile); + int proteinXmlEntryCount = xdoc + .Descendants() + .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); + + // Expect the XML to contain one entry per written protein + // Mysteriously there is only one written protein..... + Assert.That(proteinXmlEntryCount, Is.EqualTo(1)); + + // Read the database back in + var roundTripped = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + out var unknownMods2, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + try + { + // Basic shape checks + Assert.That(roundTripped.Count, Is.EqualTo(proteins.Count)); + + // Ensure accessions match 1:1 + var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); + var rtAccs = roundTripped.Select(p => p.Accession).OrderBy(a => a).ToList(); + Assert.That(rtAccs, Is.EqualTo(originalAccs)); + + // Validate per-proteoform annotations survived round-trip + foreach (var rt in roundTripped) + { + var orig = proteins.Single(p => p.Accession == rt.Accession); + + Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); + Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(orig.OneBasedPossibleLocalizedModifications.Keys.Count), $"Mods count mismatch for {rt.Accession}"); + + // Spot-check variant identity if applied variants exist + if (orig.AppliedSequenceVariations.Count > 0) + { + var origSimple = orig.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + var rtSimple = rt.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + Assert.That(rtSimple, Is.EqualTo(origSimple), $"Applied variant set mismatch for {rt.Accession}"); + } + } + } + finally + { + // Cleanup + if (Directory.Exists(tempDir)) + { + try { Directory.Delete(tempDir, recursive: true); } catch { /* ignore */ } + } + } + + } + [Test] + public static void TestProteinVariantsAddModsToDeletionVariantRoundTrip() + { + ModificationMotif.TryGetMotif("G", out ModificationMotif motifM); + Modification mg = new Modification("mod", null, "type", null, motifM, "Anywhere.", null, 16, new Dictionary>(), null, null, null, null, null); + + + + + string databaseName = "humanGAPDH.xml"; + var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, + DecoyType.None, null, false, null, out var unknownModifications, 1, 99); + Assert.AreEqual(4, proteins.Count); // 4 target + var consensusProtein = proteins[0]; + Assert.That(2, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(consensusProtein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + SequenceVariation del = new SequenceVariation(1,1,"M","", "delete M",null); + consensusProtein.SequenceVariations.Add(del); + Assert.That(3, Is.EqualTo(consensusProtein.SequenceVariations.Count)); + + + var A22G_Protein = proteins[1]; + Assert.That(1, Is.EqualTo(A22G_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + A22G_Protein.OneBasedPossibleLocalizedModifications.Add(22, new List() { mg }); + Assert.That(1, Is.EqualTo(A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + + var K251N_Protein = proteins[2]; + Assert.That(1, Is.EqualTo(K251N_Protein.SequenceVariations.Count)); + Assert.That(1, Is.EqualTo(K251N_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + var K251N_A22G_Protein = proteins[3]; + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.SequenceVariations.Count)); + Assert.That(2, Is.EqualTo(K251N_A22G_Protein.AppliedSequenceVariations.Count)); + Assert.That(0, Is.EqualTo(K251N_A22G_Protein.OneBasedPossibleLocalizedModifications.Keys.Count)); + + // Write the non-decoy proteins to a temporary XML database + string tempDir = Path.Combine(Path.GetTempPath(), $"mzLib_roundtrip_{Guid.NewGuid():N}"); + Directory.CreateDirectory(tempDir); + string tempFile = Path.Combine(tempDir, "roundtrip_proteins.xml"); + + ProteinDbWriter.WriteXmlDatabase( + new Dictionary>>(), // no fixed mods to embed + proteins, + tempFile); + + // Count ProteinXmlEntry nodes directly from the XML (namespace- and case-insensitive) + var xdoc = System.Xml.Linq.XDocument.Load(tempFile); + int proteinXmlEntryCount = xdoc + .Descendants() + .Count(e => string.Equals(e.Name.LocalName, "protein", StringComparison.OrdinalIgnoreCase)); + TestContext.WriteLine($"ProteinXmlEntry count in XML: {proteinXmlEntryCount}"); + + // Expect the XML to contain one entry per written protein + // Mysteriously there is only one written protein..... + Assert.That(proteinXmlEntryCount, Is.EqualTo(1)); + + // Read the database back in + var roundTripped = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + out var unknownMods2, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + try + { + // Basic shape checks + Assert.That(roundTripped.Count, Is.EqualTo(proteins.Count)); + + // Ensure accessions match 1:1 + var originalAccs = proteins.Select(p => p.Accession).OrderBy(a => a).ToList(); + var rtAccs = roundTripped.Select(p => p.Accession).OrderBy(a => a).ToList(); + Assert.That(rtAccs, Is.EqualTo(originalAccs)); + + // Validate per-proteoform annotations survived round-trip + foreach (var rt in roundTripped) + { + var orig = proteins.Single(p => p.Accession == rt.Accession); + + Assert.That(rt.BaseSequence, Is.EqualTo(orig.BaseSequence), $"BaseSequence mismatch for {rt.Accession}"); + Assert.That(rt.SequenceVariations.Count, Is.EqualTo(orig.SequenceVariations.Count), $"SequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.AppliedSequenceVariations.Count, Is.EqualTo(orig.AppliedSequenceVariations.Count), $"AppliedSequenceVariations count mismatch for {rt.Accession}"); + Assert.That(rt.OneBasedPossibleLocalizedModifications.Keys.Count, Is.EqualTo(orig.OneBasedPossibleLocalizedModifications.Keys.Count), $"Mods count mismatch for {rt.Accession}"); + + // Spot-check variant identity if applied variants exist + if (orig.AppliedSequenceVariations.Count > 0) + { + var origSimple = orig.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + var rtSimple = rt.AppliedSequenceVariations.Select(v => v.SimpleString()).OrderBy(s => s).ToArray(); + Assert.That(rtSimple, Is.EqualTo(origSimple), $"Applied variant set mismatch for {rt.Accession}"); + } + } + } + finally + { + // Cleanup + if (Directory.Exists(tempDir)) + { + try { Directory.Delete(tempDir, recursive: true); } catch { /* ignore */ } + } + } + + } } } \ No newline at end of file diff --git a/mzLib/Test/DatabaseTests/oneVcfProteinWithModsOnVariant.xml b/mzLib/Test/DatabaseTests/oneVcfProteinWithModsOnVariant.xml new file mode 100644 index 000000000..d4721125a --- /dev/null +++ b/mzLib/Test/DatabaseTests/oneVcfProteinWithModsOnVariant.xml @@ -0,0 +1,141 @@ + + + ID Ammonia loss on N +MT Common Artifact +TG N +PP Anywhere. +CF H-3N-1 +MM -17.026549101 + +// + ID Hydroxylation on K +MT Common Biological +TG K +PP Anywhere. +CF O +MM 15.99491462 + +// + ID Hydroxylation on N +MT Common Biological +TG N +PP Anywhere. +CF O +MM 15.99491462 + +// + ID Hydroxylation on P +MT Common Biological +TG P +PP Anywhere. +CF O +MM 15.99491462 +DI HCD:170.060123533 + +// + ID Phosphoserine on S +AC PTM-0253 +MT UniProt +FT MOD_RES +TG S +PP Anywhere. +CF HO3P +MM 79.966331 +DR PSI-MOD; MOD:00046 +DR RESID; AA0037 +TR Archaea; taxId:2157 (Archaea) +TR Bacteria; taxId:2 (Bacteria) +TR Eukaryota; taxId:2759 (Eukaryota) +TR Viruses; taxId:10239 (Viruses) +KW Phosphoprotein +// + + Q6P6B1 + ENST00000318528 + + + Glutamate-rich protein 5 + + + + ERICH5 + C8orf47 + ERICH5 + ENSG00000177459 + + + Homo sapiens + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + K + N + + + + + + + + + + + + + + + + + + + + + + + + + MGCSSSALNKAGDSSRFPSVTSNEHFSTAEESESCFAQPKPHALGRESTVDGNVQRESRPPLQKLKVSAEPTANGVKPLQEQPLAKDVAPGRDATDQSGSTEKTQPGEGLEESGPPQPGGKEDAPAAEGKKKDAGAGTEAESLKGNAEAQPLGPEAKGQPLQAAVEKDSLRAVEVTENPQTAAEMKPLGTTENVLTLQIAGELQPQGTVGKDEQAPLLETISKENESPEILEGSQFVETAEEQQLQATLGKEEQPQLLERIPKENVTPEVLDRSQLVEKPVMNDPFHKTPEGPGNMEQIQPEGIVGSMEHPARNVEAGAYVEMIRNIHTNEEDQRIEGETGEKVETDMENEKVSEGAETKEEETGEVVDLSAAT + + + + diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index 59c393b39..c2e14a6c2 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -43,6 +43,9 @@ Always + + Always + Always diff --git a/mzLib/Test/Transcriptomics/TestVariantOligo.cs b/mzLib/Test/Transcriptomics/TestVariantOligo.cs index aa643dec5..96486982c 100644 --- a/mzLib/Test/Transcriptomics/TestVariantOligo.cs +++ b/mzLib/Test/Transcriptomics/TestVariantOligo.cs @@ -35,23 +35,38 @@ public static void VariantRna() RNA v = new RNA("CAUA", p, new[] { new SequenceVariation(3, "A", "U", "desc", null) }, null, null, null); Assert.That(v.ConsensusVariant, Is.EqualTo(p)); } - + /// + /// This is an interesting test case. The RNA XML contains one entry. + /// The entry has 5 sequence variations, but they are perfectly identical + /// I guess the correct behavior is to only apply the variant once, since applying it multiple times does nothing + /// [Test] public void VariantXml() { string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", "SeqVar.xml"); - List variantProteins = RnaDbLoader.LoadRnaXML(file, true, DecoyType.None, false, AllKnownMods, [], out _); + List variantProteins = RnaDbLoader.LoadRnaXML(file, true, DecoyType.None, false, AllKnownMods, [], out _, + minAlleleDepth: 1000); + + Assert.That(1, Is.EqualTo(variantProteins.Count)); + Assert.That(0, Is.EqualTo(variantProteins.Count(a => a.AppliedSequenceVariations.Count > 0))); + + variantProteins = RnaDbLoader.LoadRnaXML(file, true, DecoyType.None, false, AllKnownMods, [], out _, + maxHeterozygousVariants: 10, + minAlleleDepth: 0); + + Assert.That(1, Is.EqualTo(variantProteins.Count)); + Assert.That(1, Is.EqualTo(variantProteins.Count(a => a.AppliedSequenceVariations.Count > 0))); - Assert.That(variantProteins.First().ConsensusVariant.SequenceVariations.Count(), Is.EqualTo(5)); - Assert.That(variantProteins.Count, Is.EqualTo(1)); // there is only one unique amino acid change Assert.That(variantProteins.First().ConsensusVariant.BaseSequence, Is.Not.EqualTo(variantProteins.First().BaseSequence)); Assert.That(variantProteins.First().ConsensusVariant.BaseSequence[116], Is.EqualTo('C')); Assert.That(variantProteins.First().BaseSequence[116], Is.EqualTo('G')); + Assert.That(variantProteins.First().ConsensusVariant.Name, Is.Not.EqualTo(variantProteins.First().Name)); Assert.That(variantProteins.First().ConsensusVariant.FullName, Is.Not.EqualTo(variantProteins.First().FullName)); Assert.That(variantProteins.First().ConsensusVariant.Accession, Is.Not.EqualTo(variantProteins.First().Accession)); List oligos = variantProteins.SelectMany(vp => vp.Digest(new RnaDigestionParams(), null, null)).ToList(); + Assert.That(44, Is.EqualTo(oligos.Count)); } [Test] @@ -67,19 +82,19 @@ public static void LoadSeqVarModifications(string databaseName, int modIdx, int Assert.That(target.OneBasedPossibleLocalizedModifications.Single().Key, Is.EqualTo(modIdx)); Assert.That(target.AppliedSequenceVariations.Count(), Is.EqualTo(1)); Assert.That(target.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); - Assert.That(target.SequenceVariations.Count(), Is.EqualTo(1)); - Assert.That(target.SequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); - Assert.That(target.SequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); - Assert.That(target.SequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(modIdx)); //PEP[mod]TID, MEP[mod]TID + Assert.That(target.SequenceVariations.Count(), Is.EqualTo(0)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(modIdx)); //PEP[mod]TID, MEP[mod]TID var decoy = rna[1]; Assert.That(decoy.OneBasedPossibleLocalizedModifications.Count, Is.EqualTo(1)); Assert.That(decoy.OneBasedPossibleLocalizedModifications.Single().Key, Is.EqualTo(reversedModIdx)); //DITP[mod]EP, MDITP[mod]E Assert.That(decoy.AppliedSequenceVariations.Count(), Is.EqualTo(1)); Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); - Assert.That(decoy.SequenceVariations.Count(), Is.EqualTo(1)); - Assert.That(decoy.SequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); - Assert.That(decoy.SequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); - Assert.That(decoy.SequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(reversedModIdx)); + Assert.That(decoy.SequenceVariations.Count(), Is.EqualTo(0)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(reversedModIdx)); string rewriteDbName = $"{Path.GetFileNameWithoutExtension(databaseName)}rewrite.xml"; ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), rna.Where(p => !p.IsDecoy).ToList(), Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", rewriteDbName)); @@ -90,19 +105,19 @@ public static void LoadSeqVarModifications(string databaseName, int modIdx, int Assert.That(target.OneBasedPossibleLocalizedModifications.Single().Key, Is.EqualTo(modIdx)); Assert.That(target.AppliedSequenceVariations.Count(), Is.EqualTo(1)); Assert.That(target.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); - Assert.That(target.SequenceVariations.Count(), Is.EqualTo(1)); - Assert.That(target.SequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); - Assert.That(target.SequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); - Assert.That(target.SequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(modIdx)); + Assert.That(target.SequenceVariations.Count(), Is.EqualTo(0)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(modIdx)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); + Assert.That(target.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(modIdx)); decoy = rna[1]; Assert.That(decoy.OneBasedPossibleLocalizedModifications.Count, Is.EqualTo(1)); Assert.That(decoy.OneBasedPossibleLocalizedModifications.Single().Key, Is.EqualTo(reversedModIdx)); Assert.That(decoy.AppliedSequenceVariations.Count(), Is.EqualTo(1)); Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); - Assert.That(decoy.SequenceVariations.Count(), Is.EqualTo(1)); - Assert.That(decoy.SequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); - Assert.That(decoy.SequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); - Assert.That(decoy.SequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(reversedModIdx)); + Assert.That(decoy.SequenceVariations.Count(), Is.EqualTo(0)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedBeginPosition, Is.EqualTo(reversedModIdx)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedModifications.Count, Is.EqualTo(1)); + Assert.That(decoy.AppliedSequenceVariations.Single().OneBasedModifications.Single().Key, Is.EqualTo(reversedModIdx)); } [TestCase("ranges1.xml", 1, 2, 5, 6)] // trunc excludes natural 3' @@ -135,17 +150,51 @@ public static void ReverseDecoyProteolysisProducts(string databaseName, int begi } [Test] - [TestCase("HomozygousHLA.xml", 1, 18)] - [TestCase("HomozygousHLA.xml", 10, 17)] - public static void HomozygousVariantsAtVariedDepths(string filename, int minVariantDepth, int appliedCount) + public static void HomozygousVariantsAtDepth1() + { + string filename = "HomozygousHLA.xml"; + int minVariantDepth = 1; + int appliedCount = 18; + + string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", filename); + var rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.None, false, AllKnownMods, [], out var unknownModifications, minAlleleDepth: minVariantDepth); + + Assert.That(rna.Count, Is.EqualTo(1)); + + // All variants applied => none left in SequenceVariations on the variant + Assert.That(rna[0].SequenceVariations.Count(), Is.EqualTo(0)); + Assert.That(rna[0].AppliedSequenceVariations.Count(), Is.EqualTo(appliedCount)); + Assert.That(rna[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), Is.EqualTo(appliedCount)); + + // Optional: DB annotations still live on the consensus/reference + Assert.That(rna[0].ConsensusVariant.SequenceVariations.Count, Is.EqualTo(18)); + + Assert.That(rna[0].GetVariantBioPolymers().Count, Is.EqualTo(1)); + var variantProteins = rna[0].GetVariantBioPolymers(); + List peptides = rna.SelectMany(vp => vp.Digest(new RnaDigestionParams(), null, null)).ToList(); + } + + [Test] + public static void HomozygousVariantsAtDepth10() { + string filename = "HomozygousHLA.xml"; + int minVariantDepth = 10; + int appliedCount = 17; + string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", filename); var rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.None, false, AllKnownMods, [], out var unknownModifications, minAlleleDepth: minVariantDepth); + Assert.That(rna.Count, Is.EqualTo(1)); - Assert.That(rna[0].SequenceVariations.Count(), Is.EqualTo(18)); // some redundant - Assert.That(rna[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), Is.EqualTo(18)); // unique changes - Assert.That(rna[0].AppliedSequenceVariations.Count(), Is.EqualTo(appliedCount)); // some redundant - Assert.That(rna[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), Is.EqualTo(appliedCount)); // unique changes + + // 17 applied => 1 unapplied remains in SequenceVariations on the variant + Assert.That(rna[0].SequenceVariations.Count(), Is.EqualTo(1)); + Assert.That(rna[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), Is.EqualTo(1)); + Assert.That(rna[0].AppliedSequenceVariations.Count(), Is.EqualTo(appliedCount)); + Assert.That(rna[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), Is.EqualTo(appliedCount)); + + // Optional: consensus still has all DB annotations + Assert.That(rna[0].ConsensusVariant.SequenceVariations.Count, Is.EqualTo(18)); + Assert.That(rna[0].GetVariantBioPolymers().Count, Is.EqualTo(1)); var variantProteins = rna[0].GetVariantBioPolymers(); List peptides = rna.SelectMany(vp => vp.Digest(new RnaDigestionParams(), null, null)).ToList(); @@ -310,7 +359,13 @@ public static void CrashOnCreateVariantFromProtein() var protein = new Protein("PEPTIDE", "accession"); NUnit.Framework.Assert.Throws(() => { - rnas[0].CreateVariant(rnas[0].BaseSequence, protein, [], [], new Dictionary>(), ""); + rnas[0].CreateVariant( + rnas[0].BaseSequence, + protein, + [], // appliedSequenceVariants + [], // applicableTruncationProducts + new Dictionary>(), // mods + ""); // sampleNameForVariants }); } @@ -387,7 +442,13 @@ public void VariantModificationTest() // One of the mod residues is removed by the variant. string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", "VariantModsGPTMD.xml"); List rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.Reverse, false, AllKnownMods, [], out var unknownModifications); - Assert.That(rna.All(p => p.SequenceVariations.Count == 1)); + + List rnaWithAppliedVariants = rna.Where(p => p.AppliedSequenceVariations.Count > 0).ToList(); + + List rnaWithoutAppliedVariants = rna.Where(p => p.AppliedSequenceVariations.Count == 0).ToList(); + + Assert.That(rnaWithoutAppliedVariants.All(p => p.SequenceVariations.Count == 1)); + Assert.That(rnaWithAppliedVariants.All(p => p.SequenceVariations.Count == 0)); List targets = rna.Where(p => p.IsDecoy == false).ToList(); RNA variantTarget = targets.First(p => p.AppliedSequenceVariations.Count >= 1); @@ -432,13 +493,17 @@ public void VariantModificationTest() } } - [Test] public void TwoTruncationsAndSequenceVariant_DbLoading() { string dbPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "Transcriptomics", "TestData", "TruncationAndVariantMods.xml"); List rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.Reverse, false, AllKnownMods, [], out var unknownModifications); - Assert.That(rna.All(p => p.SequenceVariations.Count == 1)); + // With new behavior: + // - RNAs with applied variants should have 0 unapplied SequenceVariations + // - RNAs without applied variants should retain the 1 DB SequenceVariation + Assert.That(rna.Where(p => p.AppliedSequenceVariations.Count > 0).All(p => p.SequenceVariations.Count == 0)); + Assert.That(rna.Where(p => p.AppliedSequenceVariations.Count == 0).All(p => p.SequenceVariations.Count == 1)); + Assert.That(rna.All(p => p.OriginalNonVariantModifications.Count == 2)); List targets = rna.Where(p => p.IsDecoy == false).ToList(); @@ -471,7 +536,10 @@ public void TwoTruncationsAndSequenceVariant_Digestion(string testCase, string b List rna = RnaDbLoader.LoadRnaXML(dbPath, true, DecoyType.Reverse, false, AllKnownMods, [], out var unknownModifications); RnaDigestionParams digestionParams = new RnaDigestionParams("RNase T1", missedCleavages, 2); - Assert.That(rna.All(p => p.SequenceVariations.Count == 1)); + // New behavior consistent with DbLoading test above + Assert.That(rna.Where(p => p.AppliedSequenceVariations.Count > 0).All(p => p.SequenceVariations.Count == 0)); + Assert.That(rna.Where(p => p.AppliedSequenceVariations.Count == 0).All(p => p.SequenceVariations.Count == 1)); + Assert.That(rna.All(p => p.OriginalNonVariantModifications.Count == 2)); Assert.That(rna.All(p => p.TruncationProducts.Count == 2)); diff --git a/mzLib/Transcriptomics/RNA.cs b/mzLib/Transcriptomics/RNA.cs index bfdefdf87..e68bfe00b 100644 --- a/mzLib/Transcriptomics/RNA.cs +++ b/mzLib/Transcriptomics/RNA.cs @@ -5,6 +5,7 @@ using Omics; using Omics.BioPolymer; using Omics.Modifications; +using System.Security.Cryptography; namespace Transcriptomics { @@ -38,21 +39,21 @@ public RNA(string sequence, string accession, truncationProducts, sequenceVariations, appliedSequenceVariations, sampleNameForVariants, fullName) { } - + /// /// For creating a variant of an existing nucleic acid. Filters out modifications that do not match their nucleotide target site. /// public RNA(string variantBaseSequence, NucleicAcid original, IEnumerable? appliedSequenceVariants, IEnumerable? applicableTruncationProducts, IDictionary> oneBasedModifications, string sampleNameForVariants) - : this(variantBaseSequence, VariantApplication.GetAccession(original, appliedSequenceVariants), - oneBasedModifications, - original.FivePrimeTerminus, original.ThreePrimeTerminus, - VariantApplication.GetVariantName(original.Name, appliedSequenceVariants), - original.Organism, original.DatabaseFilePath, original.IsContaminant, - original.IsDecoy, original.GeneNames, original.AdditionalDatabaseFields, - [..applicableTruncationProducts ?? new List()], original.SequenceVariations, - [..appliedSequenceVariants ?? new List()], sampleNameForVariants, - VariantApplication.GetVariantName(original.FullName, appliedSequenceVariants)) + : this(variantBaseSequence, VariantApplication.GetAccession(original, appliedSequenceVariants), + oneBasedModifications, + original.FivePrimeTerminus, original.ThreePrimeTerminus, + VariantApplication.GetVariantName(original.Name, appliedSequenceVariants), + original.Organism, original.DatabaseFilePath, original.IsContaminant, + original.IsDecoy, original.GeneNames, original.AdditionalDatabaseFields, + [.. applicableTruncationProducts ?? new List()], original.SequenceVariations, + [.. appliedSequenceVariants ?? new List()], sampleNameForVariants, + VariantApplication.GetVariantName(original.FullName, appliedSequenceVariants)) { ConsensusVariant = original.ConsensusVariant; OriginalNonVariantModifications = ConsensusVariant.OriginalNonVariantModifications; @@ -73,7 +74,7 @@ public override TBioPolymerType CreateVariant(string variantBas if (original is not RNA rna) throw new ArgumentException("The original nucleic acid must be RNA to create an RNA variant."); - var variantRNA = new RNA(variantBaseSequence, rna, appliedSequenceVariants, + var variantRNA = new RNA(variantBaseSequence, rna, appliedSequenceVariants, applicableTruncationProducts, oneBasedModifications, sampleNameForVariants); return (TBioPolymerType)(IHasSequenceVariants)variantRNA; } diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs index 252179982..96ea653ac 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbLoader.cs @@ -138,7 +138,13 @@ public static List LoadProteinXML(string proteinDbLocation, bool genera decoys.AddRange(DecoyProteinGenerator.GenerateDecoys(targets, decoyType, maxThreads, decoyIdentifier)); IEnumerable proteinsToExpand = generateTargets ? targets.Concat(decoys) : decoys; + var bubba = proteinsToExpand + .SelectMany(p => p.GetVariantBioPolymers(maxHeterozygousVariants, minAlleleDepth)).ToList(); + var t = targets.ToList(); + var e = t.SelectMany(p => p.GetVariantBioPolymers(maxHeterozygousVariants, minAlleleDepth)).ToList(); + var toReturn = proteinsToExpand.SelectMany(p => p.GetVariantBioPolymers(maxHeterozygousVariants, minAlleleDepth)); + return Merge(toReturn).ToList(); } diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 24ff44dd4..9a0155b54 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -297,8 +297,16 @@ public static Dictionary WriteXmlDatabase(Dictionary>>(); - // write nonvariant proteins (for cases where variants aren't applied, this just gets the rna itself) - var nonVariantProteins = proteinList.Select(p => p.ConsensusVariant).Distinct().ToList(); + // create the list of distinct consensus proteins. + var proteinsToWrite = proteinList + .Select(p => p.ConsensusVariant) + .Distinct() + .ToList(); + // add any proteins with applied variants + proteinsToWrite = proteinsToWrite + .Concat(proteinList.Where(p => p.AppliedSequenceVariations.Count > 0)) + .Distinct() + .ToList(); var xmlWriterSettings = new XmlWriterSettings { @@ -314,7 +322,7 @@ public static Dictionary WriteXmlDatabase(Dictionary myModificationList = new List(); - foreach (Protein p in nonVariantProteins) + foreach (Protein p in proteinsToWrite) { foreach (KeyValuePair> entry in p.OneBasedPossibleLocalizedModifications) { @@ -323,13 +331,13 @@ public static Dictionary WriteXmlDatabase(Dictionary allRelevantModifications = new HashSet( - nonVariantProteins + proteinsToWrite .SelectMany(p => p.SequenceVariations .SelectMany(sv => sv.OneBasedModifications) .Concat(p.OneBasedPossibleLocalizedModifications) .SelectMany(kv => kv.Value)) .Concat(additionalModsToAddToProteins - .Where(kv => nonVariantProteins + .Where(kv => proteinsToWrite .SelectMany(p => p.SequenceVariations .Select(sv => VariantApplication.GetAccession(p, new[] { sv })).Concat(new[] { p.Accession })) .Contains(kv.Key)) @@ -342,7 +350,7 @@ public static Dictionary WriteXmlDatabase(Dictionary