From 3b1ddb8bb22ce5a7c39caef945df539f3a3b797b Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Fri, 22 Aug 2025 15:08:12 -0500 Subject: [PATCH 1/7] New clean repo with ptm_stoch contents. The methods for occupancy calculation in mzlibutils were copied from the previous branch onto this one. Need to add/remake the tests next. --- mzLib/MzLibUtil/ClassExtensions.cs | 8 + mzLib/MzLibUtil/PositionFrequencyAnalysis.cs | 314 +++++++++++++++++++ 2 files changed, 322 insertions(+) create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis.cs diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 36bd1092d..5eb425276 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -25,6 +25,8 @@ namespace MzLibUtil { public static class ClassExtensions { + public static readonly string ModificationPattern = @"-?\[(.+?)(? /// Applies a boxcar smoothing algorithm to the input data. /// @@ -283,6 +285,12 @@ public static Dictionary ParseModifications(this string fullSeq) return modDict; } + public static string GetBaseSequenceFromFullSequence(this string fullSeq, string? modPattern=null, string? replacement=null) + { + Regex regex = new(modPattern ?? ModificationPattern); + return regex.Replace(fullSeq, replacement ?? string.Empty); + } + /// /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid. /// diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs new file mode 100644 index 000000000..f750eb3fc --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs @@ -0,0 +1,314 @@ +using Easy.Common.Extensions; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; + +namespace MzLibUtil +{ + public class QuantifiedModification + { + public string IdWithMotif { get; set; } + public string ModificationLocalization { get; set; } // e.g. 
"N-terminus", "C-terminus", or amino acid name + public int PeptidePositionZeroIsNTerminus { get; set; } + public int ProteinPositionZeroIsNTerminus { get; set; } + public double Intensity { get; set; } + + public QuantifiedModification(string idWithMotif, int positionInPeptide, int? positionInProtein = null, string modLocalization = null, double intensity = 0) + { + IdWithMotif = idWithMotif; + PeptidePositionZeroIsNTerminus = positionInPeptide; + ProteinPositionZeroIsNTerminus = positionInProtein ?? -1; // -1 means that the position in the protein is unknown + ModificationLocalization = modLocalization ?? "Unknown"; + Intensity = intensity; + } + } + /// + /// A class to store information about a quantified peptides sharing the same base sequence. + /// + public class QuantifiedPeptide + { + public HashSet FullSequences { get; set; } + public string BaseSequence { get; set; } + public QuantifiedProtein ParentProtein { get; set; } + public int OneBasedStartIndexInProtein { get; set; } + public Dictionary> ModifiedAminoAcidPositions { get; set; } + public double Intensity { get; set; } + + public QuantifiedPeptide(string fullSequence, int oneBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) + { + ModifiedAminoAcidPositions = new Dictionary>(); + OneBasedStartIndexInProtein = oneBasedStartIndexInProtein; // -1 means that the position in the protein is unknown + Intensity = intensity; + FullSequences = new HashSet { fullSequence }; + _SetBaseSequence(fullSequence, modPattern); + _SetModifications(fullSequence, intensity); + } + + public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null) + { + if (BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) + { + FullSequences.Add(fullSeq); + Intensity += intensity; + _SetModifications(fullSeq, intensity); // updating the intensity is done here + } + else + { + throw new Exception("The base sequence of the peptide does not match the full sequence."); + 
} + } + + public void MergePeptide(QuantifiedPeptide peptideToMerge) + { + if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) + { + throw new Exception("The base sequence of the peptide to merge does not match the base sequence of this peptide."); + } + foreach (var fullSeq in peptideToMerge.FullSequences) + { + FullSequences.Add(fullSeq); + _SetModifications(fullSeq, peptideToMerge.Intensity); // updating the intensity is done here + } + Intensity += peptideToMerge.Intensity; + } + + private void _SetModifications(string fullSeq, double intensity = 0) + { + var mods = fullSeq.ParseModifications(); + + if (mods.IsNotNullOrEmpty()) + { + foreach (var modpos in mods.Keys) + { + var mod = mods[modpos]; + if (!ModifiedAminoAcidPositions.ContainsKey(modpos)) + { + ModifiedAminoAcidPositions[modpos] = new Dictionary(); + } + + if (!ModifiedAminoAcidPositions[modpos].ContainsKey(mod)) + { + var modLocalization = modpos == 0 ? "N-terminus" : (modpos == BaseSequence.Length + 1 ? "C-terminus" : BaseSequence[modpos - 1].ToString()); + ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, modLocalization: modLocalization, intensity: 0); + } + ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; + + // Maybe should update/pass position in protein from here, too. 
+ } + } + } + + private void _SetBaseSequence(string fullSeq, string modPattern) + { + BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern); + } + + public Dictionary> GetModStoichiometryForPeptide() + { + var aaModsStoichiometry = ModifiedAminoAcidPositions; + + foreach (var modpos in aaModsStoichiometry) + { + foreach (var mod in modpos.Value.Values) + { + mod.Intensity /= Intensity; + } + } + return aaModsStoichiometry; + } + } + + public class QuantifiedProtein + { + public string Accession { get; set; } + public string Sequence { get; set; } + public Dictionary Peptides { get; set; } + public Dictionary> ModifiedAminoAcidPositionsInProtein { get; set; } + public Dictionary> PeptidesByProteinPosition { get; set; } + + public QuantifiedProtein(string accession, string sequence = null, Dictionary peptides = null) + { + Accession = accession; + Sequence = sequence; + Peptides = peptides ?? new Dictionary(); + } + + public void SetProteinModsFromPeptides() + { + if (!Sequence.IsNotNullOrEmpty() || !Peptides.IsNotNullOrEmpty()) + { + throw new Exception("The protein sequence is unknown, or there're no peptides."); + } + + ModifiedAminoAcidPositionsInProtein = new Dictionary>(); + PeptidesByProteinPosition = new Dictionary>(); + + foreach (var peptide in Peptides.Values) + { + // if peptide has no modifications, add to all its positions + if (!peptide.ModifiedAminoAcidPositions.IsNotNullOrEmpty()) + { + for (int i = 0; i < peptide.BaseSequence.Length; i++) + { + var pos = peptide.OneBasedStartIndexInProtein + i; + if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(pos)) + { + ModifiedAminoAcidPositionsInProtein[pos] = new Dictionary(); + PeptidesByProteinPosition[pos] = new HashSet(); + } + PeptidesByProteinPosition[pos].Add(peptide.BaseSequence); + } + continue; + } + + else // if peptide has modifications, add to modified positions + { + foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys) + { + var modPositionInProtein = 
modpos + peptide.OneBasedStartIndexInProtein - 1; + + // Ignore peptide terminal modifications that are not at the protein terminal + if ((modPositionInProtein != 0 && modpos == 0) // if the mod is at the N-terminus of the peptide, but not the protein. + || (modPositionInProtein != Sequence.Length + 1 && modpos == peptide.BaseSequence.Length + 1)) // if the mod is at the C-terminus of the peptide, but not the protein. + { + continue; + } + + if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(modPositionInProtein)) + { + ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new Dictionary(); + PeptidesByProteinPosition[modPositionInProtein] = new HashSet(); + } + PeptidesByProteinPosition[modPositionInProtein].Add(peptide.BaseSequence); + + foreach (var mod in peptide.ModifiedAminoAcidPositions[modpos].Values) + { + mod.ProteinPositionZeroIsNTerminus = modPositionInProtein; + + if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.IdWithMotif)) + { + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif] = new QuantifiedModification(mod.IdWithMotif, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, null, 0); + } + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif].Intensity += mod.Intensity; + } + } + } + } + + // clean up the dictionary to remove any empty modifications + var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => !x.Value.IsNotNullOrEmpty()).ToDictionary().Keys; + foreach (var pos in noModPositions) + { + ModifiedAminoAcidPositionsInProtein.Remove(pos); + PeptidesByProteinPosition.Remove(pos); + } + + } + + public Dictionary> GetModStoichiometryFromProteinMods() + { + SetProteinModsFromPeptides(); + + var aaModsStoichiometry = ModifiedAminoAcidPositionsInProtein; + foreach (var modpos in aaModsStoichiometry.Keys) + { + double totalPositionIntensity = Peptides.Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)).Sum(x => x.Value.Intensity); + 
foreach (var mod in aaModsStoichiometry[modpos].Values) + { + mod.Intensity /= totalPositionIntensity; + } + } + return aaModsStoichiometry; + } + } + + public class QuantifiedProteinGroup + { + public string Name { get; set; } + public Dictionary Proteins { get; set; } + public string OccupancyLevel { get; set; } + + public QuantifiedProteinGroup(string name, Dictionary proteins = null) + { + Name = name; + if (proteins != null) Proteins = proteins; + else Proteins = new Dictionary(); + } + } + public class PositionFrequencyAnalysis + { + + public Dictionary ProteinGroupOccupancies { get; private set; } + public Dictionary PeptideOccupancies { get; private set; } + + /// + /// Calculates the occupancy of post-translational modifications at the peptide level. + /// + /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. + /// If true, terminal modifications will be ignored. + /// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity + /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for + /// all of the amino acids in that peptide. + /// + public void CalculateOccupancies(List<(string fullSeq, List proteinGroups, double intensity)> peptides, bool ignoreTerminusMod = false) + { + // ToDo: change first argument to Dictionary + ProteinGroupOccupancies = new Dictionary(); + PeptideOccupancies = new Dictionary(); + + // Go through the peptides given + foreach (var pep in peptides) + { + //string baseSeq = pep.Item2.IsNotNullOrEmpty() ? 
pep.Item2 : new string(pep.Item1.ToCharArray()); // in case it is null or empty and we need to get the base sequence from the full sequence + //ClassExtensions.RemoveSpecialCharacters(ref baseSeq, @"", ClassExtensions.modificationPattern); + string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); + + if (!PeptideOccupancies.ContainsKey(pep.fullSeq)) + { + // Need to make sure clustering of proteingroups is correct + string proteinGroupsJoined = string.Join(";", pep.proteinGroups); + PeptideOccupancies[pep.fullSeq] = (new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity), proteinGroupsJoined); + } + else + { + PeptideOccupancies[pep.fullSeq].QuantifiedPeptide.AddFullSequence(pep.fullSeq, intensity: pep.intensity); + } + + // Go through the peptide's protein groups + foreach (var pg in pep.proteinGroups) + { + // If have not seen that protein group, store it + if (!ProteinGroupOccupancies.ContainsKey(pg)) + { + ProteinGroupOccupancies[pg] = new QuantifiedProteinGroup(pg); + ProteinGroupOccupancies[pg].OccupancyLevel = "peptide"; + } + var proteinGroup = ProteinGroupOccupancies[pg]; + + // Go through the proteins in each protein group + foreach (var proteinName in pg.Split('|')) + { + // Add the protein to the protein group's dictionary if it has not been added + if (!proteinGroup.Proteins.ContainsKey(proteinName)) + { + proteinGroup.Proteins[proteinName] = new QuantifiedProtein(proteinName); + } + var protein = proteinGroup.Proteins[proteinName]; + + // If the peptide's base sequence has not been seen, add it to the protein's dictionary + if (!protein.Peptides.ContainsKey(baseSeq)) + { + protein.Peptides[baseSeq] = new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity); + } + else + { + // If the peptide's base sequence has been seen, add the new full sequence to the existing peptide + protein.Peptides[baseSeq].AddFullSequence(pep.fullSeq, intensity: pep.intensity); + } + } + } + } + } + } +} From 2949457741f9b4c70a14012a76a6a8051545953e Mon Sep 
17 00:00:00 2001 From: Peter Cruz Parrilla Date: Mon, 25 Aug 2025 16:09:55 -0500 Subject: [PATCH 2/7] Added TestMzLibUtils tests for quantified mods, peptides, and proteins. Need tests for the protein groups and the occupancy set up (currently called CalculateOccupancies). --- mzLib/MzLibUtil/PositionFrequencyAnalysis.cs | 23 ++-- mzLib/Test/TestMzLibUtil.cs | 122 +++++++++++++++++++ 2 files changed, 137 insertions(+), 8 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs index f750eb3fc..7fea0bd93 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs @@ -55,7 +55,7 @@ public void AddFullSequence(string fullSeq, double intensity = 0, string modPatt } else { - throw new Exception("The base sequence of the peptide does not match the full sequence."); + throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); } } @@ -63,7 +63,7 @@ public void MergePeptide(QuantifiedPeptide peptideToMerge) { if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) { - throw new Exception("The base sequence of the peptide to merge does not match the base sequence of this peptide."); + throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); } foreach (var fullSeq in peptideToMerge.FullSequences) { @@ -146,6 +146,11 @@ public void SetProteinModsFromPeptides() foreach (var peptide in Peptides.Values) { + // if peptide position in protein is unknown, set it using the protein sequence + if (peptide.OneBasedStartIndexInProtein == -1) + { + peptide.OneBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; + } // if peptide has no modifications, add to all its positions if (!peptide.ModifiedAminoAcidPositions.IsNotNullOrEmpty()) { @@ -206,17 +211,19 @@ public void SetProteinModsFromPeptides() } - public Dictionary> 
GetModStoichiometryFromProteinMods() + public Dictionary> GetModStoichiometryFromProteinMods() { SetProteinModsFromPeptides(); - - var aaModsStoichiometry = ModifiedAminoAcidPositionsInProtein; - foreach (var modpos in aaModsStoichiometry.Keys) + var aaModsStoichiometry = new Dictionary>(); + foreach (var modpos in ModifiedAminoAcidPositionsInProtein.Keys) { + aaModsStoichiometry[modpos] = new Dictionary(); + double totalPositionIntensity = Peptides.Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)).Sum(x => x.Value.Intensity); - foreach (var mod in aaModsStoichiometry[modpos].Values) + foreach (var mod in ModifiedAminoAcidPositionsInProtein[modpos].Values) { - mod.Intensity /= totalPositionIntensity; + double modFraction = mod.Intensity / totalPositionIntensity; + aaModsStoichiometry[modpos].Add(mod.IdWithMotif, modFraction); } } return aaModsStoichiometry; diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index a33ee4f80..50b1ca571 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -3,6 +3,9 @@ using MzLibUtil; using Readers; using System.Collections.Generic; +using FlashLFQ; +using System.Linq; +using Proteomics.AminoAcidPolymer; namespace Test { @@ -163,8 +166,127 @@ public void TestRemoveSpecialCharacters() string cleanSeq = seqWithHash.ToString(); ClassExtensions.RemoveSpecialCharacters(ref cleanSeq, specialCharacter: "#"); Assert.AreEqual("PEPTIDE", cleanSeq); + } + + [Test] + public void TestQuantifiedModification() + { + var quantmod = new QuantifiedModification(idWithMotif: "TestMod: ModX on AAY", positionInPeptide: 1, positionInProtein: 2, intensity: 10); + Assert.AreEqual(quantmod.IdWithMotif, "TestMod: ModX on AAY"); + Assert.AreEqual(quantmod.PeptidePositionZeroIsNTerminus, 1); + Assert.AreEqual(quantmod.ProteinPositionZeroIsNTerminus, 2); + Assert.AreEqual(quantmod.Intensity, 10); + Assert.AreEqual(quantmod.ModificationLocalization, "Unknown"); + } + + [Test] + public void 
TestQuantifiedPeptide() + { + var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var peptide1 = new QuantifiedPeptide(fullSeq1, intensity: 1); + Assert.That(peptide1.FullSequences.Contains(fullSeq1)); + Assert.AreEqual(peptide1.BaseSequence, "GK"); + Assert.AreEqual(peptide1.Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3); + Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(0)); + Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(1)); + Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(2)); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.IdWithMotif, "UniProt: N - palmitoyl glycine on G"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.IdWithMotif, "UniProt: N - methylglycine on G"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.IdWithMotif, "UniProt: O - linked(Hex) hydroxylysine on K"); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 1); + + // Test MergePeptide method + var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var peptide2 = new QuantifiedPeptide(fullSeq2, intensity: 10); + peptide1.MergePeptide(peptide2); + + Assert.That(peptide1.FullSequences.Contains(fullSeq2)); + Assert.AreEqual(peptide1.Intensity, 11); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].Count, 2); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - palmitoyl 
glycine on G"].Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - acetylglycine on G"].Intensity, 10); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 11); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 11); + + // Test AddFullSequence method + var fullSeq3 = "GK[UniProt: O - linked(Hex) hydroxylysine on K]"; + peptide1.AddFullSequence(fullSeq3, intensity:100); + + Assert.That(peptide1.FullSequences.Contains(fullSeq3)); + Assert.AreEqual(peptide1.Intensity, 111); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].Count, 2); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].Count, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - acetylglycine on G"].Intensity, 10); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 11); + Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 111); + + // Test failed merge due to base sequence mismatch + var errorMessage = "The base sequence of the peptide being added does not match the base sequence of this peptide."; + var exception1 = Assert.Throws(() => peptide1.AddFullSequence("AK", intensity: 1)); + Assert.AreEqual(exception1.Message, errorMessage); + + var peptide3 = new QuantifiedPeptide("AK", intensity: 1); + var exception2 = Assert.Throws(() => peptide1.MergePeptide(peptide3)); + Assert.AreEqual(exception2.Message, errorMessage); + } + + [Test] + public void TestQuantifiedProtein() + { + + var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - 
methylglycine on G]K-[C-Terminal UniProt: Lysine Amide on K]"; + var fullSeq3 = "A[UniProt:N-methylalanine on A]K[UniProt: O - linked(Hex) hydroxylysine on K]-[C-Terminal UniProt: Lysine Amide on K]"; + + var basePeptide1 = new QuantifiedPeptide(fullSeq1, intensity: 1); + var basePeptide2 = new QuantifiedPeptide(fullSeq3, intensity: 100); + + basePeptide1.AddFullSequence(fullSeq2, intensity: 10); + var peptides = new Dictionary {{ basePeptide1.BaseSequence, basePeptide1}, + { basePeptide2.BaseSequence, basePeptide2 }}; + + var proteinSeq = "GKAAAAAAK"; + var protein = new QuantifiedProtein(accession: "TESTPROT", sequence: proteinSeq, peptides: peptides); + var stoich = protein.GetModStoichiometryFromProteinMods(); + + // Check object fields modified by SetProteinModsFromPeptides, which gets called first in the GetModStoichiometryFromProteinMods method. + Assert.AreEqual(protein.Accession, "TESTPROT"); + Assert.AreEqual(protein.Sequence, proteinSeq); + Assert.AreEqual(protein.Peptides.Count, 2); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein.Count, 6); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0].Count, 2); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[1].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[2].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[8].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[9].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[10].Count, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0]["UniProt: N - acetylglycine on G"].Intensity, 10); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[1]["UniProt: N - methylglycine on G"].Intensity, 11); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[2]["UniProt: O - linked(Hex) hydroxylysine on 
K"].Intensity, 1); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[8]["UniProt:N-methylalanine on A"].Intensity, 100); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[9]["UniProt: O - linked(Hex) hydroxylysine on K"].Intensity, 100); + Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[10]["C-Terminal UniProt: Lysine Amide on K"].Intensity, 100); + // Check stoichiometry results + Assert.AreEqual(stoich.Count, 6); + Assert.AreEqual(stoich[0]["UniProt: N - palmitoyl glycine on G"], 1 / 11.0); + Assert.AreEqual(stoich[0]["UniProt: N - acetylglycine on G"], 10 / 11.0); + Assert.AreEqual(stoich[1]["UniProt: N - methylglycine on G"], 11 / 11.0); + Assert.AreEqual(stoich[2]["UniProt: O - linked(Hex) hydroxylysine on K"], 1 / 11.0); + Assert.AreEqual(stoich[8]["UniProt:N-methylalanine on A"], 1); + Assert.AreEqual(stoich[9]["UniProt: O - linked(Hex) hydroxylysine on K"], 1); + Assert.AreEqual(stoich[10]["C-Terminal UniProt: Lysine Amide on K"], 1); } public struct TestStruct From 25bf8da195a7d6f526f6beabdc0c2f688ea884ca Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Tue, 26 Aug 2025 16:13:56 -0500 Subject: [PATCH 3/7] Added PG and Quant object setup tests. 
Need to finish these tests, though --- mzLib/MzLibUtil/PositionFrequencyAnalysis.cs | 43 +++++++++++------- mzLib/Test/TestMzLibUtil.cs | 48 ++++++++++++++++++++ 2 files changed, 75 insertions(+), 16 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs index 7fea0bd93..3e63547d3 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs @@ -238,16 +238,24 @@ public class QuantifiedProteinGroup public QuantifiedProteinGroup(string name, Dictionary proteins = null) { - Name = name; - if (proteins != null) Proteins = proteins; - else Proteins = new Dictionary(); + string splitPattern = @";|\|"; + var proteinAccessions = Regex.Split(name, splitPattern); + if ((proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x))) || proteins.IsNullOrEmpty()) + { + Name = name; + Proteins = proteins ?? new Dictionary(); + } + else + {ProteinGroupQuantObjects + throw new Exception("The number of proteins provided does not match the number of proteins in the protein group name."); + } } } public class PositionFrequencyAnalysis { - public Dictionary ProteinGroupOccupancies { get; private set; } - public Dictionary PeptideOccupancies { get; private set; } + public Dictionary ProteinGroupQuantObjects { get; private set; } + public Dictionary PeptideQuantObjects { get; private set; } /// /// Calculates the occupancy of post-translational modifications at the peptide level. @@ -258,11 +266,10 @@ public class PositionFrequencyAnalysis /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for /// all of the amino acids in that peptide. 
/// - public void CalculateOccupancies(List<(string fullSeq, List proteinGroups, double intensity)> peptides, bool ignoreTerminusMod = false) + public void SetUpQuantificationObjects(List<(string fullSeq, List proteinGroups, double intensity)> peptides, Dictionary proteinSequences=null) { - // ToDo: change first argument to Dictionary - ProteinGroupOccupancies = new Dictionary(); - PeptideOccupancies = new Dictionary(); + ProteinGroupQuantObjects = new Dictionary(); + PeptideQuantObjects = new Dictionary(); // Go through the peptides given foreach (var pep in peptides) @@ -271,27 +278,27 @@ public void CalculateOccupancies(List<(string fullSeq, List proteinGroup //ClassExtensions.RemoveSpecialCharacters(ref baseSeq, @"", ClassExtensions.modificationPattern); string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); - if (!PeptideOccupancies.ContainsKey(pep.fullSeq)) + if (!PeptideQuantObjects.ContainsKey(pep.fullSeq)) { // Need to make sure clustering of proteingroups is correct string proteinGroupsJoined = string.Join(";", pep.proteinGroups); - PeptideOccupancies[pep.fullSeq] = (new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity), proteinGroupsJoined); + PeptideQuantObjects[pep.fullSeq] = (new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity), proteinGroupsJoined); } else { - PeptideOccupancies[pep.fullSeq].QuantifiedPeptide.AddFullSequence(pep.fullSeq, intensity: pep.intensity); + PeptideQuantObjects[pep.fullSeq].QuantifiedPeptide.AddFullSequence(pep.fullSeq, intensity: pep.intensity); } // Go through the peptide's protein groups foreach (var pg in pep.proteinGroups) { // If have not seen that protein group, store it - if (!ProteinGroupOccupancies.ContainsKey(pg)) + if (!ProteinGroupQuantObjects.ContainsKey(pg)) { - ProteinGroupOccupancies[pg] = new QuantifiedProteinGroup(pg); - ProteinGroupOccupancies[pg].OccupancyLevel = "peptide"; + ProteinGroupQuantObjects[pg] = new QuantifiedProteinGroup(pg); + 
ProteinGroupQuantObjects[pg].OccupancyLevel = "peptide"; } - var proteinGroup = ProteinGroupOccupancies[pg]; + var proteinGroup = ProteinGroupQuantObjects[pg]; // Go through the proteins in each protein group foreach (var proteinName in pg.Split('|')) @@ -300,6 +307,10 @@ public void CalculateOccupancies(List<(string fullSeq, List proteinGroup if (!proteinGroup.Proteins.ContainsKey(proteinName)) { proteinGroup.Proteins[proteinName] = new QuantifiedProtein(proteinName); + if (proteinSequences.IsNotNullOrEmpty() && proteinSequences.ContainsKey(proteinName)) + { + proteinGroup.Proteins[proteinName].Sequence = proteinSequences[proteinName]; + } } var protein = proteinGroup.Proteins[proteinName]; diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index 50b1ca571..0a0cf3130 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -6,6 +6,7 @@ using FlashLFQ; using System.Linq; using Proteomics.AminoAcidPolymer; +using System; namespace Test { @@ -289,6 +290,53 @@ public void TestQuantifiedProtein() Assert.AreEqual(stoich[10]["C-Terminal UniProt: Lysine Amide on K"], 1); } + [Test] + public void TestQuantifiedProteinGroup() + { + // Test correct arguments where protein group name contains the names of the proteins + var protein1 = new QuantifiedProtein(accession: "PROT1", sequence: "AAAYYY", peptides: new Dictionary()); + var protein2 = new QuantifiedProtein(accession: "PROT2", sequence: "AAARRR", peptides: new Dictionary()); + var proteins = new Dictionary { { protein1.Accession, protein1 }, + { protein2.Accession, protein2 } }; + var proteinGroup = new QuantifiedProteinGroup("PROT1|PROT2", proteins); + Assert.AreEqual(proteinGroup.Proteins.Count, 2); + Assert.AreEqual(proteinGroup.Proteins["PROT1"].Accession, "PROT1"); + Assert.AreEqual(proteinGroup.Proteins["PROT2"].Accession, "PROT2"); + + // Test incorrect argument where protein group name does not contain the names of the proteins + var errorMessage = "The number of proteins 
provided does not match the number of proteins in the protein group name."; + var exception1 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1|PROT2", new Dictionary { { protein1.Accession, protein1 } })); + Assert.AreEqual(exception1.Message, errorMessage); + + var exception2 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1", proteins)); + Assert.AreEqual(exception2.Message, errorMessage); + + var exception3 = Assert.Throws(() => new QuantifiedProteinGroup("PROT1|PROT2|PROT3", proteins)); + Assert.AreEqual(exception3.Message, errorMessage); + } + + [Test] + public void TestSetUpQuantificationObjects() + { + var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]"; + var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - methylglycine on G]K-[C-Terminal UniProt: Lysine Amide on K]"; + var fullSequences = new List { fullSeq1, fullSeq2 }; + var proteinGroups = new List { "TESTPROT1|TESTPROT2", "TESTPROT3" }; + var proteinSequences = new Dictionary { { "TESTPROT1", "GKAAAAAAK" }, + { "TESTPROT2", "AKAAAAAGK" }, + { "TESTPROT3", "AKGK"} }; + var intensities = new List { 1, 5 }; + var sequenceInputs = new List<(string, List, double)> { }; + for (int i = 0; i < 2; i++) + { + sequenceInputs.Add((fullSequences[i], proteinGroups, intensities[i])); + } + + var quantificationObjects = new PositionFrequencyAnalysis(); + quantificationObjects.SetUpQuantificationObjects(sequenceInputs, proteinSequences); + // NEED TO FINISH THIS TEST + } + public struct TestStruct { public int X { get; set; } From 31c40cde3da54c980267599a67b86e582c82a9ed Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Thu, 28 Aug 2025 13:34:58 -0500 Subject: [PATCH 4/7] Finished TestSetUpQuantificationObjects. Removed Peptides field (and its population) from SetUpQuantificationObjects method for now. 
--- mzLib/MzLibUtil/PositionFrequencyAnalysis.cs | 32 +++++--------------- mzLib/Test/TestMzLibUtil.cs | 27 ++++++++++++++--- 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs index 3e63547d3..b195c6932 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs @@ -234,10 +234,10 @@ public class QuantifiedProteinGroup { public string Name { get; set; } public Dictionary Proteins { get; set; } - public string OccupancyLevel { get; set; } public QuantifiedProteinGroup(string name, Dictionary proteins = null) { + proteins = proteins ?? new Dictionary(); string splitPattern = @";|\|"; var proteinAccessions = Regex.Split(name, splitPattern); if ((proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x))) || proteins.IsNullOrEmpty()) @@ -246,7 +246,7 @@ public QuantifiedProteinGroup(string name, Dictionary Proteins = proteins ?? new Dictionary(); } else - {ProteinGroupQuantObjects + { throw new Exception("The number of proteins provided does not match the number of proteins in the protein group name."); } } @@ -254,51 +254,35 @@ public QuantifiedProteinGroup(string name, Dictionary public class PositionFrequencyAnalysis { - public Dictionary ProteinGroupQuantObjects { get; private set; } - public Dictionary PeptideQuantObjects { get; private set; } + public Dictionary ProteinGroups { get; private set; } + //public Dictionary Peptides { get; private set; } /// /// Calculates the occupancy of post-translational modifications at the peptide level. /// /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. - /// If true, terminal modifications will be ignored. 
/// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for /// all of the amino acids in that peptide. /// public void SetUpQuantificationObjects(List<(string fullSeq, List proteinGroups, double intensity)> peptides, Dictionary proteinSequences=null) { - ProteinGroupQuantObjects = new Dictionary(); - PeptideQuantObjects = new Dictionary(); + ProteinGroups = new Dictionary(); // Go through the peptides given foreach (var pep in peptides) { - //string baseSeq = pep.Item2.IsNotNullOrEmpty() ? pep.Item2 : new string(pep.Item1.ToCharArray()); // in case it is null or empty and we need to get the base sequence from the full sequence - //ClassExtensions.RemoveSpecialCharacters(ref baseSeq, @"", ClassExtensions.modificationPattern); string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); - if (!PeptideQuantObjects.ContainsKey(pep.fullSeq)) - { - // Need to make sure clustering of proteingroups is correct - string proteinGroupsJoined = string.Join(";", pep.proteinGroups); - PeptideQuantObjects[pep.fullSeq] = (new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity), proteinGroupsJoined); - } - else - { - PeptideQuantObjects[pep.fullSeq].QuantifiedPeptide.AddFullSequence(pep.fullSeq, intensity: pep.intensity); - } - // Go through the peptide's protein groups foreach (var pg in pep.proteinGroups) { // If have not seen that protein group, store it - if (!ProteinGroupQuantObjects.ContainsKey(pg)) + if (!ProteinGroups.ContainsKey(pg)) { - ProteinGroupQuantObjects[pg] = new QuantifiedProteinGroup(pg); - ProteinGroupQuantObjects[pg].OccupancyLevel = "peptide"; + ProteinGroups[pg] = new QuantifiedProteinGroup(pg); } - var proteinGroup = 
ProteinGroupQuantObjects[pg]; + var proteinGroup = ProteinGroups[pg]; // Go through the proteins in each protein group foreach (var proteinName in pg.Split('|')) diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index 0a0cf3130..bb95ab520 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -7,6 +7,7 @@ using System.Linq; using Proteomics.AminoAcidPolymer; using System; +using NUnit.Framework.Legacy; namespace Test { @@ -331,10 +332,28 @@ public void TestSetUpQuantificationObjects() { sequenceInputs.Add((fullSequences[i], proteinGroups, intensities[i])); } - - var quantificationObjects = new PositionFrequencyAnalysis(); - quantificationObjects.SetUpQuantificationObjects(sequenceInputs, proteinSequences); - // NEED TO FINISH THIS TEST + sequenceInputs.Add(("AAAA", new List { "TESTPROT1|TESTPROT2" }, 10)); + + var quant = new PositionFrequencyAnalysis(); + quant.SetUpQuantificationObjects(sequenceInputs, proteinSequences); + Assert.AreEqual(quant.ProteinGroups.Count, 2); + Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT1")); + Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT2")); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Accession, "TESTPROT1"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Sequence, "GKAAAAAAK"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides["GK"].FullSequences.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides["AAAA"].FullSequences.Count, 1); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Accession, "TESTPROT2"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Sequence, 
"AKAAAAAGK"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides["GK"].FullSequences.Count, 2); + Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides["AAAA"].FullSequences.Count, 1); + + Assert.That(quant.ProteinGroups["TESTPROT3"].Proteins.Keys.Contains("TESTPROT3")); + Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Accession, "TESTPROT3"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Sequence, "AKGK"); + Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Peptides.Count, 1); } public struct TestStruct From 1cbfbaf81ff9dcc540f68b27729efa9e35c91194 Mon Sep 17 00:00:00 2001 From: pcruzparri Date: Mon, 1 Sep 2025 18:36:05 -0500 Subject: [PATCH 5/7] Refactored quantification util classes --- mzLib/MzLibUtil/PositionFrequencyAnalysis.cs | 316 ------------------ .../PositionFrequencyAnalysis.cs | 68 ++++ .../QuantifiedModification.cs | 20 ++ .../QuantifiedPeptide.cs | 102 ++++++ .../QuantifiedProtein.cs | 118 +++++++ .../QuantifiedProteinGroup.cs | 29 ++ mzLib/Test/TestMzLibUtil.cs | 3 +- 7 files changed, 339 insertions(+), 317 deletions(-) delete mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis.cs create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs deleted file mode 100644 index b195c6932..000000000 --- 
a/mzLib/MzLibUtil/PositionFrequencyAnalysis.cs +++ /dev/null @@ -1,316 +0,0 @@ -using Easy.Common.Extensions; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text.RegularExpressions; - -namespace MzLibUtil -{ - public class QuantifiedModification - { - public string IdWithMotif { get; set; } - public string ModificationLocalization { get; set; } // e.g. "N-terminus", "C-terminus", or amino acid name - public int PeptidePositionZeroIsNTerminus { get; set; } - public int ProteinPositionZeroIsNTerminus { get; set; } - public double Intensity { get; set; } - - public QuantifiedModification(string idWithMotif, int positionInPeptide, int? positionInProtein = null, string modLocalization = null, double intensity = 0) - { - IdWithMotif = idWithMotif; - PeptidePositionZeroIsNTerminus = positionInPeptide; - ProteinPositionZeroIsNTerminus = positionInProtein ?? -1; // -1 means that the position in the protein is unknown - ModificationLocalization = modLocalization ?? "Unknown"; - Intensity = intensity; - } - } - /// - /// A class to store information about a quantified peptides sharing the same base sequence. 
- /// - public class QuantifiedPeptide - { - public HashSet FullSequences { get; set; } - public string BaseSequence { get; set; } - public QuantifiedProtein ParentProtein { get; set; } - public int OneBasedStartIndexInProtein { get; set; } - public Dictionary> ModifiedAminoAcidPositions { get; set; } - public double Intensity { get; set; } - - public QuantifiedPeptide(string fullSequence, int oneBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) - { - ModifiedAminoAcidPositions = new Dictionary>(); - OneBasedStartIndexInProtein = oneBasedStartIndexInProtein; // -1 means that the position in the protein is unknown - Intensity = intensity; - FullSequences = new HashSet { fullSequence }; - _SetBaseSequence(fullSequence, modPattern); - _SetModifications(fullSequence, intensity); - } - - public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null) - { - if (BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) - { - FullSequences.Add(fullSeq); - Intensity += intensity; - _SetModifications(fullSeq, intensity); // updating the intensity is done here - } - else - { - throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); - } - } - - public void MergePeptide(QuantifiedPeptide peptideToMerge) - { - if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) - { - throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); - } - foreach (var fullSeq in peptideToMerge.FullSequences) - { - FullSequences.Add(fullSeq); - _SetModifications(fullSeq, peptideToMerge.Intensity); // updating the intensity is done here - } - Intensity += peptideToMerge.Intensity; - } - - private void _SetModifications(string fullSeq, double intensity = 0) - { - var mods = fullSeq.ParseModifications(); - - if (mods.IsNotNullOrEmpty()) - { - foreach (var modpos in mods.Keys) - { - var mod = 
mods[modpos]; - if (!ModifiedAminoAcidPositions.ContainsKey(modpos)) - { - ModifiedAminoAcidPositions[modpos] = new Dictionary(); - } - - if (!ModifiedAminoAcidPositions[modpos].ContainsKey(mod)) - { - var modLocalization = modpos == 0 ? "N-terminus" : (modpos == BaseSequence.Length + 1 ? "C-terminus" : BaseSequence[modpos - 1].ToString()); - ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, modLocalization: modLocalization, intensity: 0); - } - ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; - - // Maybe should update/pass position in protein from here, too. - } - } - } - - private void _SetBaseSequence(string fullSeq, string modPattern) - { - BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern); - } - - public Dictionary> GetModStoichiometryForPeptide() - { - var aaModsStoichiometry = ModifiedAminoAcidPositions; - - foreach (var modpos in aaModsStoichiometry) - { - foreach (var mod in modpos.Value.Values) - { - mod.Intensity /= Intensity; - } - } - return aaModsStoichiometry; - } - } - - public class QuantifiedProtein - { - public string Accession { get; set; } - public string Sequence { get; set; } - public Dictionary Peptides { get; set; } - public Dictionary> ModifiedAminoAcidPositionsInProtein { get; set; } - public Dictionary> PeptidesByProteinPosition { get; set; } - - public QuantifiedProtein(string accession, string sequence = null, Dictionary peptides = null) - { - Accession = accession; - Sequence = sequence; - Peptides = peptides ?? 
new Dictionary(); - } - - public void SetProteinModsFromPeptides() - { - if (!Sequence.IsNotNullOrEmpty() || !Peptides.IsNotNullOrEmpty()) - { - throw new Exception("The protein sequence is unknown, or there're no peptides."); - } - - ModifiedAminoAcidPositionsInProtein = new Dictionary>(); - PeptidesByProteinPosition = new Dictionary>(); - - foreach (var peptide in Peptides.Values) - { - // if peptide position in protein is unknown, set it using the protein sequence - if (peptide.OneBasedStartIndexInProtein == -1) - { - peptide.OneBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; - } - // if peptide has no modifications, add to all its positions - if (!peptide.ModifiedAminoAcidPositions.IsNotNullOrEmpty()) - { - for (int i = 0; i < peptide.BaseSequence.Length; i++) - { - var pos = peptide.OneBasedStartIndexInProtein + i; - if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(pos)) - { - ModifiedAminoAcidPositionsInProtein[pos] = new Dictionary(); - PeptidesByProteinPosition[pos] = new HashSet(); - } - PeptidesByProteinPosition[pos].Add(peptide.BaseSequence); - } - continue; - } - - else // if peptide has modifications, add to modified positions - { - foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys) - { - var modPositionInProtein = modpos + peptide.OneBasedStartIndexInProtein - 1; - - // Ignore peptide terminal modifications that are not at the protein terminal - if ((modPositionInProtein != 0 && modpos == 0) // if the mod is at the N-terminus of the peptide, but not the protein. - || (modPositionInProtein != Sequence.Length + 1 && modpos == peptide.BaseSequence.Length + 1)) // if the mod is at the C-terminus of the peptide, but not the protein. 
- { - continue; - } - - if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(modPositionInProtein)) - { - ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new Dictionary(); - PeptidesByProteinPosition[modPositionInProtein] = new HashSet(); - } - PeptidesByProteinPosition[modPositionInProtein].Add(peptide.BaseSequence); - - foreach (var mod in peptide.ModifiedAminoAcidPositions[modpos].Values) - { - mod.ProteinPositionZeroIsNTerminus = modPositionInProtein; - - if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.IdWithMotif)) - { - ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif] = new QuantifiedModification(mod.IdWithMotif, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, null, 0); - } - ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif].Intensity += mod.Intensity; - } - } - } - } - - // clean up the dictionary to remove any empty modifications - var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => !x.Value.IsNotNullOrEmpty()).ToDictionary().Keys; - foreach (var pos in noModPositions) - { - ModifiedAminoAcidPositionsInProtein.Remove(pos); - PeptidesByProteinPosition.Remove(pos); - } - - } - - public Dictionary> GetModStoichiometryFromProteinMods() - { - SetProteinModsFromPeptides(); - var aaModsStoichiometry = new Dictionary>(); - foreach (var modpos in ModifiedAminoAcidPositionsInProtein.Keys) - { - aaModsStoichiometry[modpos] = new Dictionary(); - - double totalPositionIntensity = Peptides.Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)).Sum(x => x.Value.Intensity); - foreach (var mod in ModifiedAminoAcidPositionsInProtein[modpos].Values) - { - double modFraction = mod.Intensity / totalPositionIntensity; - aaModsStoichiometry[modpos].Add(mod.IdWithMotif, modFraction); - } - } - return aaModsStoichiometry; - } - } - - public class QuantifiedProteinGroup - { - public string Name { get; set; } - public Dictionary Proteins { get; set; } - - 
public QuantifiedProteinGroup(string name, Dictionary proteins = null) - { - proteins = proteins ?? new Dictionary(); - string splitPattern = @";|\|"; - var proteinAccessions = Regex.Split(name, splitPattern); - if ((proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x))) || proteins.IsNullOrEmpty()) - { - Name = name; - Proteins = proteins ?? new Dictionary(); - } - else - { - throw new Exception("The number of proteins provided does not match the number of proteins in the protein group name."); - } - } - } - public class PositionFrequencyAnalysis - { - - public Dictionary ProteinGroups { get; private set; } - //public Dictionary Peptides { get; private set; } - - /// - /// Calculates the occupancy of post-translational modifications at the peptide level. - /// - /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. - /// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity - /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for - /// all of the amino acids in that peptide. 
- /// - public void SetUpQuantificationObjects(List<(string fullSeq, List proteinGroups, double intensity)> peptides, Dictionary proteinSequences=null) - { - ProteinGroups = new Dictionary(); - - // Go through the peptides given - foreach (var pep in peptides) - { - string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); - - // Go through the peptide's protein groups - foreach (var pg in pep.proteinGroups) - { - // If have not seen that protein group, store it - if (!ProteinGroups.ContainsKey(pg)) - { - ProteinGroups[pg] = new QuantifiedProteinGroup(pg); - } - var proteinGroup = ProteinGroups[pg]; - - // Go through the proteins in each protein group - foreach (var proteinName in pg.Split('|')) - { - // Add the protein to the protein group's dictionary if it has not been added - if (!proteinGroup.Proteins.ContainsKey(proteinName)) - { - proteinGroup.Proteins[proteinName] = new QuantifiedProtein(proteinName); - if (proteinSequences.IsNotNullOrEmpty() && proteinSequences.ContainsKey(proteinName)) - { - proteinGroup.Proteins[proteinName].Sequence = proteinSequences[proteinName]; - } - } - var protein = proteinGroup.Proteins[proteinName]; - - // If the peptide's base sequence has not been seen, add it to the protein's dictionary - if (!protein.Peptides.ContainsKey(baseSeq)) - { - protein.Peptides[baseSeq] = new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity); - } - else - { - // If the peptide's base sequence has been seen, add the new full sequence to the existing peptide - protein.Peptides[baseSeq].AddFullSequence(pep.fullSeq, intensity: pep.intensity); - } - } - } - } - } - } -} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs new file mode 100644 index 000000000..0b1b123f0 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs @@ -0,0 +1,68 @@ +using Easy.Common.Extensions; +using System.Collections.Generic; 
+ +namespace MzLibUtil.PositionFrequencyAnalysis +{ + public class PositionFrequencyAnalysis + { + public Dictionary ProteinGroups { get; private set; } + + //public Dictionary Peptides { get; private set; } + + /// + /// Calculates the occupancy of post-translational modifications at the peptide level. + /// + /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. + /// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity + /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for + /// all of the amino acids in that peptide. + /// + public void SetUpQuantificationObjectsFromFullSequences(List<(string fullSeq, List proteinGroups, double intensity)> peptides, Dictionary proteinSequences=null) + { + ProteinGroups = new Dictionary(); + + // Go through the peptides given + foreach (var pep in peptides) + { + string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); + + // Go through the peptide's protein groups + foreach (var pg in pep.proteinGroups) + { + // If have not seen that protein group, store it + if (!ProteinGroups.ContainsKey(pg)) + { + ProteinGroups[pg] = new QuantifiedProteinGroup(pg); + } + var proteinGroup = ProteinGroups[pg]; + + // Go through the proteins in each protein group + foreach (var proteinName in pg.Split('|')) + { + // Add the protein to the protein group's dictionary if it has not been added + if (!proteinGroup.Proteins.ContainsKey(proteinName)) + { + proteinGroup.Proteins[proteinName] = new QuantifiedProtein(proteinName); + if (proteinSequences.IsNotNullOrEmpty() && proteinSequences.ContainsKey(proteinName)) + { + proteinGroup.Proteins[proteinName].Sequence = 
proteinSequences[proteinName]; + } + } + var protein = proteinGroup.Proteins[proteinName]; + + // If the peptide's base sequence has not been seen, add it to the protein's dictionary + if (!protein.Peptides.ContainsKey(baseSeq)) + { + protein.Peptides[baseSeq] = new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity); + } + else + { + // If the peptide's base sequence has been seen, add the new full sequence to the existing peptide + protein.Peptides[baseSeq].AddFullSequence(pep.fullSeq, intensity: pep.intensity); + } + } + } + } + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs new file mode 100644 index 000000000..cc3571101 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs @@ -0,0 +1,20 @@ +namespace MzLibUtil.PositionFrequencyAnalysis +{ + public class QuantifiedModification + { + public string IdWithMotif { get; set; } + public string ModificationLocalization { get; set; } // e.g. "N-terminus", "C-terminus", or amino acid name + public int PeptidePositionZeroIsNTerminus { get; set; } + public int ProteinPositionZeroIsNTerminus { get; set; } + public double Intensity { get; set; } + + public QuantifiedModification(string idWithMotif, int positionInPeptide, int? positionInProtein = null, string modLocalization = null, double intensity = 0) + { + IdWithMotif = idWithMotif; + PeptidePositionZeroIsNTerminus = positionInPeptide; + ProteinPositionZeroIsNTerminus = positionInProtein ?? -1; // -1 means that the position in the protein is unknown + ModificationLocalization = modLocalization ?? 
"Unknown"; + Intensity = intensity; + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs new file mode 100644 index 000000000..83ac0b965 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs @@ -0,0 +1,102 @@ +using Easy.Common.Extensions; +using System; +using System.Collections.Generic; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + /// + /// A class to store information about a quantified peptides sharing the same base sequence. + /// + public class QuantifiedPeptide + { + public HashSet FullSequences { get; set; } + public string BaseSequence { get; set; } + public QuantifiedProtein ParentProtein { get; set; } + public int OneBasedStartIndexInProtein { get; set; } + public Dictionary> ModifiedAminoAcidPositions { get; set; } + public double Intensity { get; set; } + + public QuantifiedPeptide(string fullSequence, int oneBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) + { + ModifiedAminoAcidPositions = new Dictionary>(); + OneBasedStartIndexInProtein = oneBasedStartIndexInProtein; // -1 means that the position in the protein is unknown + Intensity = intensity; + FullSequences = new HashSet { fullSequence }; + _SetBaseSequence(fullSequence, modPattern); + _SetModifications(fullSequence, intensity); + } + + public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null) + { + if (BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) + { + FullSequences.Add(fullSeq); + Intensity += intensity; + _SetModifications(fullSeq, intensity); // updating the intensity is done here + } + else + { + throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); + } + } + + public void MergePeptide(QuantifiedPeptide peptideToMerge) + { + if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) + { + throw 
new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide."); + } + foreach (var fullSeq in peptideToMerge.FullSequences) + { + FullSequences.Add(fullSeq); + _SetModifications(fullSeq, peptideToMerge.Intensity); // updating the intensity is done here + } + Intensity += peptideToMerge.Intensity; + } + + private void _SetModifications(string fullSeq, double intensity = 0) + { + var mods = fullSeq.ParseModifications(); + + if (mods.IsNotNullOrEmpty()) + { + foreach (var modpos in mods.Keys) + { + var mod = mods[modpos]; + if (!ModifiedAminoAcidPositions.ContainsKey(modpos)) + { + ModifiedAminoAcidPositions[modpos] = new Dictionary(); + } + + if (!ModifiedAminoAcidPositions[modpos].ContainsKey(mod)) + { + var modLocalization = modpos == 0 ? "N-terminus" : modpos == BaseSequence.Length + 1 ? "C-terminus" : BaseSequence[modpos - 1].ToString(); + ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, modLocalization: modLocalization, intensity: 0); + } + ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; + + // Maybe should update/pass position in protein from here, too. 
+ } + } + } + + private void _SetBaseSequence(string fullSeq, string modPattern) + { + BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern); + } + + public Dictionary> GetModStoichiometryForPeptide() + { + var aaModsStoichiometry = ModifiedAminoAcidPositions; + + foreach (var modpos in aaModsStoichiometry) + { + foreach (var mod in modpos.Value.Values) + { + mod.Intensity /= Intensity; + } + } + return aaModsStoichiometry; + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs new file mode 100644 index 000000000..86a6cc18b --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -0,0 +1,118 @@ +using Easy.Common.Extensions; +using System; +using System.Collections.Generic; +using System.Linq; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + public class QuantifiedProtein + { + public string Accession { get; set; } + public string Sequence { get; set; } + public Dictionary Peptides { get; set; } + public Dictionary> ModifiedAminoAcidPositionsInProtein { get; set; } + public Dictionary> PeptidesByProteinPosition { get; set; } + + public QuantifiedProtein(string accession, string sequence = null, Dictionary peptides = null) + { + Accession = accession; + Sequence = sequence; + Peptides = peptides ?? 
new Dictionary(); + } + + public void SetProteinModsFromPeptides() + { + if (!Sequence.IsNotNullOrEmpty() || !Peptides.IsNotNullOrEmpty()) + { + throw new Exception("The protein sequence is unknown, or there're no peptides."); + } + + ModifiedAminoAcidPositionsInProtein = new Dictionary>(); + PeptidesByProteinPosition = new Dictionary>(); + + foreach (var peptide in Peptides.Values) + { + // if peptide position in protein is unknown, set it using the protein sequence + if (peptide.OneBasedStartIndexInProtein == -1) + { + peptide.OneBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; + } + // if peptide has no modifications, add to all its positions + if (!peptide.ModifiedAminoAcidPositions.IsNotNullOrEmpty()) + { + for (int i = 0; i < peptide.BaseSequence.Length; i++) + { + var pos = peptide.OneBasedStartIndexInProtein + i; + if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(pos)) + { + ModifiedAminoAcidPositionsInProtein[pos] = new Dictionary(); + PeptidesByProteinPosition[pos] = new HashSet(); + } + PeptidesByProteinPosition[pos].Add(peptide.BaseSequence); + } + continue; + } + + else // if peptide has modifications, add to modified positions + { + foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys) + { + var modPositionInProtein = modpos + peptide.OneBasedStartIndexInProtein - 1; + + // Ignore peptide terminal modifications that are not at the protein terminal + if (modPositionInProtein != 0 && modpos == 0 // if the mod is at the N-terminus of the peptide, but not the protein. + || modPositionInProtein != Sequence.Length + 1 && modpos == peptide.BaseSequence.Length + 1) // if the mod is at the C-terminus of the peptide, but not the protein. 
+ { + continue; + } + + if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(modPositionInProtein)) + { + ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new Dictionary(); + PeptidesByProteinPosition[modPositionInProtein] = new HashSet(); + } + PeptidesByProteinPosition[modPositionInProtein].Add(peptide.BaseSequence); + + foreach (var mod in peptide.ModifiedAminoAcidPositions[modpos].Values) + { + mod.ProteinPositionZeroIsNTerminus = modPositionInProtein; + + if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.IdWithMotif)) + { + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif] = new QuantifiedModification(mod.IdWithMotif, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, null, 0); + } + ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif].Intensity += mod.Intensity; + } + } + } + } + + // clean up the dictionary to remove any empty modifications + var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => !x.Value.IsNotNullOrEmpty()).ToDictionary().Keys; + foreach (var pos in noModPositions) + { + ModifiedAminoAcidPositionsInProtein.Remove(pos); + PeptidesByProteinPosition.Remove(pos); + } + + } + + public Dictionary> GetModStoichiometryFromProteinMods() + { + SetProteinModsFromPeptides(); + var aaModsStoichiometry = new Dictionary>(); + foreach (var modpos in ModifiedAminoAcidPositionsInProtein.Keys) + { + aaModsStoichiometry[modpos] = new Dictionary(); + + double totalPositionIntensity = Peptides.Where(pep => PeptidesByProteinPosition[modpos].Contains(pep.Key)).Sum(x => x.Value.Intensity); + foreach (var mod in ModifiedAminoAcidPositionsInProtein[modpos].Values) + { + double modFraction = mod.Intensity / totalPositionIntensity; + aaModsStoichiometry[modpos].Add(mod.IdWithMotif, modFraction); + } + } + return aaModsStoichiometry; + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs 
b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs new file mode 100644 index 000000000..fbc1dc94f --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs @@ -0,0 +1,29 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text.RegularExpressions; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + public class QuantifiedProteinGroup + { + public string Name { get; set; } + public Dictionary Proteins { get; set; } + + public QuantifiedProteinGroup(string name, Dictionary proteins = null) + { + proteins = proteins ?? new Dictionary(); + string splitPattern = @";|\|"; + var proteinAccessions = Regex.Split(name, splitPattern); + if (proteinAccessions.Length == proteins.Count && proteinAccessions.OrderBy(x => x).SequenceEqual(proteins.Keys.OrderBy(x => x)) || proteins.IsNullOrEmpty()) + { + Name = name; + Proteins = proteins ?? new Dictionary(); + } + else + { + throw new Exception("The number of proteins provided does not match the number of proteins in the protein group name."); + } + } + } +} diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs index bb95ab520..5851397f5 100644 --- a/mzLib/Test/TestMzLibUtil.cs +++ b/mzLib/Test/TestMzLibUtil.cs @@ -8,6 +8,7 @@ using Proteomics.AminoAcidPolymer; using System; using NUnit.Framework.Legacy; +using MzLibUtil.PositionFrequencyAnalysis; namespace Test { @@ -335,7 +336,7 @@ public void TestSetUpQuantificationObjects() sequenceInputs.Add(("AAAA", new List { "TESTPROT1|TESTPROT2" }, 10)); var quant = new PositionFrequencyAnalysis(); - quant.SetUpQuantificationObjects(sequenceInputs, proteinSequences); + quant.SetUpQuantificationObjectsFromFullSequences(sequenceInputs, proteinSequences); Assert.AreEqual(quant.ProteinGroups.Count, 2); Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT1")); Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT2")); From 
6389de1f37ab94040b0856d175dc43f5acab3869 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Thu, 2 Oct 2025 13:38:32 -0500 Subject: [PATCH 6/7] improving quantprot exception throw. --- .../PositionFrequencyAnalysis/QuantifiedProtein.cs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs index 86a6cc18b..1097b09eb 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -22,9 +22,14 @@ public QuantifiedProtein(string accession, string sequence = null, Dictionary>(); From 302edd75e2233d074112819ee21cf9952500d639 Mon Sep 17 00:00:00 2001 From: Peter Cruz Parrilla Date: Tue, 28 Oct 2025 18:12:26 -0500 Subject: [PATCH 7/7] Extended commenting. Added a peptide record class that stores the peptide input for setting up the protein groups and the quantifications. --- mzLib/MzLibUtil/ClassExtensions.cs | 6 +++ .../PositionFrequencyAnalysis.cs | 43 +++++++++--------- .../QuantifiedModification.cs | 18 +++++--- .../QuantifiedPeptide.cs | 40 ++++++++++++++--- .../QuantifiedPeptideRecord.cs | 28 ++++++++++++ .../QuantifiedProtein.cs | 44 ++++++++++++++++--- .../QuantifiedProteinGroup.cs | 13 ++++++ 7 files changed, 154 insertions(+), 38 deletions(-) create mode 100644 mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs index 5eb425276..995123380 100644 --- a/mzLib/MzLibUtil/ClassExtensions.cs +++ b/mzLib/MzLibUtil/ClassExtensions.cs @@ -26,6 +26,7 @@ namespace MzLibUtil public static class ClassExtensions { public static readonly string ModificationPattern = @"-?\[(.+?)(? /// Applies a boxcar smoothing algorithm to the input data. 
@@ -304,5 +305,10 @@ public static void RemoveSpecialCharacters(ref string fullSeq, string replacemen Regex regexSpecialChar = new(specialCharacter); fullSeq = regexSpecialChar.Replace(fullSeq, replacement); } + + public static string[] SplitProteinAccessions(this string proteinGroupName) + { + return Regex.Split(proteinGroupName, ProteinSplitPattern); + } } } \ No newline at end of file diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs index 0b1b123f0..16a6eee95 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs @@ -3,31 +3,33 @@ namespace MzLibUtil.PositionFrequencyAnalysis { + /// + /// Handles analysis and organization of protein group quantification from peptide records. + /// public class PositionFrequencyAnalysis { + /// + /// Dictionary mapping protein group names to their quantification data. + /// public Dictionary ProteinGroups { get; private set; } - //public Dictionary Peptides { get; private set; } - /// - /// Calculates the occupancy of post-translational modifications at the peptide level. + /// Populates protein groups with their respective proteins and peptides from a list of quantified peptide records. + /// The resulting protein groups are stored in the ProteinGroups property with the protein group name strings as keys. /// - /// A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List ProteinGroups, Intensity) for each peptide. 
- /// A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity - /// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for - /// all of the amino acids in that peptide. - /// - public void SetUpQuantificationObjectsFromFullSequences(List<(string fullSeq, List proteinGroups, double intensity)> peptides, Dictionary proteinSequences=null) + /// A list of QuantifiedPeptideRecord, which store a peptide's full sequence, mapped protein groups, and intensity. + /// An optional dictionary of protein sequences to use for mapping peptides to proteins. + /// If not provided, the protein sequences will be left null in the QuantifiedProtein objects. However, this parameter should not be null if what we want + /// is a protein stoichiometry, since it is needed to align the peptides to the parent protein. + public void SetUpQuantificationFromQuantifiedPeptideRecords(List peptides, Dictionary proteinSequences=null) { ProteinGroups = new Dictionary(); - - // Go through the peptides given - foreach (var pep in peptides) + foreach (var peptide in peptides) { - string baseSeq = pep.fullSeq.GetBaseSequenceFromFullSequence(); - - // Go through the peptide's protein groups - foreach (var pg in pep.proteinGroups) + // Iterate through the peptide's protein groups in case the peptide is shared between protein groups. + // We want to map the peptide separately to each protein group it belongs to, primarily because + // each protein group is reported separately in MetaMorpheus. 
+ foreach (var pg in peptide.ProteinGroups) { // If have not seen that protein group, store it if (!ProteinGroups.ContainsKey(pg)) @@ -36,8 +38,7 @@ public void SetUpQuantificationObjectsFromFullSequences(List<(string fullSeq, Li } var proteinGroup = ProteinGroups[pg]; - // Go through the proteins in each protein group - foreach (var proteinName in pg.Split('|')) + foreach (var proteinName in pg.SplitProteinAccessions()) { // Add the protein to the protein group's dictionary if it has not been added if (!proteinGroup.Proteins.ContainsKey(proteinName)) @@ -51,14 +52,14 @@ public void SetUpQuantificationObjectsFromFullSequences(List<(string fullSeq, Li var protein = proteinGroup.Proteins[proteinName]; // If the peptide's base sequence has not been seen, add it to the protein's dictionary - if (!protein.Peptides.ContainsKey(baseSeq)) + if (!protein.Peptides.ContainsKey(peptide.BaseSequence)) { - protein.Peptides[baseSeq] = new QuantifiedPeptide(pep.fullSeq, intensity: pep.intensity); + protein.Peptides[peptide.BaseSequence] = new QuantifiedPeptide(peptide.FullSequence, intensity: peptide.Intensity); } else { // If the peptide's base sequence has been seen, add the new full sequence to the existing peptide - protein.Peptides[baseSeq].AddFullSequence(pep.fullSeq, intensity: pep.intensity); + protein.Peptides[peptide.BaseSequence].AddFullSequence(peptide.FullSequence, intensity: peptide.Intensity); } } } diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs index cc3571101..63e955f74 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs @@ -1,19 +1,27 @@ namespace MzLibUtil.PositionFrequencyAnalysis { + /// + /// A class to store information about a quantified modification. 
+ /// public class QuantifiedModification { - public string IdWithMotif { get; set; } - public string ModificationLocalization { get; set; } // e.g. "N-terminus", "C-terminus", or amino acid name + public string Name { get; set; } public int PeptidePositionZeroIsNTerminus { get; set; } public int ProteinPositionZeroIsNTerminus { get; set; } public double Intensity { get; set; } - public QuantifiedModification(string idWithMotif, int positionInPeptide, int? positionInProtein = null, string modLocalization = null, double intensity = 0) + /// + /// Constructor for a QuantifiedModification object. + /// + /// Full name of the modification, including the motif, in the format "MODTYPE: MODID on MOTIF" + /// Zero-based position in the peptide. + /// Zero-based position in the peptide's parent protein. + /// + public QuantifiedModification(string name, int positionInPeptide, int? positionInProtein = null, double intensity = 0) { - IdWithMotif = idWithMotif; + Name = name; PeptidePositionZeroIsNTerminus = positionInPeptide; ProteinPositionZeroIsNTerminus = positionInProtein ?? -1; // -1 means that the position in the protein is unknown - ModificationLocalization = modLocalization ?? 
"Unknown"; Intensity = intensity; } } diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs index 83ac0b965..0f5ec12fb 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs @@ -12,20 +12,40 @@ public class QuantifiedPeptide public HashSet FullSequences { get; set; } public string BaseSequence { get; set; } public QuantifiedProtein ParentProtein { get; set; } - public int OneBasedStartIndexInProtein { get; set; } + public int ZeroBasedStartIndexInProtein { get; set; } + + /// + /// Dictionary mapping zero-based amino acid positions in the peptide to dictionaries of + /// modification IDs and their corresponding QuantifiedModification objects. This property + /// stores ALL of the modifications observed for this peptide across all full sequences. + /// public Dictionary> ModifiedAminoAcidPositions { get; set; } public double Intensity { get; set; } - public QuantifiedPeptide(string fullSequence, int oneBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) + /// + /// Constructor for a QuantifiedPeptide object. The base sequence and modifications are parsed from the full sequence. 
+ /// + /// + /// + /// + /// + public QuantifiedPeptide(string fullSequence, int zeroBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null) { ModifiedAminoAcidPositions = new Dictionary>(); - OneBasedStartIndexInProtein = oneBasedStartIndexInProtein; // -1 means that the position in the protein is unknown + ZeroBasedStartIndexInProtein = zeroBasedStartIndexInProtein; // -1 means that the position in the protein is unknown Intensity = intensity; FullSequences = new HashSet { fullSequence }; _SetBaseSequence(fullSequence, modPattern); _SetModifications(fullSequence, intensity); } + /// + /// Adds a new full sequence to the peptide, updating modifications and intensity accordingly. + /// + /// + /// + /// + /// public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null) { if (BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence())) @@ -40,6 +60,11 @@ public void AddFullSequence(string fullSeq, double intensity = 0, string modPatt } } + /// + /// Merges another QuantifiedPeptide object into this one, combining their full sequences and intensities. + /// + /// + /// public void MergePeptide(QuantifiedPeptide peptideToMerge) { if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence) @@ -70,8 +95,7 @@ private void _SetModifications(string fullSeq, double intensity = 0) if (!ModifiedAminoAcidPositions[modpos].ContainsKey(mod)) { - var modLocalization = modpos == 0 ? "N-terminus" : modpos == BaseSequence.Length + 1 ? 
"C-terminus" : BaseSequence[modpos - 1].ToString(); - ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, modLocalization: modLocalization, intensity: 0); + ModifiedAminoAcidPositions[modpos][mod] = new QuantifiedModification(mod, modpos, intensity: 0); } ModifiedAminoAcidPositions[modpos][mod].Intensity += intensity; @@ -85,6 +109,12 @@ private void _SetBaseSequence(string fullSeq, string modPattern) BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern); } + /// + /// Returns the modification stoichiometry for this peptide as a dictionary mapping + /// zero-based amino acid positions in the peptide to dictionaries of modification IDs and their corresponding + /// QuantifiedModification objects with normalized intensities (i.e., divided by the total peptide intensity). + /// + /// public Dictionary> GetModStoichiometryForPeptide() { var aaModsStoichiometry = ModifiedAminoAcidPositions; diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs new file mode 100644 index 000000000..9cff54391 --- /dev/null +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs @@ -0,0 +1,28 @@ +using System.Collections.Generic; +using System.Text.RegularExpressions; + +namespace MzLibUtil.PositionFrequencyAnalysis +{ + public class QuantifiedPeptideRecord + { + public string FullSequence { get; set; } + public string BaseSequence { get; set; } + public HashSet ProteinGroups { get; set; } + public double Intensity { get; set; } + /// + /// A record of a quantified peptide, storing its full sequence (with modifications), base sequence (without modifications), + /// protein groups it maps to, and intensity. The base sequence is derived from the full sequence and is not passed + /// as initialization parameter. 
+ /// + /// + /// + /// + public QuantifiedPeptideRecord(string fullSequence, HashSet proteinGroups, double intensity) + { + FullSequence = fullSequence; + ProteinGroups = proteinGroups; + Intensity = intensity; + BaseSequence = fullSequence.GetBaseSequenceFromFullSequence(); + } + } +} diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs index 1097b09eb..21412f1e5 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs @@ -5,12 +5,32 @@ namespace MzLibUtil.PositionFrequencyAnalysis { + /// + /// A class to store information about a quantified protein. The protein contains peptides + /// clustered by their base sequence, rather than by their full sequence. Full sequences are stored + /// in the QuantifiedPeptide objects. + /// public class QuantifiedProtein { public string Accession { get; set; } public string Sequence { get; set; } + + /// + /// Dictionary mapping peptide base sequences to their corresponding QuantifiedPeptide objects. + /// public Dictionary Peptides { get; set; } + + /// + /// Dictionary mapping zero-based amino acid positions in the protein to dictionaries of + /// modification IDs and their corresponding QuantifiedModification objects. + /// Note: the modification positions are 0-based with the N-terminus of the protein being position 0. + /// public Dictionary> ModifiedAminoAcidPositionsInProtein { get; set; } + + /// + /// Dictionary mapping zero-based amino acid positions in the protein to sets of peptide base sequences + /// This is useful to know which peptides contribute to the modification and total intensity at a given position. 
+ /// public Dictionary> PeptidesByProteinPosition { get; set; } public QuantifiedProtein(string accession, string sequence = null, Dictionary peptides = null) @@ -20,6 +40,10 @@ public QuantifiedProtein(string accession, string sequence = null, Dictionary(); } + /// + /// Parses and aggregates modifications from the protein's peptides to set the ModifiedAminoAcidPositionsInProtein property. + /// + /// public void SetProteinModsFromPeptides() { if (Sequence.IsNullOrEmpty()) @@ -38,16 +62,16 @@ public void SetProteinModsFromPeptides() foreach (var peptide in Peptides.Values) { // if peptide position in protein is unknown, set it using the protein sequence - if (peptide.OneBasedStartIndexInProtein == -1) + if (peptide.ZeroBasedStartIndexInProtein == -1) { - peptide.OneBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; + peptide.ZeroBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence) + 1; } - // if peptide has no modifications, add to all its positions - if (!peptide.ModifiedAminoAcidPositions.IsNotNullOrEmpty()) + // if peptide has no modifications, add to all of the aminoacid positions in the protein that it covers + if (peptide.ModifiedAminoAcidPositions.IsNullOrEmpty()) { for (int i = 0; i < peptide.BaseSequence.Length; i++) { - var pos = peptide.OneBasedStartIndexInProtein + i; + var pos = peptide.ZeroBasedStartIndexInProtein + i; if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(pos)) { ModifiedAminoAcidPositionsInProtein[pos] = new Dictionary(); @@ -62,7 +86,7 @@ public void SetProteinModsFromPeptides() { foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys) { - var modPositionInProtein = modpos + peptide.OneBasedStartIndexInProtein - 1; + var modPositionInProtein = modpos + peptide.ZeroBasedStartIndexInProtein - 1; // Ignore peptide terminal modifications that are not at the protein terminal if (modPositionInProtein != 0 && modpos == 0 // if the mod is at the N-terminus of the peptide, but not the protein. 
@@ -93,7 +117,7 @@ public void SetProteinModsFromPeptides() } // clean up the dictionary to remove any empty modifications - var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => !x.Value.IsNotNullOrEmpty()).ToDictionary().Keys; + var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => x.Value.IsNullOrEmpty()).ToDictionary().Keys; foreach (var pos in noModPositions) { ModifiedAminoAcidPositionsInProtein.Remove(pos); @@ -102,6 +126,12 @@ public void SetProteinModsFromPeptides() } + /// + /// Calculates the stoichiometry of modifications at each amino acid position in the protein. + /// The output is a dictionary keyed by zero-based amino acid positions in the protein + /// and the modification names with their corresponding stoichiometry values (fractions). + /// + /// public Dictionary> GetModStoichiometryFromProteinMods() { SetProteinModsFromPeptides(); diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs index fbc1dc94f..e28f9b6c9 100644 --- a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs +++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs @@ -5,11 +5,24 @@ namespace MzLibUtil.PositionFrequencyAnalysis { + /// + /// Represents a group of proteins for quantification purposes. + /// public class QuantifiedProteinGroup { + /// + /// The name of the protein group, typically a concatenation of protein accessions in the + /// format "ProteinA;ProteinB", "ProteinA|ProteinB", or "ProteinA;ProteinB|ProteinC". + /// public string Name { get; set; } + /// + /// Dictionary mapping protein accessions to their corresponding QuantifiedProtein objects. + /// public Dictionary Proteins { get; set; } + /// + /// Initializes a new protein group with the specified name and optional proteins. + /// public QuantifiedProteinGroup(string name, Dictionary proteins = null) { proteins = proteins ?? new Dictionary();