diff --git a/mzLib/MzLibUtil/ClassExtensions.cs b/mzLib/MzLibUtil/ClassExtensions.cs
index 36bd1092d..995123380 100644
--- a/mzLib/MzLibUtil/ClassExtensions.cs
+++ b/mzLib/MzLibUtil/ClassExtensions.cs
@@ -25,6 +25,17 @@ namespace MzLibUtil
 {
     public static class ClassExtensions
     {
+        // NOTE(review): the look-behind stops a ']' that directly follows "[I", "[II", ... from ending the match - confirm against the intended pattern.
+        /// <summary>
+        /// Regex matching one bracketed modification annotation in a full sequence, optionally preceded by '-'.
+        /// </summary>
+        public static readonly string ModificationPattern = @"-?\[(.+?)(?<!\[I+)\]";
+
+        /// <summary>
+        /// Regex matching the ';' and '|' separators between accessions in a protein group name.
+        /// </summary>
+        public static readonly string ProteinSplitPattern = @";|\|";
+
         /// <summary>
         /// Applies a boxcar smoothing algorithm to the input data.
         /// </summary>
@@ -283,6 +294,18 @@ public static Dictionary ParseModifications(this string fullSeq)
             return modDict;
         }
 
+        /// <summary>
+        /// Strips modification annotations from a full sequence, leaving the base sequence.
+        /// </summary>
+        /// <param name="fullSeq">Full sequence with bracketed modification annotations.</param>
+        /// <param name="modPattern">Regex matching a modification annotation; null selects <see cref="ModificationPattern"/>.</param>
+        /// <param name="replacement">Text substituted for each annotation; null selects the empty string.</param>
+        /// <returns>The input with every match of the pattern replaced.</returns>
+        public static string GetBaseSequenceFromFullSequence(this string fullSeq, string? modPattern=null, string? replacement=null)
+        {
+            return Regex.Replace(fullSeq, modPattern ?? ModificationPattern, replacement ?? string.Empty);
+        }
+
         /// <summary>
         /// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid.
         /// </summary>
@@ -296,5 +319,15 @@ public static void RemoveSpecialCharacters(ref string fullSeq, string replacemen
             Regex regexSpecialChar = new(specialCharacter);
             fullSeq = regexSpecialChar.Replace(fullSeq, replacement);
         }
+
+        /// <summary>
+        /// Splits a protein group name into its individual protein accessions.
+        /// </summary>
+        /// <param name="proteinGroupName">Accessions joined by ';' and/or '|'.</param>
+        /// <returns>The accessions in the order they appear in the name.</returns>
+        public static string[] SplitProteinAccessions(this string proteinGroupName)
+        {
+            return Regex.Split(proteinGroupName, ProteinSplitPattern);
+        }
     }
 }
\ No newline at end of file
diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs
new file mode 100644
index 000000000..16a6eee95
--- /dev/null
+++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/PositionFrequencyAnalysis.cs
@@ -0,0 +1,69 @@
+using Easy.Common.Extensions;
+using System.Collections.Generic;
+
+namespace MzLibUtil.PositionFrequencyAnalysis
+{
+    /// <summary>
+    /// Handles analysis and organization of protein group quantification from peptide records.
+    /// </summary>
+    public class PositionFrequencyAnalysis
+    {
+        /// <summary>
+        /// Dictionary mapping protein group names to their quantification data.
+        /// </summary>
+        public Dictionary<string, QuantifiedProteinGroup> ProteinGroups { get; private set; }
+
+        /// <summary>
+        /// Populates protein groups with their respective proteins and peptides from a list of quantified peptide records.
+        /// The resulting protein groups are stored in the ProteinGroups property with the protein group name strings as keys.
+        /// </summary>
+        /// <param name="peptides">A list of QuantifiedPeptideRecord, which store a peptide's full sequence, mapped protein groups, and intensity.</param>
+        /// <param name="proteinSequences">An optional dictionary of protein sequences to use for mapping peptides to proteins.
+        /// If not provided, the protein sequences will be left null in the QuantifiedProtein objects. However, this parameter should not be null if what we want
+        /// is a protein stoichiometry, since it is needed to align the peptides to the parent protein.</param>
+        public void SetUpQuantificationFromQuantifiedPeptideRecords(List<QuantifiedPeptideRecord> peptides, Dictionary<string, string> proteinSequences=null)
+        {
+            ProteinGroups = new Dictionary<string, QuantifiedProteinGroup>();
+            foreach (var peptide in peptides)
+            {
+                // Iterate through the peptide's protein groups in case it is a peptide shared between protein groups.
+                // We want to map the peptide separately to each protein group it belongs to, primarily because
+                // each protein group is reported separately in MetaMorpheus.
+                foreach (var pg in peptide.ProteinGroups)
+                {
+                    // If have not seen that protein group, store it
+                    if (!ProteinGroups.TryGetValue(pg, out var proteinGroup))
+                    {
+                        proteinGroup = new QuantifiedProteinGroup(pg);
+                        ProteinGroups[pg] = proteinGroup;
+                    }
+
+                    foreach (var proteinName in pg.SplitProteinAccessions())
+                    {
+                        // Add the protein to the protein group's dictionary if it has not been added
+                        if (!proteinGroup.Proteins.TryGetValue(proteinName, out var protein))
+                        {
+                            protein = new QuantifiedProtein(proteinName);
+                            if (proteinSequences.IsNotNullOrEmpty() && proteinSequences.TryGetValue(proteinName, out var sequence))
+                            {
+                                protein.Sequence = sequence;
+                            }
+                            proteinGroup.Proteins[proteinName] = protein;
+                        }
+
+                        // Peptides are clustered by base sequence: extend the existing cluster or start a new one
+                        if (protein.Peptides.TryGetValue(peptide.BaseSequence, out var quantifiedPeptide))
+                        {
+                            // If the peptide's base sequence has been seen, add the new full sequence to the existing peptide
+                            quantifiedPeptide.AddFullSequence(peptide.FullSequence, intensity: peptide.Intensity);
+                        }
+                        else
+                        {
+                            protein.Peptides[peptide.BaseSequence] = new QuantifiedPeptide(peptide.FullSequence, intensity: peptide.Intensity);
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs
new file mode 100644
index 000000000..63e955f74
--- /dev/null
+++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedModification.cs
@@ -0,0 +1,47 @@
+namespace MzLibUtil.PositionFrequencyAnalysis
+{
+    /// <summary>
+    /// A class to store information about a quantified modification.
+    /// </summary>
+    public class QuantifiedModification
+    {
+        /// <summary>
+        /// Full name of the modification, in the format "MODTYPE: MODID on MOTIF".
+        /// </summary>
+        public string IdWithMotif { get; set; }
+
+        /// <summary>
+        /// Alias of <see cref="IdWithMotif"/>, kept so code written against the older property name still compiles.
+        /// </summary>
+        public string Name
+        {
+            get => IdWithMotif;
+            set => IdWithMotif = value;
+        }
+
+        /// <summary>
+        /// Localization label of the modification; "Unknown" when none was supplied to the constructor.
+        /// </summary>
+        public string ModificationLocalization { get; set; }
+        public int PeptidePositionZeroIsNTerminus { get; set; }
+        public int ProteinPositionZeroIsNTerminus { get; set; }
+        public double Intensity { get; set; }
+
+        /// <summary>
+        /// Constructor for a QuantifiedModification object.
+        /// </summary>
+        /// <param name="idWithMotif">Full name of the modification, in the format "MODTYPE: MODID on MOTIF".</param>
+        /// <param name="positionInPeptide">Position in the peptide, where 0 is the peptide N-terminus.</param>
+        /// <param name="positionInProtein">Position in the peptide's parent protein, where 0 is the protein N-terminus; stored as -1 when null.</param>
+        /// <param name="modLocalization">Localization label; stored as "Unknown" when null.</param>
+        /// <param name="intensity">Intensity attributed to the modification.</param>
+        public QuantifiedModification(string idWithMotif, int positionInPeptide, int? positionInProtein = null, string modLocalization = null, double intensity = 0)
+        {
+            IdWithMotif = idWithMotif;
+            PeptidePositionZeroIsNTerminus = positionInPeptide;
+            ProteinPositionZeroIsNTerminus = positionInProtein ?? -1; // -1 means that the position in the protein is unknown
+            ModificationLocalization = modLocalization ?? "Unknown";
+            Intensity = intensity;
+        }
+    }
+}
diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs
new file mode 100644
index 000000000..0f5ec12fb
--- /dev/null
+++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs
@@ -0,0 +1,152 @@
+using Easy.Common.Extensions;
+using System;
+using System.Collections.Generic;
+
+namespace MzLibUtil.PositionFrequencyAnalysis
+{
+    /// <summary>
+    /// A class to store information about quantified peptides sharing the same base sequence.
+    /// </summary>
+    public class QuantifiedPeptide
+    {
+        public HashSet<string> FullSequences { get; set; }
+        public string BaseSequence { get; set; }
+        public QuantifiedProtein ParentProtein { get; set; }
+        public int ZeroBasedStartIndexInProtein { get; set; }
+
+        /// <summary>
+        /// Dictionary mapping zero-based amino acid positions in the peptide to dictionaries of
+        /// modification IDs and their corresponding QuantifiedModification objects. This property
+        /// stores ALL of the modifications observed for this peptide across all full sequences.
+        /// </summary>
+        public Dictionary<int, Dictionary<string, QuantifiedModification>> ModifiedAminoAcidPositions { get; set; }
+        public double Intensity { get; set; }
+
+        /// <summary>
+        /// Constructor for a QuantifiedPeptide object. The base sequence and modifications are parsed from the full sequence.
+        /// </summary>
+        /// <param name="fullSequence">Full sequence, including modification annotations.</param>
+        /// <param name="zeroBasedStartIndexInProtein">Zero-based index of the peptide's first residue in the parent protein; -1 when unknown.</param>
+        /// <param name="intensity">Intensity measured for this full sequence.</param>
+        /// <param name="modPattern">Regex used to strip modifications when deriving the base sequence; null selects the default pattern.</param>
+        public QuantifiedPeptide(string fullSequence, int zeroBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null)
+        {
+            ModifiedAminoAcidPositions = new Dictionary<int, Dictionary<string, QuantifiedModification>>();
+            ZeroBasedStartIndexInProtein = zeroBasedStartIndexInProtein; // -1 means that the position in the protein is unknown
+            Intensity = intensity;
+            FullSequences = new HashSet<string> { fullSequence };
+            _SetBaseSequence(fullSequence, modPattern);
+            _SetModifications(fullSequence, intensity);
+        }
+
+        /// <summary>
+        /// Adds a new full sequence to the peptide, updating modifications and intensity accordingly.
+        /// </summary>
+        /// <param name="fullSeq">Full sequence to add; its base sequence must equal <see cref="BaseSequence"/>.</param>
+        /// <param name="intensity">Intensity measured for the added full sequence.</param>
+        /// <param name="modPattern">Regex used to strip modifications from <paramref name="fullSeq"/>; null selects the default pattern.</param>
+        /// <exception cref="Exception">Thrown when the base sequences do not match.</exception>
+        public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null)
+        {
+            // Strip modifications with the caller's pattern so the comparison follows the rule the caller chose.
+            if (!BaseSequence.Equals(fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern)))
+            {
+                throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide.");
+            }
+
+            FullSequences.Add(fullSeq);
+            Intensity += intensity;
+            _SetModifications(fullSeq, intensity); // per-modification intensities are updated here
+        }
+
+        /// <summary>
+        /// Merges another QuantifiedPeptide object into this one, combining their full sequences and intensities.
+        /// </summary>
+        /// <param name="peptideToMerge">Peptide with the same base sequence as this one.</param>
+        /// <exception cref="Exception">Thrown when <paramref name="peptideToMerge"/> is null or has a different base sequence.</exception>
+        public void MergePeptide(QuantifiedPeptide peptideToMerge)
+        {
+            if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence)
+            {
+                throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide.");
+            }
+
+            FullSequences.UnionWith(peptideToMerge.FullSequences);
+
+            // Carry over each modification's own accumulated intensity. Re-parsing every full sequence with the
+            // merged peptide's total intensity would over-count whenever it holds more than one full sequence.
+            foreach (var modpos in peptideToMerge.ModifiedAminoAcidPositions)
+            {
+                foreach (var mod in modpos.Value)
+                {
+                    _AddModificationIntensity(modpos.Key, mod.Key, mod.Value.Intensity);
+                }
+            }
+            Intensity += peptideToMerge.Intensity;
+        }
+
+        private void _SetModifications(string fullSeq, double intensity = 0)
+        {
+            var mods = fullSeq.ParseModifications();
+
+            if (mods.IsNotNullOrEmpty())
+            {
+                foreach (var modpos in mods.Keys)
+                {
+                    _AddModificationIntensity(modpos, mods[modpos], intensity);
+
+                    // Maybe should update/pass position in protein from here, too.
+                }
+            }
+        }
+
+        /// <summary>
+        /// Adds intensity to the modification at the given peptide position, creating the entry the first time it is seen.
+        /// </summary>
+        private void _AddModificationIntensity(int modpos, string mod, double intensity)
+        {
+            if (!ModifiedAminoAcidPositions.TryGetValue(modpos, out var modsAtPosition))
+            {
+                modsAtPosition = new Dictionary<string, QuantifiedModification>();
+                ModifiedAminoAcidPositions[modpos] = modsAtPosition;
+            }
+
+            if (!modsAtPosition.TryGetValue(mod, out var quantifiedMod))
+            {
+                quantifiedMod = new QuantifiedModification(mod, modpos, intensity: 0);
+                modsAtPosition[mod] = quantifiedMod;
+            }
+            quantifiedMod.Intensity += intensity;
+        }
+
+        private void _SetBaseSequence(string fullSeq, string modPattern)
+        {
+            BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern);
+        }
+
+        /// <summary>
+        /// Returns the modification stoichiometry for this peptide as a dictionary mapping
+        /// zero-based amino acid positions in the peptide to dictionaries of modification IDs and their corresponding
+        /// QuantifiedModification objects with normalized intensities (i.e., divided by the total peptide intensity).
+        /// The returned objects are copies, so the intensities stored on this peptide are left untouched.
+        /// </summary>
+        /// <returns>A new dictionary holding normalized copies of the entries in <see cref="ModifiedAminoAcidPositions"/>.</returns>
+        public Dictionary<int, Dictionary<string, QuantifiedModification>> GetModStoichiometryForPeptide()
+        {
+            var aaModsStoichiometry = new Dictionary<int, Dictionary<string, QuantifiedModification>>();
+
+            foreach (var modpos in ModifiedAminoAcidPositions)
+            {
+                var normalizedMods = new Dictionary<string, QuantifiedModification>();
+                foreach (var mod in modpos.Value)
+                {
+                    var source = mod.Value;
+                    normalizedMods[mod.Key] = new QuantifiedModification(source.IdWithMotif, source.PeptidePositionZeroIsNTerminus,
+                        source.ProteinPositionZeroIsNTerminus, source.ModificationLocalization, source.Intensity / Intensity);
+                }
+                aaModsStoichiometry[modpos.Key] = normalizedMods;
+            }
+            return aaModsStoichiometry;
+        }
+    }
+}
diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs
new file mode 100644
index 000000000..9cff54391
--- /dev/null
+++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptideRecord.cs
@@ -0,0 +1,28 @@
+using System.Collections.Generic;
+using System.Text.RegularExpressions;
+
+namespace MzLibUtil.PositionFrequencyAnalysis
+{
+    public class QuantifiedPeptideRecord
+    {
+        public string FullSequence { get; set; }
+        public string BaseSequence { get; set; }
+        public HashSet<string> ProteinGroups { get; set; }
+        public double Intensity { get; set; }
+        /// <summary>
+        /// A record of a quantified peptide, storing its full sequence (with modifications), base sequence (without modifications),
+        /// protein groups it maps to, and intensity. The base sequence is derived from the full sequence and is not passed
+        /// as initialization parameter.
+        /// </summary>
+        /// <param name="fullSequence">Full sequence of the peptide, including modification annotations.</param>
+        /// <param name="proteinGroups">Names of the protein groups the peptide maps to.</param>
+        /// <param name="intensity">Intensity measured for the peptide.</param>
+        public QuantifiedPeptideRecord(string fullSequence, HashSet<string> proteinGroups, double intensity)
+        {
+            FullSequence = fullSequence;
+            ProteinGroups = proteinGroups;
+            Intensity = intensity;
+            BaseSequence = fullSequence.GetBaseSequenceFromFullSequence();
+        }
+    }
+}
diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs
new file mode 100644
index 000000000..21412f1e5
--- /dev/null
+++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProtein.cs
@@ -0,0 +1,170 @@
+using Easy.Common.Extensions;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+
+namespace MzLibUtil.PositionFrequencyAnalysis
+{
+    /// <summary>
+    /// A class to store information about a quantified protein. The protein contains peptides
+    /// clustered by their base sequence, rather than by their full sequence. Full sequences are stored
+    /// in the QuantifiedPeptide objects.
+    /// </summary>
+    public class QuantifiedProtein
+    {
+        public string Accession { get; set; }
+        public string Sequence { get; set; }
+
+        /// <summary>
+        /// Dictionary mapping peptide base sequences to their corresponding QuantifiedPeptide objects.
+        /// </summary>
+        public Dictionary<string, QuantifiedPeptide> Peptides { get; set; }
+
+        /// <summary>
+        /// Dictionary mapping zero-based amino acid positions in the protein to dictionaries of
+        /// modification IDs and their corresponding QuantifiedModification objects.
+        /// Note: the modification positions are 0-based with the N-terminus of the protein being position 0.
+        /// </summary>
+        public Dictionary<int, Dictionary<string, QuantifiedModification>> ModifiedAminoAcidPositionsInProtein { get; set; }
+
+        /// <summary>
+        /// Dictionary mapping zero-based amino acid positions in the protein to sets of peptide base sequences.
+        /// This is useful to know which peptides contribute to the modification and total intensity at a given position.
+        /// </summary>
+        public Dictionary<int, HashSet<string>> PeptidesByProteinPosition { get; set; }
+
+        public QuantifiedProtein(string accession, string sequence = null, Dictionary<string, QuantifiedPeptide> peptides = null)
+        {
+            Accession = accession;
+            Sequence = sequence;
+            Peptides = peptides ?? new Dictionary<string, QuantifiedPeptide>();
+        }
+
+        /// <summary>
+        /// Parses and aggregates modifications from the protein's peptides to set the ModifiedAminoAcidPositionsInProtein property.
+        /// Peptides whose start index is unknown and whose base sequence does not occur in the protein sequence are skipped.
+        /// </summary>
+        /// <exception cref="Exception">Thrown when the protein sequence is null or empty.</exception>
+        public void SetProteinModsFromPeptides()
+        {
+            if (Sequence.IsNullOrEmpty())
+            {
+                throw new Exception("The protein sequence is unknown.");
+            }
+
+            // Start from empty maps on every call so an early return never exposes null or stale results.
+            ModifiedAminoAcidPositionsInProtein = new Dictionary<int, Dictionary<string, QuantifiedModification>>();
+            PeptidesByProteinPosition = new Dictionary<int, HashSet<string>>();
+
+            if (Peptides.IsNullOrEmpty())
+            {
+                return;
+            }
+
+            foreach (var peptide in Peptides.Values)
+            {
+                // if peptide position in protein is unknown, set it using the protein sequence
+                if (peptide.ZeroBasedStartIndexInProtein < 0)
+                {
+                    peptide.ZeroBasedStartIndexInProtein = Sequence.IndexOf(peptide.BaseSequence, StringComparison.Ordinal);
+                }
+
+                int start = peptide.ZeroBasedStartIndexInProtein;
+                if (start < 0)
+                {
+                    continue; // the base sequence does not occur in this protein, so the peptide cannot be aligned
+                }
+                int length = peptide.BaseSequence.Length;
+
+                // Protein positions: 0 is the N-terminus, 1..Sequence.Length are residues, Sequence.Length + 1 is the C-terminus.
+                // Every position the peptide spans counts toward that position's total intensity, whether or not the
+                // peptide is modified there; the terminus slots are spanned only by peptides at the protein's ends.
+                int firstCovered = start == 0 ? 0 : start + 1;
+                int lastCovered = start + length == Sequence.Length ? Sequence.Length + 1 : start + length;
+                for (int pos = firstCovered; pos <= lastCovered; pos++)
+                {
+                    _RegisterPeptideAtPosition(pos, peptide.BaseSequence);
+                }
+
+                if (peptide.ModifiedAminoAcidPositions.IsNullOrEmpty())
+                {
+                    continue;
+                }
+
+                foreach (var modpos in peptide.ModifiedAminoAcidPositions.Keys)
+                {
+                    var modPositionInProtein = modpos + start;
+
+                    // Ignore peptide terminal modifications that are not at the protein terminal
+                    if (modpos == 0 && modPositionInProtein != 0 // if the mod is at the N-terminus of the peptide, but not the protein.
+                        || modpos == length + 1 && modPositionInProtein != Sequence.Length + 1) // if the mod is at the C-terminus of the peptide, but not the protein.
+                    {
+                        continue;
+                    }
+
+                    _RegisterPeptideAtPosition(modPositionInProtein, peptide.BaseSequence);
+                    var modsAtPosition = ModifiedAminoAcidPositionsInProtein[modPositionInProtein];
+
+                    foreach (var mod in peptide.ModifiedAminoAcidPositions[modpos].Values)
+                    {
+                        mod.ProteinPositionZeroIsNTerminus = modPositionInProtein;
+
+                        if (!modsAtPosition.TryGetValue(mod.IdWithMotif, out var proteinMod))
+                        {
+                            proteinMod = new QuantifiedModification(mod.IdWithMotif, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, null, 0);
+                            modsAtPosition[mod.IdWithMotif] = proteinMod;
+                        }
+                        proteinMod.Intensity += mod.Intensity;
+                    }
+                }
+            }
+
+            // clean up the dictionary to remove any empty modifications
+            var noModPositions = ModifiedAminoAcidPositionsInProtein.Where(x => x.Value.Count == 0).Select(x => x.Key).ToList();
+            foreach (var pos in noModPositions)
+            {
+                ModifiedAminoAcidPositionsInProtein.Remove(pos);
+                PeptidesByProteinPosition.Remove(pos);
+            }
+        }
+
+        /// <summary>
+        /// Records that the peptide with the given base sequence spans the given protein position.
+        /// </summary>
+        private void _RegisterPeptideAtPosition(int pos, string baseSequence)
+        {
+            if (!PeptidesByProteinPosition.TryGetValue(pos, out var peptidesAtPosition))
+            {
+                peptidesAtPosition = new HashSet<string>();
+                PeptidesByProteinPosition[pos] = peptidesAtPosition;
+                ModifiedAminoAcidPositionsInProtein[pos] = new Dictionary<string, QuantifiedModification>();
+            }
+            peptidesAtPosition.Add(baseSequence);
+        }
+
+        /// <summary>
+        /// Calculates the stoichiometry of modifications at each amino acid position in the protein.
+        /// The output is a dictionary keyed by zero-based amino acid positions in the protein and
+        /// the modification names with their corresponding stoichiometry values (fractions).
+        /// </summary>
+        /// <returns>For each modified position, the modification intensity divided by the summed intensity of the peptides recorded at that position.</returns>
+        public Dictionary<int, Dictionary<string, double>> GetModStoichiometryFromProteinMods()
+        {
+            SetProteinModsFromPeptides();
+            var aaModsStoichiometry = new Dictionary<int, Dictionary<string, double>>();
+            foreach (var modpos in ModifiedAminoAcidPositionsInProtein)
+            {
+                var contributingPeptides = PeptidesByProteinPosition[modpos.Key];
+                double totalPositionIntensity = Peptides.Where(pep => contributingPeptides.Contains(pep.Key)).Sum(x => x.Value.Intensity);
+
+                var fractions = new Dictionary<string, double>();
+                foreach (var mod in modpos.Value.Values)
+                {
+                    fractions.Add(mod.IdWithMotif, mod.Intensity / totalPositionIntensity);
+                }
+                aaModsStoichiometry[modpos.Key] = fractions;
+            }
+            return aaModsStoichiometry;
+        }
+    }
+}
diff --git a/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs
new file mode 100644
index 000000000..e28f9b6c9
--- /dev/null
+++ b/mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedProteinGroup.cs
@@ -0,0 +1,47 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text.RegularExpressions;
+
+namespace MzLibUtil.PositionFrequencyAnalysis
+{
+    /// <summary>
+    /// Represents a group of proteins for quantification purposes.
+    /// </summary>
+    public class QuantifiedProteinGroup
+    {
+        /// <summary>
+        /// The name of the protein group, typically a concatenation of protein accessions in the
+        /// format "ProteinA;ProteinB", "ProteinA|ProteinB", or "ProteinA;ProteinB|ProteinC".
+        /// </summary>
+        public string Name { get; set; }
+        /// <summary>
+        /// Dictionary mapping protein accessions to their corresponding QuantifiedProtein objects.
+        /// </summary>
+        public Dictionary<string, QuantifiedProtein> Proteins { get; set; }
+
+        /// <summary>
+        /// Initializes a new protein group with the specified name and optional proteins.
+        /// </summary>
+        /// <param name="name">Protein group name; the accessions in it are separated by ';' and/or '|'.</param>
+        /// <param name="proteins">Proteins keyed by accession. Null or empty starts an empty group.</param>
+        /// <exception cref="Exception">Thrown when <paramref name="proteins"/> is non-empty and its keys are not exactly the accessions in <paramref name="name"/>.</exception>
+        public QuantifiedProteinGroup(string name, Dictionary<string, QuantifiedProtein> proteins = null)
+        {
+            proteins ??= new Dictionary<string, QuantifiedProtein>();
+            var proteinAccessions = name.SplitProteinAccessions();
+
+            // An empty dictionary is always accepted; otherwise its keys must be exactly the accessions in the name.
+            bool keysMatchName = proteins.Count == 0
+                || (proteinAccessions.Length == proteins.Count
+                    && proteinAccessions.OrderBy(x => x, StringComparer.Ordinal).SequenceEqual(proteins.Keys.OrderBy(x => x, StringComparer.Ordinal)));
+            if (!keysMatchName)
+            {
+                throw new Exception("The number of proteins provided does not match the number of proteins in the protein group name.");
+            }
+
+            Name = name;
+            Proteins = proteins;
+        }
+    }
+}
diff --git a/mzLib/Test/TestMzLibUtil.cs b/mzLib/Test/TestMzLibUtil.cs
index a33ee4f80..5851397f5 100644
--- a/mzLib/Test/TestMzLibUtil.cs
+++ b/mzLib/Test/TestMzLibUtil.cs
@@ -3,6 +3,12 @@
 using MzLibUtil;
 using Readers;
 using System.Collections.Generic;
+using FlashLFQ;
+using System.Linq;
+using Proteomics.AminoAcidPolymer;
+using System;
+using NUnit.Framework.Legacy;
+using MzLibUtil.PositionFrequencyAnalysis;
 
 namespace Test
 {
@@ -163,8 +169,192 @@ public void TestRemoveSpecialCharacters()
             string cleanSeq = seqWithHash.ToString();
             ClassExtensions.RemoveSpecialCharacters(ref cleanSeq, specialCharacter: "#");
             Assert.AreEqual("PEPTIDE", cleanSeq);
+        }
+
+        [Test]
+        public void TestQuantifiedModification()
+        {
+            var quantmod = new QuantifiedModification(idWithMotif: "TestMod: ModX on AAY", positionInPeptide: 1, positionInProtein: 2, intensity: 10);
+            Assert.AreEqual(quantmod.IdWithMotif, "TestMod: ModX on AAY");
+            Assert.AreEqual(quantmod.PeptidePositionZeroIsNTerminus, 1);
+            Assert.AreEqual(quantmod.ProteinPositionZeroIsNTerminus, 2);
+            Assert.AreEqual(quantmod.Intensity, 10);
+            Assert.AreEqual(quantmod.ModificationLocalization, "Unknown");
+        }
+
+        [Test]
+        public void TestQuantifiedPeptide()
+        {
+            var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]";
+            var peptide1 = new QuantifiedPeptide(fullSeq1, intensity: 1);
+            Assert.That(peptide1.FullSequences.Contains(fullSeq1));
+            Assert.AreEqual(peptide1.BaseSequence, "GK");
+            Assert.AreEqual(peptide1.Intensity, 1);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3);
+            Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(0));
+            Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(1));
+            Assert.That(peptide1.ModifiedAminoAcidPositions.ContainsKey(2));
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.IdWithMotif, "UniProt: N - palmitoyl glycine on G");
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.IdWithMotif, "UniProt: N - methylglycine on G");
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.IdWithMotif, "UniProt: O - linked(Hex) hydroxylysine on K");
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].First().Value.Intensity, 1);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 1);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 1);
+
+            // Test MergePeptide method
+            var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]";
+            var peptide2 = new QuantifiedPeptide(fullSeq2, intensity: 10);
+            peptide1.MergePeptide(peptide2);
+
+            Assert.That(peptide1.FullSequences.Contains(fullSeq2));
+            Assert.AreEqual(peptide1.Intensity, 11);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].Count, 2);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].Count, 1);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].Count, 1);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - acetylglycine on G"].Intensity, 10);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 11);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 11);
+
+            // Test AddFullSequence method
+            var fullSeq3 = "GK[UniProt: O - linked(Hex) hydroxylysine on K]";
+            peptide1.AddFullSequence(fullSeq3, intensity:100);
+
+            Assert.That(peptide1.FullSequences.Contains(fullSeq3));
+            Assert.AreEqual(peptide1.Intensity, 111);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions.Count, 3);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0].Count, 2);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].Count, 1);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].Count, 1);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[0]["UniProt: N - acetylglycine on G"].Intensity, 10);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[1].First().Value.Intensity, 11);
+            Assert.AreEqual(peptide1.ModifiedAminoAcidPositions[2].First().Value.Intensity, 111);
+
+            // Test failed merge due to base sequence mismatch
+            var errorMessage = "The base sequence of the peptide being added does not match the base sequence of this peptide.";
+            var exception1 = Assert.Throws<Exception>(() => peptide1.AddFullSequence("AK", intensity: 1));
+            Assert.AreEqual(exception1.Message, errorMessage);
+
+            var peptide3 = new QuantifiedPeptide("AK", intensity: 1);
+            var exception2 = Assert.Throws<Exception>(() => peptide1.MergePeptide(peptide3));
+            Assert.AreEqual(exception2.Message, errorMessage);
+        }
+
+        [Test]
+        public void TestQuantifiedProtein()
+        {
+
+            var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]";
+            var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - methylglycine on G]K-[C-Terminal UniProt: Lysine Amide on K]";
+            var fullSeq3 = "A[UniProt:N-methylalanine on A]K[UniProt: O - linked(Hex) hydroxylysine on K]-[C-Terminal UniProt: Lysine Amide on K]";
+
+            var basePeptide1 = new QuantifiedPeptide(fullSeq1, intensity: 1);
+            var basePeptide2 = new QuantifiedPeptide(fullSeq3, intensity: 100);
+
+            basePeptide1.AddFullSequence(fullSeq2, intensity: 10);
+            var peptides = new Dictionary<string, QuantifiedPeptide> {{ basePeptide1.BaseSequence, basePeptide1},
+                { basePeptide2.BaseSequence, basePeptide2 }};
+
+            var proteinSeq = "GKAAAAAAK";
+            var protein = new QuantifiedProtein(accession: "TESTPROT", sequence: proteinSeq, peptides: peptides);
+            var stoich = protein.GetModStoichiometryFromProteinMods();
+
+            // Check object fields modified by SetProteinModsFromPeptides, which gets called first in the GetModStoichiometryFromProteinMods method.
+            Assert.AreEqual(protein.Accession, "TESTPROT");
+            Assert.AreEqual(protein.Sequence, proteinSeq);
+            Assert.AreEqual(protein.Peptides.Count, 2);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein.Count, 6);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0].Count, 2);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[1].Count, 1);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[2].Count, 1);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[8].Count, 1);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[9].Count, 1);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[10].Count, 1);
+
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0]["UniProt: N - palmitoyl glycine on G"].Intensity, 1);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[0]["UniProt: N - acetylglycine on G"].Intensity, 10);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[1]["UniProt: N - methylglycine on G"].Intensity, 11);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[2]["UniProt: O - linked(Hex) hydroxylysine on K"].Intensity, 1);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[8]["UniProt:N-methylalanine on A"].Intensity, 100);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[9]["UniProt: O - linked(Hex) hydroxylysine on K"].Intensity, 100);
+            Assert.AreEqual(protein.ModifiedAminoAcidPositionsInProtein[10]["C-Terminal UniProt: Lysine Amide on K"].Intensity, 100);
+
+            // Check stoichiometry results
+            Assert.AreEqual(stoich.Count, 6);
+            Assert.AreEqual(stoich[0]["UniProt: N - palmitoyl glycine on G"], 1 / 11.0);
+            Assert.AreEqual(stoich[0]["UniProt: N - acetylglycine on G"], 10 / 11.0);
+            Assert.AreEqual(stoich[1]["UniProt: N - methylglycine on G"], 11 / 11.0);
+            Assert.AreEqual(stoich[2]["UniProt: O - linked(Hex) hydroxylysine on K"], 1 / 11.0);
+            Assert.AreEqual(stoich[8]["UniProt:N-methylalanine on A"], 1);
+            Assert.AreEqual(stoich[9]["UniProt: O - linked(Hex) hydroxylysine on K"], 1);
+            Assert.AreEqual(stoich[10]["C-Terminal UniProt: Lysine Amide on K"], 1);
+        }
+
+        [Test]
+        public void TestQuantifiedProteinGroup()
+        {
+            // Test correct arguments where protein group name contains the names of the proteins
+            var protein1 = new QuantifiedProtein(accession: "PROT1", sequence: "AAAYYY", peptides: new Dictionary<string, QuantifiedPeptide>());
+            var protein2 = new QuantifiedProtein(accession: "PROT2", sequence: "AAARRR", peptides: new Dictionary<string, QuantifiedPeptide>());
+            var proteins = new Dictionary<string, QuantifiedProtein> { { protein1.Accession, protein1 },
+                { protein2.Accession, protein2 } };
+            var proteinGroup = new QuantifiedProteinGroup("PROT1|PROT2", proteins);
+            Assert.AreEqual(proteinGroup.Proteins.Count, 2);
+            Assert.AreEqual(proteinGroup.Proteins["PROT1"].Accession, "PROT1");
+            Assert.AreEqual(proteinGroup.Proteins["PROT2"].Accession, "PROT2");
+
+            // Test incorrect argument where protein group name does not contain the names of the proteins
+            var errorMessage = "The number of proteins provided does not match the number of proteins in the protein group name.";
+            var exception1 = Assert.Throws<Exception>(() => new QuantifiedProteinGroup("PROT1|PROT2", new Dictionary<string, QuantifiedProtein> { { protein1.Accession, protein1 } }));
+            Assert.AreEqual(exception1.Message, errorMessage);
+
+            var exception2 = Assert.Throws<Exception>(() => new QuantifiedProteinGroup("PROT1", proteins));
+            Assert.AreEqual(exception2.Message, errorMessage);
+
+            var exception3 = Assert.Throws<Exception>(() => new QuantifiedProteinGroup("PROT1|PROT2|PROT3", proteins));
+            Assert.AreEqual(exception3.Message, errorMessage);
+        }
+
+        [Test]
+        public void TestSetUpQuantificationObjects()
+        {
+            var fullSeq1 = "[UniProt: N - palmitoyl glycine on G]G[UniProt: N - methylglycine on G]K[UniProt: O - linked(Hex) hydroxylysine on K]";
+            var fullSeq2 = "[UniProt: N - acetylglycine on G]G[UniProt: N - methylglycine on G]K-[C-Terminal UniProt: Lysine Amide on K]";
+            var fullSequences = new List<string> { fullSeq1, fullSeq2 };
+            var proteinGroups = new HashSet<string> { "TESTPROT1|TESTPROT2", "TESTPROT3" };
+            var proteinSequences = new Dictionary<string, string> { { "TESTPROT1", "GKAAAAAAK" },
+                { "TESTPROT2", "AKAAAAAGK" },
+                { "TESTPROT3", "AKGK"} };
+            var intensities = new List<double> { 1, 5 };
+            var peptideRecords = new List<QuantifiedPeptideRecord>();
+            for (int i = 0; i < 2; i++)
+            {
+                peptideRecords.Add(new QuantifiedPeptideRecord(fullSequences[i], proteinGroups, intensities[i]));
+            }
+            peptideRecords.Add(new QuantifiedPeptideRecord("AAAA", new HashSet<string> { "TESTPROT1|TESTPROT2" }, 10));
+            var quant = new PositionFrequencyAnalysis();
+            quant.SetUpQuantificationFromQuantifiedPeptideRecords(peptideRecords, proteinSequences);
+            Assert.AreEqual(quant.ProteinGroups.Count, 2);
+            Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT1"));
+            Assert.That(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins.Keys.Contains("TESTPROT2"));
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Accession, "TESTPROT1");
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Sequence, "GKAAAAAAK");
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides.Count, 2);
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides["GK"].FullSequences.Count, 2);
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT1"].Peptides["AAAA"].FullSequences.Count, 1);
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Accession, "TESTPROT2");
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Sequence, "AKAAAAAGK");
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides.Count, 2);
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides["GK"].FullSequences.Count, 2);
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT1|TESTPROT2"].Proteins["TESTPROT2"].Peptides["AAAA"].FullSequences.Count, 1);
+            Assert.That(quant.ProteinGroups["TESTPROT3"].Proteins.Keys.Contains("TESTPROT3"));
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Accession, "TESTPROT3");
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Sequence, "AKGK");
+            Assert.AreEqual(quant.ProteinGroups["TESTPROT3"].Proteins["TESTPROT3"].Peptides.Count, 1);
         }
 
         public struct TestStruct