Skip to content

Commit 42e308f

Browse files
committed
changes to ParseModifications (can now ignore mods at termini) and PositionFrequencyAnalysis UtilProtein class (now updates peptide mod positions to protein positions) and PFA argument (list of named tuple for clarity)
1 parent 665df75 commit 42e308f

File tree

4 files changed

+484
-0
lines changed

4 files changed

+484
-0
lines changed

mzLib/FlashLFQ/FlashLFQResults.cs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,28 @@ public void CalculateProteinResultsTop3(bool useSharedPeptides)
347347
}
348348
}
349349
}
350+
/// <summary>
/// Calculates peptide-level PTM occupancy and stores the result in ModInfo.
/// Only sequences in _peptideModifiedSequencesToQuantify that are also present in PeptideModifiedSequences are used.
/// </summary>
/// <param name="quantifiedPeptides"> Optional map from peptide full sequence to the quantifier to use for that peptide.
/// Peptides missing from the map (or every peptide, when the map is null) use the value of GetTotalIntensity() instead.</param>
/// <param name="modOnNTerminus"> If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing).</param>
/// <param name="modOnCTerminus"> If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid.</param>
public void CalculatePTMOccupancy(Dictionary<string, double> quantifiedPeptides=null, bool modOnNTerminus=true, bool modOnCTerminus=true)
{
    if (quantifiedPeptides == null)
    {
        quantifiedPeptides = new Dictionary<string, double>();
    }

    // One record per quantifiable peptide: (full sequence, base sequence, protein group names, quantifier).
    var peptideRecords = new List<(string, string, List<string>, double)>();
    foreach (var fullSequence in _peptideModifiedSequencesToQuantify)
    {
        if (!PeptideModifiedSequences.TryGetValue(fullSequence, out var peptide))
        {
            continue;
        }

        string sequence = peptide.Sequence;
        string baseSequence = peptide.BaseSequence;
        List<string> proteinGroupNames = peptide.ProteinGroups.Select(pg => pg.ProteinGroupName).ToList();
        double quantifier = quantifiedPeptides.GetValueOrDefault(fullSequence, peptide.GetTotalIntensity());

        peptideRecords.Add((sequence, baseSequence, proteinGroupNames, quantifier));
    }

    var analysis = new PositionFrequencyAnalysis();
    analysis.ProteinGroupsOccupancyByPeptide(peptideRecords, modOnNTerminus, modOnCTerminus);
    ModInfo = analysis.Occupancy;
}
350372

351373
/// <summary>
352374
/// This method uses the median polish algorithm to calculate protein quantities in each biological replicate.

mzLib/MzLibUtil/ClassExtensions.cs

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,91 @@ namespace MzLibUtil
2525
{
2626
public static class ClassExtensions
2727
{
28+
/// <summary>
/// Parses a full (modified) sequence and maps each modified position to the modification(s) found there.
/// A position is the number of non-modification characters that precede the modification in the sequence,
/// i.e. the one-based index of the residue the bracketed modification follows (0 for a leading modification).
/// </summary>
/// <param name="fullSequence"> Full sequence of the peptide in question</param>
/// <param name="modOnNTerminus"> If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing).</param>
/// <param name="modOnCTerminus"> If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid.</param>
/// <param name="ignoreTerminusMod"> If true, a modification at the very start of the sequence, or one whose closing bracket ends the sequence, is left out of the result.</param>
/// <returns> Dictionary with the key being the amino acid position of the mod and the value being the list of strings representing the mods at that position</returns>
public static Dictionary<int, List<string>> ParseModifications(this string fullSequence, bool modOnNTerminus=false, bool modOnCTerminus=false, bool ignoreTerminusMod=false)
{
    if (fullSequence == null)
    {
        throw new System.ArgumentNullException(nameof(fullSequence));
    }

    // The "look-behind" condition prevents matching ] for metal ion modifications
    Regex regex = new(@"\[(.+?)\](?<!\[I+\])");

    // The '|' that separates multiple mods on a single amino acid is not a residue; strip it so it
    // does not throw off the position numbering.
    string fullSeq = fullSequence.Replace("|", string.Empty);

    Dictionary<int, List<string>> modDict = new();

    // Total length (brackets included) of every modification seen so far.
    int captureLengthSum = 0;
    foreach (Match match in regex.Matches(fullSeq))
    {
        string modName = match.Groups[1].Value;
        int startIndex = match.Index;
        int captureLength = match.Length;

        // Characters before the match minus the modification characters before the match leaves the
        // number of amino acids that precede this modification.
        int positionToAddToDict = startIndex - captureLengthSum;

        // Count this match before any early exit: an ignored terminal modification still occupies
        // characters in the string, and skipping this would shift every later position.
        captureLengthSum += captureLength;

        bool atNTerminus = positionToAddToDict == 0;
        bool atCTerminus = fullSeq.Length == startIndex + captureLength;

        if (ignoreTerminusMod && (atNTerminus || atCTerminus))
        {
            continue;
        }

        // Handle N terminus indexing
        if (atNTerminus && !modOnNTerminus)
        {
            positionToAddToDict++;
        }

        // Handle C terminus indexing
        if (atCTerminus && modOnCTerminus)
        {
            positionToAddToDict++;
        }

        // Several modifications may share a position; collect them in one list.
        if (!modDict.TryGetValue(positionToAddToDict, out List<string> modsAtPosition))
        {
            modsAtPosition = new List<string>();
            modDict.Add(positionToAddToDict, modsAtPosition);
        }
        modsAtPosition.Add(modName);
    }
    return modDict;
}
98+
99+
/// <summary>
100+
/// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid.
101+
/// </summary>
102+
/// <param name="fullSequence"></param>
103+
/// <param name="replacement"></param>
104+
/// <param name="specialCharacter"></param>
105+
/// <returns></returns>
106+
public static void RemoveSpecialCharacters(ref string fullSequence, string replacement = @"", string specialCharacter = @"\|")
107+
{
108+
// next regex is used in the event that multiple modifications are on a missed cleavage Lysine (K)
109+
Regex regexSpecialChar = new(specialCharacter);
110+
fullSequence = regexSpecialChar.Replace(fullSequence, replacement);
111+
}
112+
28113
public static double[] BoxCarSmooth(this double[] data, int points)
29114
{
30115
// Force to be odd
Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,238 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Text.RegularExpressions;
4+
using Easy.Common.Extensions;
5+
6+
namespace MzLibUtil
7+
{
8+
// Open question: should this also carry its parent data (i.e. protein group, protein, peptide, peptide position)?
// Unnecessary for now, but probably useful later.
/// <summary>
/// A named modification at a single position together with the intensity accumulated for it.
/// </summary>
public class UtilModification
{
    /// <summary> Identifier of the modification, stored exactly as passed to the constructor. </summary>
    public string IdWithMotif { get; set; }

    /// <summary>
    /// Position of the modification. The name implies 0 is the N-terminus, but nothing in this class enforces
    /// that, so consumers should check whether a value is zero- or one-based.
    /// </summary>
    public int PeptidePositionZeroIsNTerminus { get; set; } //NEED TO ENFORCE THIS EVERYWHERE OR CHECK IF ZERO OR ONE

    /// <summary> Intensity attributed to this modification at this position. </summary>
    public double Intensity { get; set; }

    /// <summary> Creates a modification from its name, position and starting intensity. </summary>
    public UtilModification(string name, int position, double intensity)
        => (IdWithMotif, PeptidePositionZeroIsNTerminus, Intensity) = (name, position, intensity);
}
25+
/// <summary>
/// A peptide identified by its full (modified) sequence, with the modifications observed on it grouped by
/// position and then by modification name.
/// </summary>
public class UtilPeptide
{
    public string FullSequence { get; set; }
    public string BaseSequence { get; set; }
    public UtilProtein ParentProtein { get; set; }
    public int IndexInProtein { get; set; }

    /// <summary> Modified position -> modification name -> modification. </summary>
    public Dictionary<int, Dictionary<string, UtilModification>> ModifiedAminoAcidPositions { get; set; }
    public double Intensity { get; set; }

    /// <summary>
    /// Creates the peptide and derives BaseSequence from the full sequence.
    /// </summary>
    /// <param name="fullSequence"> Full sequence, including bracketed modifications.</param>
    /// <param name="mods"> Initial modifications; when null or empty a fresh empty dictionary is used instead.</param>
    public UtilPeptide(string fullSequence, Dictionary<int, Dictionary<string, UtilModification>> mods = null)
    {
        FullSequence = fullSequence;
        ModifiedAminoAcidPositions = mods != null && mods.Count > 0
            ? mods
            : new Dictionary<int, Dictionary<string, UtilModification>>();
        SetBaseSequence();
    }

    /// <summary>
    /// Sets BaseSequence to FullSequence with every match of the modification pattern removed.
    /// </summary>
    /// <param name="modPattern"> Regular expression matching one bracketed modification.</param>
    public void SetBaseSequence(string modPattern = @"\[(.+?)\](?<!\[I+\])")
    {
        BaseSequence = Regex.Replace(FullSequence, modPattern, @"");
    }

    public void AddModifications(Dictionary<int, string> mods)
    {
        throw new NotImplementedException();
    }

    /// <summary>
    /// Shifts every modification position (dictionary keys and each modification's stored position) by an offset,
    /// turning peptide positions into protein positions. Each call shifts again, so call it once per peptide.
    /// </summary>
    /// <param name="offset"> Amount added to every position. Ignored when <paramref name="UseParent"/> is true;
    /// values of zero or less leave the positions unchanged.</param>
    /// <param name="UseParent"> If true, the offset is the zero-based index of BaseSequence within ParentProtein.Sequence.</param>
    /// <exception cref="InvalidOperationException"> UseParent is true but the parent protein, its sequence or the base
    /// sequence is missing, or the base sequence does not occur in the parent protein sequence.</exception>
    public void PeptideToProteinPositions(int offset=0, bool UseParent=false)
    {
        if (UseParent)
        {
            if (ParentProtein?.Sequence == null || BaseSequence == null)
            {
                throw new InvalidOperationException("Cannot locate the peptide in its parent protein: the parent protein, its sequence, or the peptide base sequence is not set.");
            }

            // Ordinal search: sequences are plain residue letters, so culture rules must not apply.
            offset = ParentProtein.Sequence.IndexOf(BaseSequence, StringComparison.Ordinal);
            if (offset < 0)
            {
                // Proceeding with -1 would silently move every modification one residue to the left.
                throw new InvalidOperationException($"Peptide base sequence '{BaseSequence}' was not found in the sequence of protein '{ParentProtein.Name}'.");
            }
        }

        if (offset <= 0)
        {
            return; // keep current mod indexing if not offsetting.
        }

        // Build the re-keyed entries on the side; the dictionary cannot be re-keyed while it is enumerated.
        var shifted = new Dictionary<int, Dictionary<string, UtilModification>>(ModifiedAminoAcidPositions.Count);
        foreach (var entry in ModifiedAminoAcidPositions)
        {
            int positionInProtein = entry.Key + offset;
            foreach (var mod in entry.Value.Values)
            {
                mod.PeptidePositionZeroIsNTerminus = positionInProtein;
            }
            shifted[positionInProtein] = entry.Value;
        }

        // Refill the existing dictionary instance so references held elsewhere stay valid.
        ModifiedAminoAcidPositions.Clear();
        foreach (var entry in shifted)
        {
            ModifiedAminoAcidPositions[entry.Key] = entry.Value;
        }
    }
}
86+
87+
/// <summary>
/// A protein with the peptides assigned to it and, once computed, the modifications aggregated per protein position.
/// </summary>
public class UtilProtein
{
    public string Name { get; set; }

    /// <summary> Protein sequence; not assigned by this class, so it is null unless a caller sets it. </summary>
    public string Sequence { get; set; }

    /// <summary> Peptides of this protein, keyed by a string chosen by the caller. </summary>
    public Dictionary<string, UtilPeptide> Peptides { get; set; }

    /// <summary> Position -> modification name -> aggregated modification. Null until SetProteinModsFromPeptides has run. </summary>
    public Dictionary<int, Dictionary<string, UtilModification>> ModifiedAminoAcidPositionsInProtein { get; set; }

    /// <summary> Creates the protein; a null peptide dictionary is replaced by an empty one. </summary>
    public UtilProtein(string name, Dictionary<string, UtilPeptide> peptides=null)
    {
        Name = name;
        Peptides = peptides ?? new Dictionary<string, UtilPeptide>();
    }

    /// <summary>
    /// Rebuilds ModifiedAminoAcidPositionsInProtein from the peptides. For every modification of every peptide,
    /// the ratio (modification intensity / peptide intensity) is added to the entry with the same position and name.
    /// Peptides whose intensity is zero contribute nothing.
    /// </summary>
    public void SetProteinModsFromPeptides()
    {
        // for now, this method must be used AFTER peptide mod positions are offsetted to protein positions
        var proteinMods = new Dictionary<int, Dictionary<string, UtilModification>>();
        ModifiedAminoAcidPositionsInProtein = proteinMods;

        foreach (var peptide in Peptides.Values)
        {
            foreach (var modpos in peptide.ModifiedAminoAcidPositions)
            {
                if (!proteinMods.TryGetValue(modpos.Key, out var modsAtPosition))
                {
                    modsAtPosition = new Dictionary<string, UtilModification>();
                    proteinMods[modpos.Key] = modsAtPosition;
                }

                foreach (var mod in modpos.Value.Values)
                {
                    if (!modsAtPosition.TryGetValue(mod.IdWithMotif, out var proteinMod))
                    {
                        proteinMod = new UtilModification(mod.IdWithMotif, modpos.Key, 0);
                        modsAtPosition[mod.IdWithMotif] = proteinMod;
                    }

                    // A zero-intensity peptide has no defined ratio; dividing would give NaN (or Infinity)
                    // and corrupt the value accumulated from every other peptide at this position.
                    if (peptide.Intensity != 0)
                    {
                        proteinMod.Intensity += mod.Intensity / peptide.Intensity; // might need to add some magic later to keep stored the mod intensity and the peptide intensity for MM output
                    }
                }
            }
        }
    }
}
125+
126+
/// <summary>
/// A named protein group holding its member proteins.
/// </summary>
public class UtilProteinGroup
{
    public string Name { get; set;}

    /// <summary> Member proteins, keyed by a string chosen by the caller. </summary>
    public Dictionary<string, UtilProtein> Proteins { get; set; }

    /// <summary> Free-text label for the level the stored positions refer to; not set by this class. </summary>
    public string OccupancyLevel { get; set; }

    /// <summary> Creates the group; a null protein dictionary is replaced by an empty one. </summary>
    public UtilProteinGroup(string name, Dictionary<string, UtilProtein> proteins = null)
    {
        Name = name;
        Proteins = proteins ?? new Dictionary<string, UtilProtein>();
    }
}
139+
/// <summary>
/// Accumulates modification intensities per protein group, protein, peptide base sequence and modified position.
/// </summary>
public class PositionFrequencyAnalysis
{
    /// <summary>
    /// Result of the most recent call to ProteinGroupsOccupancyByPeptide, keyed by protein group name.
    /// Null until that method has been called.
    /// </summary>
    public Dictionary<string, UtilProteinGroup> Occupancy { get; private set; }

    /// <summary>
    /// Calculates the occupancy of post-translational modifications at the peptide level and stores it in Occupancy.
    /// Each protein group name is split on '|' into protein names. Under every protein, peptides are keyed by base
    /// sequence; the peptide's Intensity is the sum of the intensities of all supplied forms of that base sequence,
    /// and each modification's Intensity is the sum over the forms that carry it at that position.
    /// </summary>
    /// <param name="peptides"> A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List<string> ProteinGroups, Intensity) for each peptide.
    /// When BaseSequence is null or empty it is derived from FullSequence.</param>
    /// <param name="modOnNTerminus"> If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing).</param>
    /// <param name="modOnCTerminus"> If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid.</param>
    /// <param name="ignoreTerminusMod"> Passed through to ParseModifications; if true, modifications at the peptide termini are not recorded.</param>
    /// <exception cref="ArgumentNullException"> <paramref name="peptides"/> is null.</exception>
    public void ProteinGroupsOccupancyByPeptide(List<(string fullSeq, string baseSeq, List<string> proteinGroup, double intensity)> peptides, bool modOnNTerminus = true, bool modOnCTerminus = true, bool ignoreTerminusMod=false)
    {
        if (peptides == null)
        {
            throw new ArgumentNullException(nameof(peptides));
        }

        var proteinGroups = new Dictionary<string, UtilProteinGroup>();

        // Go through the peptides given
        foreach (var pep in peptides)
        {
            // in case the base sequence is null or empty, get it from the full sequence
            string baseSeq = !string.IsNullOrEmpty(pep.baseSeq) ? pep.baseSeq : pep.fullSeq;
            ClassExtensions.RemoveSpecialCharacters(ref baseSeq, @"", @"\[(.+?)\](?<!\[I+\])");

            // The parsed modifications depend only on the peptide, so parse once rather than once per protein.
            // Want both arguments passed here to be true if need to later filter out peptide terminal mods that are not protein terminal mods
            Dictionary<int, List<string>> peptideMods = pep.fullSeq.ParseModifications(modOnNTerminus, modOnCTerminus, ignoreTerminusMod);

            // Go through the peptide's protein groups
            foreach (var pg in pep.proteinGroup)
            {
                // If have not seen that protein group, store it
                if (!proteinGroups.TryGetValue(pg, out var proteinGroup))
                {
                    proteinGroup = new UtilProteinGroup(pg);
                    proteinGroup.OccupancyLevel = "peptide";
                    proteinGroups[pg] = proteinGroup;
                }

                // Go through the proteins in each protein group
                foreach (var proteinName in pg.Split('|'))
                {
                    // Add the protein to the protein group's dictionary if it has not been added
                    if (!proteinGroup.Proteins.TryGetValue(proteinName, out var protein))
                    {
                        protein = new UtilProtein(proteinName);
                        proteinGroup.Proteins[proteinName] = protein;
                    }

                    // If the peptide's base sequence has not been seen, add it to the protein's dictionary
                    if (!protein.Peptides.TryGetValue(baseSeq, out var peptide))
                    {
                        peptide = new UtilPeptide(pep.fullSeq);
                        peptide.Intensity = 0;
                        protein.Peptides[baseSeq] = peptide;
                    }

                    // Increase the total intensity of the peptide base sequence to track the total intensity of all amino acids in that sequence
                    peptide.Intensity += pep.intensity;

                    // Go through the modified positions found from the full sequence
                    foreach (var modpos in peptideMods)
                    {
                        // If that position has not been recorded as containing a modification, add it to the base sequence's dictionary
                        if (!peptide.ModifiedAminoAcidPositions.TryGetValue(modpos.Key, out var modifiedPosition))
                        {
                            modifiedPosition = new Dictionary<string, UtilModification>();
                            peptide.ModifiedAminoAcidPositions[modpos.Key] = modifiedPosition;
                        }

                        // Go through the modifications found at a modified amino acid index
                        foreach (var mod in modpos.Value)
                        {
                            // If the name of that modification has not been seen, record it with an intensity of 0
                            if (!modifiedPosition.TryGetValue(mod, out var recordedMod))
                            {
                                recordedMod = new UtilModification(mod, modpos.Key, 0);
                                modifiedPosition[mod] = recordedMod;
                            }
                            // Increase the intensity of the modification by the intensity of the peptide
                            recordedMod.Intensity += pep.intensity;
                        }
                    }
                }
            }
        }
        Occupancy = proteinGroups;
    }

    public void ProteinGroupsOccupancyByProtein(Dictionary<string, string> proteinSequences) // Dictionary<accession, sequence>
    {
        throw new NotImplementedException();
    }

    /// <summary>
    /// Converts the modification positions of one stored peptide from peptide positions to protein positions and
    /// marks the protein group's occupancy level as "protein". Call it at most once per peptide: every call shifts
    /// the positions again.
    /// </summary>
    /// <param name="proteinGroupName"> Key of the protein group in Occupancy.</param>
    /// <param name="proteinName"> Key of the protein within that group.</param>
    /// <param name="peptide"> Key of the peptide within that protein (its base sequence).</param>
    /// <param name="OneBasedStartResidue"> One-based position in the protein of the peptide's first residue.</param>
    public void ChangePeptideToProteinOccupancyIndex(string proteinGroupName, string proteinName, string peptide, int OneBasedStartResidue)
    {
        var proteinGroup = Occupancy[proteinGroupName];
        var targetPeptide = proteinGroup.Proteins[proteinName].Peptides[peptide];

        // Peptide positions are already one-based (the first residue is 1), so the shift is the number of
        // residues before the peptide: a peptide starting at residue 1 needs no shift at all.
        targetPeptide.PeptideToProteinPositions(OneBasedStartResidue - 1);

        // Only relabel the group once the lookups and the conversion have succeeded.
        proteinGroup.OccupancyLevel = "protein";
    }
}
238+
}

0 commit comments

Comments
 (0)