Skip to content

Commit ab350e8

Browse files
committed
Simplify ParseModifications to either keep or ignore terminal mods. Currently does not adequately identify N-Terminus Mods. Make sure UtilProtein.SetProteinModsFromPeptides correctly adds terminal protein mods. Saving but needs more rigorous testing once ParseModifications updated (in separate PR) to correctly parse N-Terminus mods. WIP
1 parent 68fd8cc commit ab350e8

File tree

6 files changed

+84
-165
lines changed

6 files changed

+84
-165
lines changed

mzLib/MzLibUtil/ClassExtensions.cs

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,12 @@ public static class ClassExtensions
2828
/// <summary>
2929
/// Parses the full sequence to identify mods.
3030
/// </summary>
31-
/// <param name="fullSequence"> Full sequence of the peptide in question</param>
32-
/// <param name="modOnNTerminus"> If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing).</param>
33-
/// <param name="modOnCTerminus"> If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid.</param>
31+
/// <param name="fullSequence"> Full sequence of the peptide in question.</param>
32+
/// <param name="ignoreTerminusMod"> If true, terminal modifications will be ignored.</param>
3433
/// <returns> Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod</returns>
35-
public static Dictionary<int, List<string>> ParseModifications(this string fullSequence, bool modOnNTerminus=false, bool modOnCTerminus=false, bool ignoreTerminusMod=false)
34+
public static Dictionary<int, List<string>> ParseModifications(this string fullSequence, bool ignoreTerminusMod=false)
3635
{
37-
// use a regex to get all modifications
36+
// use a regex to get modifications
3837
string pattern = @"\[(.+?)\](?<!\[I+\])"; //The "look-behind" condition prevents matching ] for metal ion modifications
3938
Regex regex = new(pattern);
4039

@@ -69,14 +68,9 @@ public static Dictionary<int, List<string>> ParseModifications(this string fullS
6968
continue;
7069
}
7170

72-
// Handle N terminus indexing
73-
if ((positionToAddToDict == 0) && !modOnNTerminus)
74-
{
75-
positionToAddToDict++;
76-
}
77-
78-
// Handle C terminus indexing
79-
if ((fullSeq.Length == startIndex + captureLength) && modOnCTerminus)
71+
// The C-terminus is ambiguous when it comes to how a modification is included in the full sequence string.
72+
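// So, extra logic is needed to read the modification string and check whether it names a terminal modification; only then is the position moved past the last amino acid.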
73+
if ((fullSeq.Length == startIndex + captureLength) && val.Contains("terminal"))
8074
{
8175
positionToAddToDict++;
8276
}

mzLib/MzLibUtil/PositionFrequencyAnalysis.cs

Lines changed: 42 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -9,19 +9,19 @@ namespace MzLibUtil
99
public class UtilModification
1010
{
1111
public string IdWithMotif { get; set; }
12-
public int PeptidePositionZeroIsNTerminus { get; set; } //NEED TO ENFORCE THIS EVERYWHERE OR CHECK IF ZERO OR ONE
13-
14-
12+
public int PeptidePositionZeroIsNTerminus { get; set; }
13+
public int ProteinPositionZeroIsNTerminus { get; set; }
1514
public double Intensity { get; set; }
1615

17-
public UtilModification(string name, int position, double intensity)
16+
public UtilModification(string name, int positionInPeptide, int? positionInProtein=null, double intensity=0)
1817
{
1918
IdWithMotif = name;
20-
PeptidePositionZeroIsNTerminus = position;
19+
PeptidePositionZeroIsNTerminus = positionInPeptide;
20+
ProteinPositionZeroIsNTerminus = positionInProtein ?? -1;
2121
Intensity = intensity;
2222
}
23-
2423
}
24+
2525
public class UtilPeptide
2626
{
2727
public string FullSequence { get; set; }
@@ -30,49 +30,32 @@ public class UtilPeptide
3030
public int OneBasedStartIndexInProtein { get; set; }
3131
public Dictionary<int, Dictionary<string, UtilModification>> ModifiedAminoAcidPositions { get; set; }
3232
public double Intensity { get; set; }
33-
public string PositionIndexType { get; set; }
3433

35-
public UtilPeptide(string fullSequence, Dictionary<int, Dictionary<string, UtilModification>> mods = null, int oneBasedStartIndexInProtein = 1, double intensity = 0, string positionIndexType= "peptide")
34+
public UtilPeptide(string fullSequence, Dictionary<int, Dictionary<string, UtilModification>> mods = null, int oneBasedStartIndexInProtein = 1, double intensity = 0)
3635
{
3736
FullSequence = fullSequence;
3837
ModifiedAminoAcidPositions = mods.IsNotNullOrEmpty() ? mods : new Dictionary<int, Dictionary<string, UtilModification>>();
3938
OneBasedStartIndexInProtein = oneBasedStartIndexInProtein;
4039
Intensity = intensity;
41-
PositionIndexType = positionIndexType;
4240
SetBaseSequence();
4341
}
4442
public void SetBaseSequence(string modPattern = @"\[(.+?)\](?<!\[I+\])")
4543
{
4644
Regex regexSpecialChar = new(modPattern);
4745
BaseSequence = regexSpecialChar.Replace(FullSequence, @"");
4846
}
49-
public void PeptideToProteinPositions()
50-
{
51-
PositionIndexType = "protein";
52-
var modificationsToAdd = new Dictionary<int, Dictionary<string, UtilModification>>();
53-
var modificationsToRemove = new List<int>();
5447

55-
foreach (var modpos in ModifiedAminoAcidPositions.Keys)
48+
public Dictionary<int, Dictionary<string, UtilModification>> GetModStoichiometryFromPeptideMods()
49+
{
50+
var aaModsStoichiometry = ModifiedAminoAcidPositions;
51+
foreach (var modpos in aaModsStoichiometry)
5652
{
57-
int positionInProtein = modpos + OneBasedStartIndexInProtein-1;
58-
Dictionary<string, UtilModification> mods = ModifiedAminoAcidPositions[modpos];
59-
foreach (var mod in mods.Values)
53+
foreach (var mod in modpos.Value.Values)
6054
{
61-
mod.PeptidePositionZeroIsNTerminus = positionInProtein;
55+
mod.Intensity = mod.Intensity / Intensity;
6256
}
63-
modificationsToAdd[positionInProtein] = mods;
64-
modificationsToRemove.Add(modpos);
65-
}
66-
67-
foreach (var modpos in modificationsToRemove)
68-
{
69-
ModifiedAminoAcidPositions.Remove(modpos);
70-
}
71-
72-
foreach (var modpos in modificationsToAdd)
73-
{
74-
ModifiedAminoAcidPositions[modpos.Key] = modpos.Value;
7557
}
58+
return aaModsStoichiometry;
7659
}
7760
}
7861

@@ -93,34 +76,46 @@ public UtilProtein(string accession, Dictionary<string, UtilPeptide> peptides=nu
9376

9477
public void SetProteinModsFromPeptides()
9578
{
96-
// for now, this method must be used AFTER peptide mod positions are offsetted to protein positions
9779
ModifiedAminoAcidPositionsInProtein = new Dictionary<int, Dictionary<string, UtilModification>>();
9880
PeptidesByProteinPosition = new Dictionary<int, List<UtilPeptide>>();
9981

10082
foreach (var peptide in Peptides.Values)
10183
{
102-
if (peptide.PositionIndexType != "protein")
103-
{
104-
peptide.PeptideToProteinPositions();
105-
}
106-
10784
foreach (var modpos in peptide.ModifiedAminoAcidPositions)
10885
{
109-
if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(modpos.Key))
86+
var modPositionInProtein = modpos.Key + peptide.OneBasedStartIndexInProtein - 1;
87+
if (Sequence.IsNotNullOrEmpty()) // if the protein sequence is known, ignore terminal modifications that are not protein terminal modifications
88+
{
89+
if ((modPositionInProtein != 0 && modpos.Key == 0) // if the mod is at the N-terminus of the peptide, but not the protein
90+
|| (modPositionInProtein != Sequence.Length + 1 && modpos.Key == peptide.BaseSequence.Length + 1)) // if the mod is at the C-terminus of the peptide, but not the protein
91+
{
92+
continue;
93+
}
94+
}
95+
else // if the protein sequence is not known, ignore peptide terminal modifications
96+
{
97+
if (modpos.Key == 0 || modpos.Key == peptide.BaseSequence.Length + 1)
98+
{
99+
continue;
100+
}
101+
}
102+
103+
if (!ModifiedAminoAcidPositionsInProtein.ContainsKey(modPositionInProtein))
110104
{
111-
ModifiedAminoAcidPositionsInProtein[modpos.Key] = new Dictionary<string, UtilModification>();
112-
PeptidesByProteinPosition[modpos.Key] = new List<UtilPeptide>();
105+
ModifiedAminoAcidPositionsInProtein[modPositionInProtein] = new Dictionary<string, UtilModification>();
106+
PeptidesByProteinPosition[modPositionInProtein] = new List<UtilPeptide>();
113107
}
114108

115-
PeptidesByProteinPosition[modpos.Key].Add(peptide);
109+
PeptidesByProteinPosition[modPositionInProtein].Add(peptide);
116110

117111
foreach (var mod in modpos.Value.Values)
118112
{
119-
if (!ModifiedAminoAcidPositionsInProtein[modpos.Key].ContainsKey(mod.IdWithMotif))
113+
mod.ProteinPositionZeroIsNTerminus = modPositionInProtein;
114+
if (!ModifiedAminoAcidPositionsInProtein[modPositionInProtein].ContainsKey(mod.IdWithMotif))
120115
{
121-
ModifiedAminoAcidPositionsInProtein[modpos.Key][mod.IdWithMotif] = new UtilModification(mod.IdWithMotif, modpos.Key, 0);
116+
ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif] = new UtilModification(mod.IdWithMotif, mod.PeptidePositionZeroIsNTerminus, modPositionInProtein, 0);
122117
}
123-
ModifiedAminoAcidPositionsInProtein[modpos.Key][mod.IdWithMotif].Intensity += mod.Intensity; // might need to add some magic later to keep stored the mod intensity and the peptide intensity for MM output
118+
ModifiedAminoAcidPositionsInProtein[modPositionInProtein][mod.IdWithMotif].Intensity += mod.Intensity;
124119
}
125120
}
126121
}
@@ -167,13 +162,12 @@ public class PositionFrequencyAnalysis
167162
/// Calculates the occupancy of post-translational modifications at the peptide level.
168163
/// </summary>
169164
/// <param name="peptides"> A List of Tuples whose entries are ordered as (string FullSequence, string BaseSequence, List<string> ProteinGroups, Intensity) for each peptide.</param>
170-
/// <param name="modOnNTerminus"> If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing).</param>
171-
/// <param name="modOnCTerminus"> If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid.</param>
165+
/// <param name="ignoreTerminusMod"> If true, terminal modifications will be ignored.</param>
172166
/// <returns> A nested dictionary whose key mappings are as follows: string ProteinGroup-> string Protein-> string BaseSequence-> int ModifiedAminoAcidIndex-> string ModificationName-> double Intensity
173167
/// Note: Each BaseSequence dictionary contains a ModifiedAminoAcidIndex key of -1 that then contains a ModificationName key called "Total" that is used to track the total intensity observed for
174168
/// all of the amino acids in that peptide.</returns>
175169
///
176-
public void ProteinGroupsOccupancyByPeptide(List<(string fullSeq, string baseSeq, List<string> proteinGroup, double intensity)> peptides, bool modOnNTerminus = true, bool modOnCTerminus = true, bool ignoreTerminusMod=false)
170+
public void ProteinGroupsOccupancyByPeptide(List<(string fullSeq, string baseSeq, List<string> proteinGroup, double intensity)> peptides, bool ignoreTerminusMod=false)
177171
{
178172
// ToDo: change first argument to Dictionary<IPeptide, intensity>
179173
var proteinGroups = new Dictionary<string, UtilProteinGroup>();
@@ -217,7 +211,7 @@ public void ProteinGroupsOccupancyByPeptide(List<(string fullSeq, string baseSeq
217211
var peptide = protein.Peptides[baseSeq];
218212

219213
// Want ignoreTerminusMod to be false here if peptide terminal mods that are not protein terminal mods need to be filtered out later
220-
Dictionary<int, List<string>> peptideMods = pep.fullSeq.ParseModifications(modOnNTerminus, modOnCTerminus, ignoreTerminusMod);
214+
Dictionary<int, List<string>> peptideMods = pep.fullSeq.ParseModifications(ignoreTerminusMod);
221215
// Go through the modified positions found from the full sequence
222216
foreach (var modpos in peptideMods)
223217
{

mzLib/Omics/SpectrumMatch/SpectrumMatchFromTsv.cs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,12 +96,10 @@ public static string RemoveParentheses(string baseSequence)
9696
/// Parses the full sequence to identify mods.
9797
/// </summary>
9898
/// <param name="fullSeq"> Full sequence of the peptide in question</param>
99-
/// <param name="modOnNTerminus"> If true, the index of modifications at the N-terminus will be 0 (zero-based indexing). Otherwise, it is the index of the first amino acid (one-based indexing).</param>
100-
/// <param name="modOnCTerminus"> If true, the index of modifications at the C-terminus will be one more than the index of the last amino acid. Otherwise, it is the index of the last amino acid.</param>
10199
/// <returns> Dictionary with the key being the amino acid position of the mod and the value being the string representing the mod</returns>
102-
public static Dictionary<int, List<string>> ParseModifications(string fullSeq, bool modOnNTerminus = true, bool modOnCTerminus = true)
100+
public static Dictionary<int, List<string>> ParseModifications(string fullSeq, bool ignoreTerminusMod=false)
103101
{
104-
return fullSeq.ParseModifications(modOnNTerminus, modOnCTerminus);
102+
return fullSeq.ParseModifications(ignoreTerminusMod);
105103
}
106104

107105
/// <summary>

mzLib/Test/FileReadingTests/TestPsmFromTsv.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ public static void TestParseModification()
188188

189189
// psm with two mods on the same amino acid
190190
string fullSeq = "[Common Fixed:Carbamidomethyl on C]|[UniProt:N-acetylserine on S]KPRKIEEIKDFLLTARRKDAKSVKIKKNKDNVKFK";
191-
modDict = Omics.SpectrumMatch.SpectrumMatchFromTsv.ParseModifications(fullSeq, true, true);
191+
modDict = Omics.SpectrumMatch.SpectrumMatchFromTsv.ParseModifications(fullSeq);
192192
Assert.That(modDict.Count == 1);
193193
Assert.That(modDict.ContainsKey(0));
194194
Assert.That(modDict[0].Count == 2);

0 commit comments

Comments
 (0)