Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions mzLib/MzLibUtil/ClassExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
{
public static class ClassExtensions
{
/// <summary>
/// Default regular expression used by GetBaseSequenceFromFullSequence when the caller supplies no pattern.
/// It matches an optional leading '-' followed by a bracketed span (matched lazily). The negative lookbehind
/// rejects a closing ']' that directly follows '[' plus one or more 'I' characters.
/// NOTE(review): the lookbehind presumably keeps names containing e.g. "[II]" from ending the match early — confirm.
/// </summary>
public static readonly string ModificationPattern = @"-?\[(.+?)(?<!\[I+)\]";
/// <summary>
/// Regular expression used by SplitProteinAccessions; it matches either ';' or '|'.
/// </summary>
public static readonly string ProteinSplitPattern = @";|\|";

/// <summary>
/// Applies a boxcar smoothing algorithm to the input data.
/// </summary>
Expand Down Expand Up @@ -283,6 +286,12 @@
return modDict;
}

public static string GetBaseSequenceFromFullSequence(this string fullSeq, string? modPattern=null, string? replacement=null)

Check warning on line 289 in mzLib/MzLibUtil/ClassExtensions.cs

View workflow job for this annotation

GitHub Actions / build

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.

Check warning on line 289 in mzLib/MzLibUtil/ClassExtensions.cs

View workflow job for this annotation

GitHub Actions / build

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.

Check warning on line 289 in mzLib/MzLibUtil/ClassExtensions.cs

View workflow job for this annotation

GitHub Actions / build

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.

Check warning on line 289 in mzLib/MzLibUtil/ClassExtensions.cs

View workflow job for this annotation

GitHub Actions / build

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.

Check warning on line 289 in mzLib/MzLibUtil/ClassExtensions.cs

View workflow job for this annotation

GitHub Actions / integration

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.

Check warning on line 289 in mzLib/MzLibUtil/ClassExtensions.cs

View workflow job for this annotation

GitHub Actions / integration

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.

Check warning on line 289 in mzLib/MzLibUtil/ClassExtensions.cs

View workflow job for this annotation

GitHub Actions / integration

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.

Check warning on line 289 in mzLib/MzLibUtil/ClassExtensions.cs

View workflow job for this annotation

GitHub Actions / integration

The annotation for nullable reference types should only be used in code within a '#nullable' annotations context.
{
Regex regex = new(modPattern ?? ModificationPattern);
return regex.Replace(fullSeq, replacement ?? string.Empty);
}

/// <summary>
/// Fixes an issue where the | appears and throws off the numbering if there are multiple mods on a single amino acid.
/// </summary>
Expand All @@ -296,5 +305,10 @@
Regex regexSpecialChar = new(specialCharacter);
fullSeq = regexSpecialChar.Replace(fullSeq, replacement);
}

/// <summary>
/// Breaks a protein group name into its individual entries, using <see cref="ProteinSplitPattern"/>
/// (';' or '|') as the separator expression.
/// </summary>
/// <param name="proteinGroupName">The protein group name to split.</param>
/// <returns>The pieces produced by <see cref="Regex.Split(string, string)"/> for the given name.</returns>
public static string[] SplitProteinAccessions(this string proteinGroupName)
    => Regex.Split(proteinGroupName, ProteinSplitPattern);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
using Easy.Common.Extensions;
using System.Collections.Generic;

namespace MzLibUtil.PositionFrequencyAnalysis
{
/// <summary>
/// Handles analysis and organization of protein group quantification from peptide records.
/// </summary>
public class PositionFrequencyAnalysis
{
/// <summary>
/// Dictionary mapping protein group names to their quantification data.
/// </summary>
public Dictionary<string, QuantifiedProteinGroup> ProteinGroups { get; private set; }

/// <summary>
/// Populates protein groups with their respective proteins and peptides from a list of quantified peptide records.
/// The resulting protein groups are stored in the ProteinGroups property with the protein group name strings as keys.
/// </summary>
/// <param name="peptides"> A list of QuantifiedPeptideRecord, which store a peptide's full sequence, mapped protein groups, and intensity.</param>
/// <param name="proteinSequences"> An optional dictionary of protein sequences to use for mapping peptides to proteins.
/// If not provided, the protein sequences will be left null in the QuantifiedProtein objects. However, this parameter should not be null if what we want
/// is a protein stoichiometry, since it is needed to align the peptides to the parent protein.</param>
public void SetUpQuantificationFromQuantifiedPeptideRecords(List<QuantifiedPeptideRecord> peptides, Dictionary<string, string> proteinSequences=null)
{
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of passing in a list of tuples, have you considered making a lightweight class to hold that information? Like a record class that stores the sequence, protein groups, and intensity. Also, it's not clear what the proteinGroups are and what information they contain. In IQuantifiableRecord, a tuple stores accessions, gene names, and organisms for the different protein groups. Just using Accessions would probably work as well, but I would like guidance on what the proteinGroups actually are

ProteinGroups = new Dictionary<string, QuantifiedProteinGroup>();
foreach (var peptide in peptides)
{
// Iterate through the peptide's protein groups in case the peptide is shared between protein groups.
// We want to map the peptide separately to each protein group it belongs to, primarily because
// each protein group is reported separately in MetaMorpheus.
foreach (var pg in peptide.ProteinGroups)
{
// If have not seen that protein group, store it
if (!ProteinGroups.ContainsKey(pg))
{
ProteinGroups[pg] = new QuantifiedProteinGroup(pg);
}
var proteinGroup = ProteinGroups[pg];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there are actually multiple protein groups associated with the peptide, should we store the combined protein group in the dictionary? What would happen if we split first, as in line 40, then added them to the dictionary?


foreach (var proteinName in pg.SplitProteinAccessions())
{
// Add the protein to the protein group's dictionary if it has not been added
if (!proteinGroup.Proteins.ContainsKey(proteinName))
{
proteinGroup.Proteins[proteinName] = new QuantifiedProtein(proteinName);
if (proteinSequences.IsNotNullOrEmpty() && proteinSequences.ContainsKey(proteinName))
{
proteinGroup.Proteins[proteinName].Sequence = proteinSequences[proteinName];
}
}
var protein = proteinGroup.Proteins[proteinName];

// If the peptide's base sequence has not been seen, add it to the protein's dictionary
if (!protein.Peptides.ContainsKey(peptide.BaseSequence))
{
protein.Peptides[peptide.BaseSequence] = new QuantifiedPeptide(peptide.FullSequence, intensity: peptide.Intensity);
}
else
{
// If the peptide's base sequence has been seen, add the new full sequence to the existing peptide
protein.Peptides[peptide.BaseSequence].AddFullSequence(peptide.FullSequence, intensity: peptide.Intensity);
}
}
}
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
namespace MzLibUtil.PositionFrequencyAnalysis
{
/// <summary>
/// Holds one quantified modification: its name, where it sits in the peptide and
/// (when known) in the parent protein, and its intensity.
/// </summary>
public class QuantifiedModification
{
    /// <summary>Name of the modification, as supplied to the constructor.</summary>
    public string Name { get; set; }

    /// <summary>Zero-based position of the modification in the peptide.</summary>
    public int PeptidePositionZeroIsNTerminus { get; set; }

    /// <summary>Zero-based position in the peptide's parent protein; -1 when the position is unknown.</summary>
    public int ProteinPositionZeroIsNTerminus { get; set; }

    /// <summary>Intensity attributed to this modification.</summary>
    public double Intensity { get; set; }

    /// <summary>
    /// Creates a QuantifiedModification.
    /// </summary>
    /// <param name="name">Full name of the modification, in the format "MODTYPE: MODID on MOTIF".</param>
    /// <param name="positionInPeptide">Zero-based position in the peptide.</param>
    /// <param name="positionInProtein">Zero-based position in the peptide's parent protein, or null if unknown.</param>
    /// <param name="intensity">Initial intensity; defaults to 0.</param>
    public QuantifiedModification(string name, int positionInPeptide, int? positionInProtein = null, double intensity = 0)
    {
        Name = name;
        Intensity = intensity;
        PeptidePositionZeroIsNTerminus = positionInPeptide;

        // An absent protein position is stored as the sentinel -1 ("unknown").
        ProteinPositionZeroIsNTerminus = positionInProtein.HasValue ? positionInProtein.Value : -1;
    }
}
}
132 changes: 132 additions & 0 deletions mzLib/MzLibUtil/PositionFrequencyAnalysis/QuantifiedPeptide.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
using Easy.Common.Extensions;
using System;
using System.Collections.Generic;

namespace MzLibUtil.PositionFrequencyAnalysis
{
/// <summary>
/// A class to store information about a quantified peptides sharing the same base sequence.
/// </summary>
public class QuantifiedPeptide
{
public HashSet<string> FullSequences { get; set; }
public string BaseSequence { get; set; }
public QuantifiedProtein ParentProtein { get; set; }
public int ZeroBasedStartIndexInProtein { get; set; }

/// <summary>
/// Dictionary mapping zero-based amino acid positions in the peptide to dictionaries of
/// modification IDs and their corresponding QuantifiedModification objects. This property
/// stores ALL of the modifications observed for this peptide across all full sequences.
/// </summary>
public Dictionary<int, Dictionary<string, QuantifiedModification>> ModifiedAminoAcidPositions { get; set; }
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What string serves as the key in the <string, QuantMod> dictionary?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the string just stores position, could ModifiedAminoAcidPositions just be an <int, List> Dictionary? The position string seems redundant.

public double Intensity { get; set; }

/// <summary>
/// Creates a QuantifiedPeptide seeded with a single full sequence. The base sequence is derived
/// from the full sequence, and the modifications parsed from it are recorded with the given intensity.
/// </summary>
/// <param name="fullSequence">Full sequence (with modification annotations) that seeds this peptide.</param>
/// <param name="zeroBasedStartIndexInProtein">Start index of the peptide in its protein; -1 means unknown.</param>
/// <param name="intensity">Intensity stored for the peptide and credited to each parsed modification.</param>
/// <param name="modPattern">Pattern forwarded to the base-sequence extraction; null lets that method use its default.</param>
public QuantifiedPeptide(string fullSequence, int zeroBasedStartIndexInProtein = -1, double intensity = 0, string modPattern = null)
{
    // Plain state first; -1 for the start index means the position in the protein is unknown.
    Intensity = intensity;
    ZeroBasedStartIndexInProtein = zeroBasedStartIndexInProtein;

    FullSequences = new HashSet<string>();
    FullSequences.Add(fullSequence);

    // The modification map must exist before _SetModifications fills it.
    ModifiedAminoAcidPositions = new Dictionary<int, Dictionary<string, QuantifiedModification>>();

    _SetBaseSequence(fullSequence, modPattern);
    _SetModifications(fullSequence, intensity);
}

/// <summary>
/// Adds a new full sequence to the peptide, updating modifications and intensity accordingly.
/// </summary>
/// <param name="fullSeq">Full sequence to add; its base sequence must equal <see cref="BaseSequence"/>.</param>
/// <param name="intensity">Intensity added to the peptide total and to each modification parsed from <paramref name="fullSeq"/>.</param>
/// <param name="modPattern">Pattern used to strip modifications when deriving the base sequence of
/// <paramref name="fullSeq"/>; null lets the extraction method use its default pattern.</param>
/// <exception cref="Exception">Thrown when the base sequence of <paramref name="fullSeq"/> differs from this peptide's base sequence.</exception>
public void AddFullSequence(string fullSeq, double intensity = 0, string modPattern = null)
{
    // Forward modPattern so the comparison uses the same pattern the caller asked for;
    // previously the parameter was accepted but silently ignored.
    var incomingBaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern);
    if (!BaseSequence.Equals(incomingBaseSequence))
    {
        throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide.");
    }

    FullSequences.Add(fullSeq);
    Intensity += intensity;
    _SetModifications(fullSeq, intensity); // per-modification intensities are updated here
}

/// <summary>
/// Merges another QuantifiedPeptide object into this one, combining their full sequences,
/// per-modification intensities, and total intensities.
/// </summary>
/// <param name="peptideToMerge">Peptide to merge in; must be non-null and share this peptide's base sequence.</param>
/// <exception cref="Exception">Thrown when <paramref name="peptideToMerge"/> is null or has a different base sequence.</exception>
public void MergePeptide(QuantifiedPeptide peptideToMerge)
{
    if (peptideToMerge == null || peptideToMerge.BaseSequence != BaseSequence)
    {
        throw new Exception("The base sequence of the peptide being added does not match the base sequence of this peptide.");
    }

    foreach (var fullSeq in peptideToMerge.FullSequences)
    {
        FullSequences.Add(fullSeq);
    }

    // Merge the other peptide's already-accumulated per-modification intensities.
    // Re-parsing every full sequence with the other peptide's TOTAL intensity (the previous
    // approach) over-counts modifications whenever that peptide holds more than one full sequence.
    foreach (var positionEntry in peptideToMerge.ModifiedAminoAcidPositions)
    {
        if (!ModifiedAminoAcidPositions.TryGetValue(positionEntry.Key, out var modsAtPosition))
        {
            modsAtPosition = new Dictionary<string, QuantifiedModification>();
            ModifiedAminoAcidPositions[positionEntry.Key] = modsAtPosition;
        }

        foreach (var modEntry in positionEntry.Value)
        {
            if (!modsAtPosition.TryGetValue(modEntry.Key, out var target))
            {
                target = new QuantifiedModification(modEntry.Key, positionEntry.Key, intensity: 0);
                modsAtPosition[modEntry.Key] = target;
            }
            target.Intensity += modEntry.Value.Intensity;
        }
    }

    Intensity += peptideToMerge.Intensity;
}

// Parses the modifications in fullSeq and adds `intensity` to the entry for each
// (position, modification) pair in ModifiedAminoAcidPositions, creating entries on first sight.
private void _SetModifications(string fullSeq, double intensity = 0)
{
    var parsedMods = fullSeq.ParseModifications();

    // Nothing to record when the sequence carries no modifications.
    if (!parsedMods.IsNotNullOrEmpty())
    {
        return;
    }

    foreach (var position in parsedMods.Keys)
    {
        var modName = parsedMods[position];

        if (!ModifiedAminoAcidPositions.TryGetValue(position, out var modsAtPosition))
        {
            modsAtPosition = new Dictionary<string, QuantifiedModification>();
            ModifiedAminoAcidPositions[position] = modsAtPosition;
        }

        if (!modsAtPosition.TryGetValue(modName, out var quantifiedMod))
        {
            // New entries start at zero so the increment below is the single place intensity is added.
            quantifiedMod = new QuantifiedModification(modName, position, intensity: 0);
            modsAtPosition[modName] = quantifiedMod;
        }

        quantifiedMod.Intensity += intensity;

        // Maybe should update/pass position in protein from here, too.
    }
}

// Derives BaseSequence from fullSeq by handing modPattern to GetBaseSequenceFromFullSequence.
private void _SetBaseSequence(string fullSeq, string modPattern)
    => BaseSequence = fullSeq.GetBaseSequenceFromFullSequence(modPattern: modPattern);

/// <summary>
/// Returns the modification stoichiometry for this peptide as a dictionary mapping
/// zero-based amino acid positions in the peptide to dictionaries of modification IDs and their corresponding
/// QuantifiedModification objects with normalized intensities (i.e., divided by the total peptide intensity).
/// The result is an independent copy: the raw intensities stored in <see cref="ModifiedAminoAcidPositions"/>
/// are left untouched, so the method can be called repeatedly and more sequences can be added afterwards.
/// </summary>
/// <returns>A new dictionary of new QuantifiedModification objects carrying normalized intensities.</returns>
public Dictionary<int, Dictionary<string, QuantifiedModification>> GetModStoichiometryForPeptide()
{
    var aaModsStoichiometry = new Dictionary<int, Dictionary<string, QuantifiedModification>>(ModifiedAminoAcidPositions.Count);

    foreach (var positionEntry in ModifiedAminoAcidPositions)
    {
        var normalizedMods = new Dictionary<string, QuantifiedModification>(positionEntry.Value.Count);

        foreach (var modEntry in positionEntry.Value)
        {
            var mod = modEntry.Value;

            // Build a fresh object instead of dividing in place; dividing in place would overwrite
            // the raw intensity and divide again on every subsequent call.
            // The division itself is unchanged (double division, no zero check on Intensity).
            normalizedMods[modEntry.Key] = new QuantifiedModification(
                mod.Name,
                mod.PeptidePositionZeroIsNTerminus,
                mod.ProteinPositionZeroIsNTerminus,
                mod.Intensity / Intensity);
        }

        aaModsStoichiometry[positionEntry.Key] = normalizedMods;
    }

    return aaModsStoichiometry;
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
using System.Collections.Generic;
using System.Text.RegularExpressions;

namespace MzLibUtil.PositionFrequencyAnalysis
{
/// <summary>
/// A record of a quantified peptide: its full sequence (with modifications), its base sequence
/// (without modifications), the protein groups it maps to, and its intensity.
/// </summary>
public class QuantifiedPeptideRecord
{
    /// <summary>Peptide sequence including modification annotations.</summary>
    public string FullSequence { get; set; }

    /// <summary>Sequence derived from <see cref="FullSequence"/> by GetBaseSequenceFromFullSequence.</summary>
    public string BaseSequence { get; set; }

    /// <summary>Protein groups this peptide maps to.</summary>
    public HashSet<string> ProteinGroups { get; set; }

    /// <summary>Intensity of the peptide.</summary>
    public double Intensity { get; set; }

    /// <summary>
    /// Builds a record from a full sequence, its protein groups, and its intensity. The base sequence
    /// is not a parameter; it is computed from <paramref name="fullSequence"/>.
    /// </summary>
    /// <param name="fullSequence">Peptide sequence including modification annotations.</param>
    /// <param name="proteinGroups">Protein groups the peptide maps to.</param>
    /// <param name="intensity">Intensity of the peptide.</param>
    public QuantifiedPeptideRecord(string fullSequence, HashSet<string> proteinGroups, double intensity)
    {
        // Derive the base sequence first; it depends only on the incoming full sequence.
        BaseSequence = fullSequence.GetBaseSequenceFromFullSequence();
        FullSequence = fullSequence;
        Intensity = intensity;
        ProteinGroups = proteinGroups;
    }
}
}
Loading