From 075efe671666354b57a8ece2cab1da11a89a8188 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 25 Jul 2025 17:12:36 -0500 Subject: [PATCH 1/2] Added RNA support to flash LFQ. Also, added local file references --- mzLib/FlashLFQ/FlashLfqEngine.cs | 86 ++++++----------- .../IsotopicDistributionCalculator.cs | 73 +++++++++++++++ mzLib/TestFlashLFQ/TestFlashLFQ.cs | 92 +++++++++++++++++++ 3 files changed, 194 insertions(+), 57 deletions(-) create mode 100644 mzLib/FlashLFQ/IsotopicDistributionCalculator.cs diff --git a/mzLib/FlashLFQ/FlashLfqEngine.cs b/mzLib/FlashLFQ/FlashLfqEngine.cs index 5a4682682..4e31c6187 100644 --- a/mzLib/FlashLFQ/FlashLfqEngine.cs +++ b/mzLib/FlashLFQ/FlashLfqEngine.cs @@ -1,5 +1,9 @@ using Chemistry; -using MathNet.Numerics.Distributions; +using Easy.Common.Extensions; +using FlashLFQ.Interfaces; +using FlashLFQ.IsoTracker; +using FlashLFQ.PEP; +using MassSpectrometry; using MathNet.Numerics.Statistics; using MzLibUtil; using Proteomics.AminoAcidPolymer; @@ -7,16 +11,11 @@ using System.Collections.Concurrent; using System.Collections.Generic; using System.Diagnostics; +using System.IO; using System.Linq; -using System.Threading.Tasks; using System.Runtime.CompilerServices; -using System.IO; -using Easy.Common.Extensions; -using FlashLFQ.PEP; -using FlashLFQ.IsoTracker; using System.Threading; -using FlashLFQ.Interfaces; -using MassSpectrometry; +using System.Threading.Tasks; [assembly: InternalsVisibleTo("TestFlashLFQ")] [assembly: InternalsVisibleTo("Test")] @@ -70,6 +69,7 @@ public class FlashLfqEngine private FlashLfqResults _results; private readonly List _allIdentifications; private readonly Stopwatch _globalStopwatch; + public bool RnaModeActive { get; private set; } = false; // RNA mode is activated when the charge state is less than 1 #endregion /// @@ -352,21 +352,17 @@ public FlashLfqResults Run() /// internal void CalculateTheoreticalIsotopeDistributions() { - ModifiedSequenceToIsotopicDistribution = new Dictionary>(); + // Find the maximum and minimum charge states across all identifications and set the _chargeStates property + var minChargeState = _allIdentifications.Min(p => p.PrecursorChargeState); + var maxChargeState = _allIdentifications.Max(p => p.PrecursorChargeState); + _chargeStates = Enumerable.Range(minChargeState, (maxChargeState - minChargeState) + 1).ToList(); - // calculate averagine (used for isotopic distributions for unknown modifications) - double averageC = 4.9384; - double averageH = 7.7583; - double averageO = 1.4773; - double averageN = 1.3577; - double averageS = 0.0417; + if (minChargeState < 1 || maxChargeState < 1) + { + RnaModeActive = true; //RNA mode activated!!! + } - double averagineMass = - PeriodicTable.GetElement("C").AverageMass * averageC + - PeriodicTable.GetElement("H").AverageMass * averageH + - PeriodicTable.GetElement("O").AverageMass * averageO + - PeriodicTable.GetElement("N").AverageMass * averageN + - PeriodicTable.GetElement("S").AverageMass * averageS; + ModifiedSequenceToIsotopicDistribution = new Dictionary>(); // calculate monoisotopic masses and isotopic envelope for the base sequences foreach (Identification id in _allIdentifications) @@ -377,59 +373,39 @@ internal void CalculateTheoreticalIsotopeDistributions() } ChemicalFormula formula = id.OptionalChemicalFormula; - var isotopicMassesAndNormalizedAbundances = new List<(double massShift, double abundancence)>(); - if(formula is null) { - formula = new ChemicalFormula(); if (id.BaseSequence.AllSequenceResiduesAreValid()) { // there are sometimes non-parsable sequences in the base sequence input - formula = new Proteomics.AminoAcidPolymer.Peptide(id.BaseSequence).GetChemicalFormula(); - double massDiff = id.MonoisotopicMass; - massDiff -= formula.MonoisotopicMass; + formula = RnaModeActive ? + new Transcriptomics.RNA(id.BaseSequence).GetChemicalFormula() : + new Proteomics.AminoAcidPolymer.Peptide(id.BaseSequence).GetChemicalFormula(); + double massDiff = id.MonoisotopicMass - formula.MonoisotopicMass; if (Math.Abs(massDiff) > 20) { - double averagines = massDiff / averagineMass; - - formula.Add("C", (int)Math.Round(averagines * averageC, 0)); - formula.Add("H", (int)Math.Round(averagines * averageH, 0)); - formula.Add("O", (int)Math.Round(averagines * averageO, 0)); - formula.Add("N", (int)Math.Round(averagines * averageN, 0)); - formula.Add("S", (int)Math.Round(averagines * averageS, 0)); + var massDiffFormulaFromAveragine = IsotopicDistributionCalculator.GetAveragineFormula(massDiff); + formula.Add(massDiffFormulaFromAveragine); } } else { - double averagines = id.MonoisotopicMass / averagineMass; - - formula.Add("C", (int)Math.Round(averagines * averageC, 0)); - formula.Add("H", (int)Math.Round(averagines * averageH, 0)); - formula.Add("O", (int)Math.Round(averagines * averageO, 0)); - formula.Add("N", (int)Math.Round(averagines * averageN, 0)); - formula.Add("S", (int)Math.Round(averagines * averageS, 0)); + formula = IsotopicDistributionCalculator.GetAveragineFormula(id.MonoisotopicMass); } } var isotopicDistribution = IsotopicDistribution.GetDistribution(formula, 0.125, 1e-8); - + double[] masses = isotopicDistribution.Masses.ToArray(); double[] abundances = isotopicDistribution.Intensities.ToArray(); - - for (int i = 0; i < masses.Length; i++) - { - masses[i] += (id.MonoisotopicMass - formula.MonoisotopicMass); - } - double highestAbundance = abundances.Max(); - int highestAbundanceIndex = Array.IndexOf(abundances, highestAbundance); for (int i = 0; i < masses.Length; i++) { - // expected isotopic mass shifts for this peptide - masses[i] -= id.MonoisotopicMass; + // expected isotopic mass shifts for this peptide (relative to the monoisotopic mass) + masses[i] -= formula.MonoisotopicMass; // normalized abundance of each isotope abundances[i] /= highestAbundance; @@ -444,12 +420,8 @@ internal void CalculateTheoreticalIsotopeDistributions() ModifiedSequenceToIsotopicDistribution.Add(id.ModifiedSequence, isotopicMassesAndNormalizedAbundances); } - var minChargeState = _allIdentifications.Min(p => p.PrecursorChargeState); - var maxChargeState = _allIdentifications.Max(p => p.PrecursorChargeState); - _chargeStates = Enumerable.Range(minChargeState, (maxChargeState - minChargeState) + 1).ToList(); - - var peptideModifiedSequences = _allIdentifications.GroupBy(p => p.ModifiedSequence); - foreach (var identifications in peptideModifiedSequences) + // Set the peakfinding mass for each identification + foreach (var identifications in _allIdentifications.GroupBy(p => p.ModifiedSequence)) { // isotope where normalized abundance is 1 double mostAbundantIsotopeShift = ModifiedSequenceToIsotopicDistribution[identifications.First().ModifiedSequence] diff --git a/mzLib/FlashLFQ/IsotopicDistributionCalculator.cs b/mzLib/FlashLFQ/IsotopicDistributionCalculator.cs new file mode 100644 index 000000000..67271aa6e --- /dev/null +++ b/mzLib/FlashLFQ/IsotopicDistributionCalculator.cs @@ -0,0 +1,73 @@ +using Chemistry; +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using System.Threading.Tasks; + +namespace FlashLFQ +{ + public sealed class IsotopicDistributionCalculator + { + internal static double AveragineMass; // Averagine, for protEINS + internal static double AveratideMass; // Averatide, for ribonucleoTIDES (RNA) + + // Magic numbers determined by https://pmc.ncbi.nlm.nih.gov/articles/PMC6166224/ + // These number are for peptides/proteins + internal static double AverageC = 4.9384; + internal static double AverageH = 7.7583; + internal static double AverageO = 1.4773; + internal static double AverageN = 1.3577; + internal static double AverageS = 0.0417; + + // These numbers are for RNA + internal static double RnaAverageC = 9.5; + internal static double RnaAverageH = 12.75; + internal static double RnaAverageO = 3.75; + internal static double RnaAverageN = 5; + + static IsotopicDistributionCalculator() + { + AveragineMass = + PeriodicTable.GetElement("C").AverageMass * AverageC + + PeriodicTable.GetElement("H").AverageMass * AverageH + + PeriodicTable.GetElement("O").AverageMass * AverageO + + PeriodicTable.GetElement("N").AverageMass * AverageN + + PeriodicTable.GetElement("S").AverageMass * AverageS; + + AveratideMass = + PeriodicTable.GetElement("C").AverageMass * RnaAverageC + + PeriodicTable.GetElement("H").AverageMass * RnaAverageH + + PeriodicTable.GetElement("O").AverageMass * RnaAverageO + + PeriodicTable.GetElement("N").AverageMass * RnaAverageN; + } + + internal static ChemicalFormula GetFormula(bool useAveratideModel) + { + return useAveratideModel ? GetAveratideFormula(AveratideMass) : GetAveragineFormula(AveragineMass); + } + + internal static ChemicalFormula GetAveragineFormula(double mass) + { + double averagines = mass / (double)AveragineMass; + ChemicalFormula formula = new ChemicalFormula(); + formula.Add("C", (int)Math.Round(averagines * AverageC, 0)); + formula.Add("H", (int)Math.Round(averagines * AverageH, 0)); + formula.Add("O", (int)Math.Round(averagines * AverageO, 0)); + formula.Add("N", (int)Math.Round(averagines * AverageN, 0)); + formula.Add("S", (int)Math.Round(averagines * AverageS, 0)); + return formula; + } + + internal static ChemicalFormula GetAveratideFormula(double mass) + { + double averatides = mass / (double)AveratideMass; + ChemicalFormula formula = new ChemicalFormula(); + formula.Add("C", (int)Math.Round(averatides * RnaAverageC, 0)); + formula.Add("H", (int)Math.Round(averatides * RnaAverageH, 0)); + formula.Add("O", (int)Math.Round(averatides * RnaAverageO, 0)); + formula.Add("N", (int)Math.Round(averatides * RnaAverageN, 0)); + return formula; + } + } +} diff --git a/mzLib/TestFlashLFQ/TestFlashLFQ.cs b/mzLib/TestFlashLFQ/TestFlashLFQ.cs index abb970e2f..ab1fb8c5b 100644 --- a/mzLib/TestFlashLFQ/TestFlashLFQ.cs +++ b/mzLib/TestFlashLFQ/TestFlashLFQ.cs @@ -1270,6 +1270,98 @@ public static void TestBayesianProteinQuantification() Assert.That(quantResult.ConditionsWithPeptideSampleQuantities["b"].Count == 2); } + [Test] + public static void TestFlashLfqQoutputRna() + { + string testDataDirectory = @"\\bison.chem.wisc.edu\share\Users\Nic\RNA\Standards\250529_NewStandards_Better"; + string outputDirectory = Path.Combine(testDataDirectory, "FlashLFQ"); + Directory.CreateDirectory(outputDirectory); + + string psmFile = Path.Combine(testDataDirectory, "2025-07-22-18-23-03\\Task1-RnaSearchTask\\AllOSMs.osmtsv"); + + SpectraFileInfo f1r1 = new SpectraFileInfo(Path.Combine(testDataDirectory, "RnaStandard_Subset.mzML"), "one", 1, 1, 1); + + List acceptableProteinGroupAccessions = new() { "Q7KZF4", "Q15149", "P52298" }; + + List ids = new List(); + Dictionary allProteinGroups = new Dictionary(); + foreach (string line in File.ReadAllLines(psmFile)) + { + var split = line.Split(new char[] { '\t' }); + + //skip the header + if (split.Contains("File Name") || string.IsNullOrWhiteSpace(line)) + { + continue; + } + + SpectraFileInfo file = f1r1; + + string baseSequence = split[13]; + string fullSequence = split[14]; + double monoMass = double.Parse(split[23]); + double rt = double.Parse(split[2]); + int z = (int)double.Parse(split[6]); + var proteinSubset = split[26].Split(new char[] { '|' }); + List proteinGroups = new(); + + + foreach (var protein in proteinSubset) + { + if (allProteinGroups.TryGetValue(protein, out var proteinGroup)) + { + proteinGroups.Add(proteinGroup); + } + else + { + allProteinGroups.Add(protein, new ProteinGroup(protein, "", "")); + proteinGroups.Add(allProteinGroups[protein]); + } + } + + Identification id = new Identification(file, baseSequence, fullSequence, monoMass, rt, z, proteinGroups); + ids.Add(id); + } + + var engine = new FlashLfqEngine(ids, + matchBetweenRuns: true, + requireMsmsIdInCondition: false, + useSharedPeptidesForProteinQuant: true, + maxThreads: 1); + var results = engine.Run(); + + results.WriteResults(Path.Combine(outputDirectory, "peaks.tsv"), Path.Combine(outputDirectory, "peptides.tsv"), Path.Combine(outputDirectory, "proteins.tsv"), Path.Combine(outputDirectory, "bayesian.tsv"), true); + + var peaks = results.Peaks.Values.ToList(); + var peptides = results.PeptideModifiedSequences.Values.ToList(); + var proteins = results.ProteinGroups.Values.ToList(); + + Assert.AreEqual(4, peaks[0].Count(m => m.DetectionType != DetectionType.MBR)); + Assert.AreEqual(5, peaks[1].Count(m => m.DetectionType != DetectionType.MBR)); + + CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "Q7KZF4", "P52298", "Q15149", "Q15149" }, peaks[0].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); + CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "P52298", "Q15149", "Q15149", "Q7KZF4", "Q7KZF4", "P52298" }, peaks[1].SelectMany(i => i.Identifications).Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); + + Assert.AreEqual(6, peptides.Count); + CollectionAssert.AreEquivalent(new string[] { "Q7KZF4", "P52298", "Q15149", "Q15149", "Q7KZF4", "P52298" }, peptides.Select(g => g.ProteinGroups.First()).Select(m => m.ProteinGroupName).ToArray()); + + Assert.AreEqual(3, proteins.Count); + + List peaksList = File.ReadAllLines(Path.Combine(outputDirectory, "peaks.tsv")).ToList(); + List peptidesList = File.ReadAllLines(Path.Combine(outputDirectory, "peptides.tsv")).ToList(); + List proteinsList = File.ReadAllLines(Path.Combine(outputDirectory, "proteins.tsv")).ToList(); + + //check that all rows including header have the same number of elements + Assert.AreEqual(1, peaksList.Select(l => l.Split('\t').Length).Distinct().ToList().Count); + Assert.AreEqual(1, peptidesList.Select(l => l.Split('\t').Length).Distinct().ToList().Count); + Assert.AreEqual(1, proteinsList.Select(l => l.Split('\t').Length).Distinct().ToList().Count); + + CollectionAssert.AreEquivalent(new string[] { "P52298", "Q15149", "Q7KZF4" }, proteins.Select(p => p.ProteinGroupName.ToArray())); + + Directory.Delete(outputDirectory, true); + } + + [Test] public static void TestFlashLfqQoutputRealData() { From 9f701d0098f9d1ad7f598e5ddb743e08c9683e0a Mon Sep 17 00:00:00 2001 From: Alex Date: Tue, 29 Jul 2025 16:39:23 -0500 Subject: [PATCH 2/2] Fixed negative intensity bug --- mzLib/FlashLFQ/IsotopicEnvelope.cs | 7 ++++--- mzLib/mzLib.nuspec | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/mzLib/FlashLFQ/IsotopicEnvelope.cs b/mzLib/FlashLFQ/IsotopicEnvelope.cs index 14c3775c5..f27f883b1 100644 --- a/mzLib/FlashLFQ/IsotopicEnvelope.cs +++ b/mzLib/FlashLFQ/IsotopicEnvelope.cs @@ -1,4 +1,5 @@ -using MassSpectrometry; +using System; +using MassSpectrometry; namespace FlashLFQ { @@ -17,7 +18,7 @@ public IsotopicEnvelope(IIndexedPeak monoisotopicPeak, int chargeState, double i { IndexedPeak = monoisotopicPeak; ChargeState = chargeState; - Intensity = intensity / chargeState; + Intensity = intensity / Math.Abs(chargeState); PearsonCorrelation = pearsonCorrelation; } @@ -25,7 +26,7 @@ public IsotopicEnvelope(IndexedMassSpectralPeak monoisotopicPeak, int chargeStat { IndexedPeak = monoisotopicPeak; ChargeState = chargeState; - Intensity = intensity / chargeState; + Intensity = intensity / Math.Abs(chargeState); PearsonCorrelation = pearsonCorrelation; } diff --git a/mzLib/mzLib.nuspec b/mzLib/mzLib.nuspec index 76019a72d..a4987e021 100644 --- a/mzLib/mzLib.nuspec +++ b/mzLib/mzLib.nuspec @@ -2,7 +2,7 @@ mzLib - 1.0.559 + 8.1.8 mzLib Stef S. Stef S.