diff --git a/pwiz_tools/Skyline/CommandLine.cs b/pwiz_tools/Skyline/CommandLine.cs index 13f4090798e..0042d929d88 100644 --- a/pwiz_tools/Skyline/CommandLine.cs +++ b/pwiz_tools/Skyline/CommandLine.cs @@ -2274,14 +2274,6 @@ public bool MinimizeResults(CommandArgs commandArgs) return true; } - private IEnumerable DigestProteinToPeptides(FastaSequence sequence) - { - var peptideSettings = Document.Settings.PeptideSettings; - return peptideSettings.Enzyme.Digest(sequence, peptideSettings.DigestSettings); - // CONSIDER: should AssociateProteinsDlg use the length filters? The old PeptidePerProteinDlg doesn't seem to. - //peptideSettings.Filter.MaxPeptideLength, peptideSettings.Filter.MinPeptideLength); - } - private bool AssociateProteins(CommandArgs commandArgs) { return HandleExceptions(commandArgs, () => @@ -2292,7 +2284,7 @@ private bool AssociateProteins(CommandArgs commandArgs) _out.WriteLine(Resources.CommandLine_AssociateProteins_Associating_peptides_with_proteins_from_FASTA_file__0_, Path.GetFileName(fastaPath)); var progressMonitor = new CommandProgressMonitor(_out, new ProgressStatus(String.Empty)); var proteinAssociation = new ProteinAssociation(Document, progressMonitor); - proteinAssociation.UseFastaFile(fastaPath, DigestProteinToPeptides, progressMonitor); + proteinAssociation.UseFastaFile(fastaPath, progressMonitor); proteinAssociation.ApplyParsimonyOptions(commandArgs.AssociateProteinsGroupProteins.GetValueOrDefault(), commandArgs.AssociateProteinsGeneLevelParsimony.GetValueOrDefault(), commandArgs.AssociateProteinsFindMinimalProteinList.GetValueOrDefault(), diff --git a/pwiz_tools/Skyline/EditUI/AssociateProteinsDlg.cs b/pwiz_tools/Skyline/EditUI/AssociateProteinsDlg.cs index de06a28d6ea..937a65ee91f 100644 --- a/pwiz_tools/Skyline/EditUI/AssociateProteinsDlg.cs +++ b/pwiz_tools/Skyline/EditUI/AssociateProteinsDlg.cs @@ -393,14 +393,6 @@ private void numMinPeptides_ValueChanged(object sender, EventArgs e) UpdateParsimonyResults(); } - private IEnumerable DigestProteinToPeptides(FastaSequence sequence) - { - var peptideSettings = _document.Settings.PeptideSettings; - return peptideSettings.Enzyme.Digest(sequence, peptideSettings.DigestSettings); - // CONSIDER: should AssociateProteinsDlg use the length filters? The old PeptidePerProteinDlg doesn't seem to. - //peptideSettings.Filter.MaxPeptideLength, peptideSettings.Filter.MinPeptideLength); - } - // find matches using the background proteome public void UseBackgroundProteome() { @@ -420,7 +412,8 @@ public void UseBackgroundProteome() { using (var longWaitDlg = new LongWaitDlg()) { - longWaitDlg.PerformWork(this, 1000, broker => _proteinAssociation.UseBackgroundProteome(backgroundProteome, DigestProteinToPeptides, broker)); + longWaitDlg.PerformWork(this, 1000, broker => + _proteinAssociation.UseBackgroundProteome(backgroundProteome, broker)); if (longWaitDlg.IsCanceled) return; } @@ -498,7 +491,7 @@ public void UseFastaFile(string file) { using (var longWaitDlg = new LongWaitDlg()) { - longWaitDlg.PerformWork(this, 1000, broker => _proteinAssociation.UseFastaFile(file, DigestProteinToPeptides, broker)); + longWaitDlg.PerformWork(this, 1000, broker => _proteinAssociation.UseFastaFile(file, broker)); if (longWaitDlg.IsCanceled) return; } diff --git a/pwiz_tools/Skyline/Model/Proteome/ProteinAssociation.cs b/pwiz_tools/Skyline/Model/Proteome/ProteinAssociation.cs index fa46317d9a1..31aad7d8102 100644 --- a/pwiz_tools/Skyline/Model/Proteome/ProteinAssociation.cs +++ b/pwiz_tools/Skyline/Model/Proteome/ProteinAssociation.cs @@ -31,6 +31,7 @@ using pwiz.ProteomeDatabase.DataModel; using pwiz.ProteomeDatabase.Fasta; using pwiz.Skyline.Model.AuditLog; +using pwiz.Skyline.Model.DocSettings; using pwiz.Skyline.Properties; using pwiz.Skyline.Util; using pwiz.Skyline.Util.Extensions; @@ -102,38 +103,37 @@ private void ResetMapping() ParsimoniousProteins = null; } - public void UseFastaFile(string file, Func> digestProteinToPeptides, ILongWaitBroker broker) + public void UseFastaFile(string file, ILongWaitBroker broker) { if (!File.Exists(file)) return; - ResetMapping(); using var stream = File.Open(file, FileMode.Open, FileAccess.Read, FileShare.Read); var fastaSource = new FastaSource(stream); - var proteinAssociations = FindProteinMatches(fastaSource, digestProteinToPeptides, broker); - if (proteinAssociations != null) - { - AssociatedProteins = proteinAssociations; - } + UseProteinSource(fastaSource, _document.Settings.PeptideSettings.Enzyme, broker); } // find matches using the background proteome - public void UseBackgroundProteome(BackgroundProteome backgroundProteome, Func> digestProteinToPeptides, ILongWaitBroker broker) + public void UseBackgroundProteome(BackgroundProteome backgroundProteome, ILongWaitBroker broker) { if (backgroundProteome.Equals(BackgroundProteome.NONE)) throw new InvalidOperationException(Resources.AssociateProteinsDlg_UseBackgroundProteome_No_background_proteome_defined); - ResetMapping(); var proteome = backgroundProteome; - var proteinSource = new BackgroundProteomeSource(broker.CancellationToken, proteome); - var proteinAssociations = FindProteinMatches(proteinSource, digestProteinToPeptides, broker); + UseProteinSource(new BackgroundProteomeSource(broker.CancellationToken, proteome), _document.Settings.PeptideSettings.Enzyme, broker); + } + + public void UseProteinSource(IProteinSource proteinSource, Enzyme enzyme, ILongWaitBroker broker) + { + ResetMapping(); + var proteinAssociations = FindProteinMatches(proteinSource, enzyme, broker); if (proteinAssociations != null) { AssociatedProteins = proteinAssociations; } } - private Dictionary FindProteinMatches(IProteinSource proteinSource, Func> digestProteinToPeptides, ILongWaitBroker broker) + private Dictionary FindProteinMatches(IProteinSource proteinSource, Enzyme enzyme, ILongWaitBroker broker) { var localResults = new MappingResultsInternal(); var peptideToProteins = new Dictionary, List>(); @@ -141,38 +141,16 @@ private Dictionary FindProteinMatches(I var proteinAssociations = new Dictionary(); int maxProgressValue = 0; broker.Message = ProteomeResources.AssociateProteinsDlg_FindProteinMatchesWithFasta_Finding_peptides_in_FASTA_file; + var proteinPeptideMatchesDictionary = new Dictionary(); + var allEnzymaticPeptides = new HashSet(); - ParallelEx.ForEach(proteinSource.Proteins, fastaRecord => + ParallelEx.ForEach(proteinSource.Proteins.Select(Tuple.Create), fastaRecordIndex => { + var fastaRecord = fastaRecordIndex.Item1; int progressValue = fastaRecord.Progress; var fasta = fastaRecord.Sequence; - var trieResults = _peptideTrie.FindAll(fasta.Sequence); - var matches = new List(); - - // don't count the same peptide twice in a protein - var peptidesMatched = new HashSet(); - - IList digestedPeptides = null; - - foreach (var result in trieResults) - { - if (broker.IsCanceled) - { - break; - } - if (!peptidesMatched.Add(result.Keyword)) - continue; - - // check that peptide is in the digest of the protein (if the result is non-empty) - digestedPeptides ??= digestProteinToPeptides(fastaRecord.Sequence).ToList(); - if (!digestedPeptides.Contains(p => p.Sequence == result.Keyword)) - continue; - - matches.AddRange(_peptideToPath[result.Keyword]); - } - - var peptideAssociationGroup = new PeptideAssociationGroup(matches); - + ProteinPeptideMatches proteinPeptideMatches = new ProteinPeptideMatches(fastaRecord, enzyme, + _peptideTrie.FindAll(fasta.Sequence).Select(result => result.Keyword).Distinct()); lock (localResults) { if (broker.IsCanceled) @@ -184,25 +162,66 @@ private Dictionary FindProteinMatches(I maxProgressValue = Math.Max(maxProgressValue, progressValue); } - if (matches.Count > 0) - { - proteinAssociations[fastaRecord] = peptideAssociationGroup; - ++localResults.ProteinsMapped; - localResults.FinalPeptideCount += matches.Count; + proteinPeptideMatchesDictionary.Add(fastaRecordIndex.Item2, proteinPeptideMatches); + allEnzymaticPeptides.UnionWith(proteinPeptideMatches.EnzymaticPeptides); + } + }); + + var proteinPeptideMatchesList = proteinPeptideMatchesDictionary + .OrderBy(kvp => kvp.Key).Select(kvp => kvp.Value).ToList(); + var peptideAssociationGroups = new PeptideAssociationGroup[proteinPeptideMatchesList.Count]; + ParallelEx.For(0, proteinPeptideMatchesList.Count, iProtein => + { + var proteinPeptideMatches = proteinPeptideMatchesList[iProtein]; + var matches = new List(); - foreach (var match in matches) + foreach (var peptideSequence in proteinPeptideMatches.CandidatePeptides) + { + if (!proteinPeptideMatches.EnzymaticPeptides.Contains(peptideSequence)) + { + // The peptide could not have been digested by the enzyme from this protein sequence. + // Only skip it if there is at least one protein that could produce the digested peptide + if (allEnzymaticPeptides.Contains(peptideSequence)) { - if (!peptideToProteins.ContainsKey(match)) - peptideToProteins.Add(match, new List { fastaRecord }); - else - peptideToProteins[match].Add(fastaRecord); + continue; } } - else - ++localResults.ProteinsUnmapped; + + matches.AddRange(_peptideToPath[peptideSequence]); + } + if (matches.Count > 0) + { + peptideAssociationGroups[iProtein] = new PeptideAssociationGroup(matches); } }); + for (int iProtein = 0; iProtein < proteinPeptideMatchesList.Count; iProtein++) + { + var fastaRecord = proteinPeptideMatchesList[iProtein].ProteinRecord; + var peptideAssociationGroup = peptideAssociationGroups[iProtein]; + if (peptideAssociationGroup == null) + { + ++localResults.ProteinsUnmapped; + } + else + { + ++localResults.ProteinsMapped; + localResults.FinalPeptideCount += peptideAssociationGroup.Peptides.Count; + proteinAssociations[fastaRecord] = peptideAssociationGroup; + foreach (var match in peptideAssociationGroup.Peptides) + { + if (peptideToProteins.TryGetValue(match, out var list)) + { + list.Add(fastaRecord); + } + else + { + peptideToProteins.Add(match, new List{fastaRecord}); + } + } + } + } + if (broker.IsCanceled) return null; @@ -223,6 +242,31 @@ private Dictionary FindProteinMatches(I return proteinAssociations; } + private class ProteinPeptideMatches + { + private static readonly DigestSettings lenientDigestSettings = new DigestSettings(int.MaxValue, false); + public ProteinPeptideMatches(IProteinRecord proteinRecord, Enzyme enzyme, IEnumerable candidatePeptides) + { + ProteinRecord = proteinRecord; + CandidatePeptides = ImmutableList.ValueOf(candidatePeptides); + if (CandidatePeptides.Count > 0) + { + var maxPeptideLength = CandidatePeptides.Max(peptide => peptide.Length); + EnzymaticPeptides = enzyme + .Digest(proteinRecord.Sequence, lenientDigestSettings, maxPeptideLength) + .Select(peptide => peptide.Sequence).Intersect(CandidatePeptides).ToHashSet(); + } + else + { + EnzymaticPeptides = Array.Empty(); + } + } + + public IProteinRecord ProteinRecord { get; } + public ImmutableList CandidatePeptides { get; } + public ICollection EnzymaticPeptides { get; } + } + [XmlRoot("protein_association")] public class ParsimonySettings : Immutable, IXmlSerializable, IValidating { @@ -1277,3 +1321,4 @@ public object GetDefaultObject(ObjectInfo info) } } } + diff --git a/pwiz_tools/Skyline/Test/ProteinAssociationTest.cs b/pwiz_tools/Skyline/Test/ProteinAssociationTest.cs new file mode 100644 index 00000000000..c595c02fa13 --- /dev/null +++ b/pwiz_tools/Skyline/Test/ProteinAssociationTest.cs @@ -0,0 +1,128 @@ +/* + * Original author: Nicholas Shulman , + * MacCoss Lab, Department of Genome Sciences, UW + * + * Copyright 2024 University of Washington - Seattle, WA + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +using System; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using System.Text; +using System.Threading; +using System.Windows.Forms; +using Microsoft.VisualStudio.TestTools.UnitTesting; +using pwiz.Skyline.Model; +using pwiz.Skyline.Model.DocSettings; +using pwiz.Skyline.Model.Proteome; +using pwiz.Skyline.Properties; +using pwiz.Skyline.Util; +using pwiz.SkylineTestUtil; + +namespace pwiz.SkylineTest +{ + [TestClass] + public class ProteinAssociationTest : AbstractUnitTest + { + [TestMethod] + public void TestTrypticProteinAssociation() + { + var fastaFileText = @">Protein1 +MRALWVLGLCCVLLTFGSVRADDEVDVDGTVEEDLGKSREGSRTDDEVVQREEEAIQLDG +LNASQIRELREKSEKFAFQAEVNRMMKLIINSLYKNKEIFLRELISNASDALDKIRLISL +TDENALSGNEELTVKIKCDKEKNLLHVTDTGVGMTREELVKNLGTIAKSGTSEFLNKMTE +>Protein2 +MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNASDALDKIRYESLT +DPSKLDSGKELKIDIIPNPQERTLTLVDTGIGMTKADLINNLGTIAKSGTKAFMEALQAG +ADISMIGQFGVGFYSAYLVAEKVVVITKHNDDEQYAWESSAGGSFTVRADHGEPIGRGTK"; + var peptides = new[] + { + "ADLINNLGTIAK", "ELISNASDALDKIR", "FAFQAEVNR", "IDIIPNPQER", "LIINSLYK", + "LISLTDENALSGNEELTVK", "NKEIFLR", "NLLHVTDTGVGMTR", "SGTSEFLNK", "TDDEVVQREEEAIQLDGLNASQIR", + "KYSQFINFPIYVWSSK" + }; + + var document = CreateDocumentWithPeptides(peptides); + + // Associate proteins using Trypsin. The peptide "NKEIFLR" is only tryptic for the first protein + var trypsin = EnzymeList.GetDefault(); + var trypsinAssociatedProteins = AssociateProteins(document, fastaFileText, trypsin); + CollectionAssert.Contains(trypsinAssociatedProteins["Protein1"], "NKEIFLR"); + CollectionAssert.DoesNotContain(trypsinAssociatedProteins["Protein2"], "NKEIFLR"); + CollectionAssert.AreEquivalent(new[] { "ADLINNLGTIAK", "ELISNASDALDKIR", "IDIIPNPQER" }, + trypsinAssociatedProteins["Protein2"]); + + // Now associate proteins using Chymotrypsin. The peptide "NKEIFLR" is not chymotryptic for either protein + var chymotrypsin = new Enzyme("Chymotrypsin", "FWYL", "P"); + var chymotrypsinAssociatedProteins = AssociateProteins(document, fastaFileText, chymotrypsin); + CollectionAssert.Contains(chymotrypsinAssociatedProteins["Protein1"], "NKEIFLR"); + CollectionAssert.Contains(chymotrypsinAssociatedProteins["Protein2"], "NKEIFLR"); + CollectionAssert.AreEquivalent(new[] { "ADLINNLGTIAK", "ELISNASDALDKIR", "IDIIPNPQER", "NKEIFLR" }, + chymotrypsinAssociatedProteins["Protein2"]); + } + + private static Dictionary> AssociateProteins(SrmDocument document, string fastaFileText, Enzyme enzyme) + { + var fastaProteinSource = + new ProteinAssociation.FastaSource(new MemoryStream(Encoding.UTF8.GetBytes(fastaFileText))); + + var proteinAssociation = new ProteinAssociation(document, new LongWaitBrokerImpl()); + proteinAssociation.UseProteinSource(fastaProteinSource, enzyme, new LongWaitBrokerImpl()); + return proteinAssociation.AssociatedProteins.ToDictionary( + kvp => kvp.Key.Sequence.Name, kvp => kvp.Value.Peptides.Select(p => p.Peptide.Sequence).ToList()); + } + + private static SrmDocument CreateDocumentWithPeptides(IEnumerable peptides) + { + var settings = SrmSettingsList.GetDefault(); + var peptideDocNodes = new List(); + foreach (var peptideSequence in peptides) + { + var peptideDocNode = + new PeptideDocNode(new Peptide(peptideSequence)).ChangeSettings(settings, SrmSettingsDiff.ALL); + peptideDocNodes.Add(peptideDocNode); + } + + var peptideGroupDocNode = new PeptideGroupDocNode(new PeptideGroup(), Annotations.EMPTY, "Peptide List", + null, peptideDocNodes.ToArray()); + return (SrmDocument)new SrmDocument(settings).ChangeChildren(new DocNode[] { peptideGroupDocNode }); + } + + private class LongWaitBrokerImpl : ILongWaitBroker + { + public bool IsCanceled + { + get { return false; } + } + public int ProgressValue { get; set; } + public string Message { get; set; } + public bool IsDocumentChanged(SrmDocument docOrig) + { + return false; + } + + public DialogResult ShowDialog(Func show) + { + throw new InvalidOperationException(); + } + + public void SetProgressCheckCancel(int step, int totalSteps) + { + } + + public CancellationToken CancellationToken => CancellationToken.None; + } +} +} diff --git a/pwiz_tools/Skyline/Test/Test.csproj b/pwiz_tools/Skyline/Test/Test.csproj index 2829f47d2ea..c492f6b79af 100644 --- a/pwiz_tools/Skyline/Test/Test.csproj +++ b/pwiz_tools/Skyline/Test/Test.csproj @@ -182,6 +182,7 @@ +