Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
d21d2c5
Allow non-tryptic peptides to be associated with a protein so long as…
nickshulman Sep 24, 2024
2fffb19
Fix problem where results were being updated at wrong spot in the loop
nickshulman Sep 24, 2024
56373de
Fix incorrect use of "Parallel" instead of "ParallelEx"
nickshulman Sep 24, 2024
3a84298
Merge branch 'master' into Skyline/work/20240923_AssociateProteinsLes…
nickshulman Oct 4, 2024
b7a47f5
Merge branch 'master' into Skyline/work/20240923_AssociateProteinsLes…
nickshulman Oct 8, 2024
10f2de0
Fix intermittent failure in TestHugeAssociateProteins
nickshulman Oct 9, 2024
7b0e4e4
Use Tuple<string, bool> in "ProteinPeptideMatches"
nickshulman Oct 9, 2024
71c918b
Add "ProteinAssociationTest"
nickshulman Oct 9, 2024
a67a923
Delete "TwoProteins.fasta" and use TemporaryDirectory instead
nickshulman Oct 9, 2024
2b433d1
Use ParallelEx.For to enumerate over ProteinPeptideMatches objects
nickshulman Oct 9, 2024
6a0ae1b
Add method "ProteinAssociation.UseProteinSource"
nickshulman Oct 9, 2024
da5516b
Merge remote-tracking branch 'remotes/origin/master' into Skyline/wor…
nickshulman Oct 9, 2024
1a71b69
Change "AssociateProteins" to take an Enzyme instead of passing in a …
nickshulman Oct 9, 2024
4235938
Remove inaccurate comment
nickshulman Oct 9, 2024
e27a2f8
Use Enzyme from the document instead of passing it in.
nickshulman Oct 9, 2024
234c1b0
Fix TestAssociateProteins
nickshulman Oct 10, 2024
fb7a62c
Merge remote-tracking branch 'remotes/origin/master' into Skyline/wor…
nickshulman Oct 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 81 additions & 28 deletions pwiz_tools/Skyline/Model/Proteome/ProteinAssociation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -141,38 +141,35 @@ private Dictionary<IProteinRecord, PeptideAssociationGroup> FindProteinMatches(I
var proteinAssociations = new Dictionary<IProteinRecord, PeptideAssociationGroup>();
int maxProgressValue = 0;
broker.Message = ProteomeResources.AssociateProteinsDlg_FindProteinMatchesWithFasta_Finding_peptides_in_FASTA_file;
var proteinPeptideMatchesDictionary = new Dictionary<int, ProteinPeptideMatches>();

ParallelEx.ForEach(proteinSource.Proteins, fastaRecord =>
ParallelEx.ForEach(proteinSource.Proteins.Select(Tuple.Create<IProteinRecord, int>), fastaRecordIndex =>
{
var fastaRecord = fastaRecordIndex.Item1;
int progressValue = fastaRecord.Progress;
var proteinPeptideMatches = new ProteinPeptideMatches(fastaRecord);
var peptideHashSet = new HashSet<string>();
HashSet<string> digestedPeptides = null;
var fasta = fastaRecord.Sequence;
var trieResults = _peptideTrie.FindAll(fasta.Sequence);
var matches = new List<PeptideDocNode>();

// don't count the same peptide twice in a protein
var peptidesMatched = new HashSet<string>();

IList<Peptide> digestedPeptides = null;

foreach (var result in trieResults)
{
if (broker.IsCanceled)
{
break;
}
if (!peptidesMatched.Add(result.Keyword))
continue;

// check that peptide is in the digest of the protein (if the result is non-empty)
digestedPeptides ??= digestProteinToPeptides(fastaRecord.Sequence).ToList();
if (!digestedPeptides.Contains(p => p.Sequence == result.Keyword))
var peptideSequence = result.Keyword;
if (!peptideHashSet.Add(peptideSequence))
continue;

matches.AddRange(_peptideToPath[result.Keyword]);
// check whether peptide is in the digest of the protein (if the result is non-empty)
digestedPeptides ??= digestProteinToPeptides(fastaRecord.Sequence).Select(p => p.Sequence)
.ToHashSet();
bool matchesDigestSettings = digestedPeptides.Contains(peptideSequence);
proteinPeptideMatches.PeptideMatchesList.Add(Tuple.Create(peptideSequence, matchesDigestSettings));
}

var peptideAssociationGroup = new PeptideAssociationGroup(matches);

lock (localResults)
{
if (broker.IsCanceled)
Expand All @@ -184,24 +181,63 @@ private Dictionary<IProteinRecord, PeptideAssociationGroup> FindProteinMatches(I
maxProgressValue = Math.Max(maxProgressValue, progressValue);
}

if (matches.Count > 0)
if (proteinPeptideMatches.PeptideMatchesList.Count > 0)
{
proteinPeptideMatchesDictionary.Add(fastaRecordIndex.Item2, proteinPeptideMatches);
}
else
{
proteinAssociations[fastaRecord] = peptideAssociationGroup;
++localResults.ProteinsMapped;
localResults.FinalPeptideCount += matches.Count;
localResults.ProteinsUnmapped++;
}
}
});

// Make a HashSet of all peptide sequences which match the digest settings in any protein sequence
var peptidesThatMatchDigestSettingsForAnyProtein = proteinPeptideMatchesDictionary.Values
.SelectMany(matches => matches.MatchingPeptides).ToHashSet();
foreach (var proteinPeptideMatches in proteinPeptideMatchesDictionary.OrderBy(kvp => kvp.Key)
.Select(kvp => kvp.Value))
{
var fastaRecord = proteinPeptideMatches.ProteinRecord;
var matches = new List<PeptideDocNode>();

foreach (var match in matches)
foreach ((string peptideSequence, bool matchesSettings) in proteinPeptideMatches.PeptideMatchesList)
{
if (!matchesSettings)
{
// The peptide could not have been digested by the enzyme from this protein sequence.
// Only skip it if there is at least one protein that could produce the digested peptide
if (peptidesThatMatchDigestSettingsForAnyProtein.Contains(peptideSequence))
{
if (!peptideToProteins.ContainsKey(match))
peptideToProteins.Add(match, new List<IProteinRecord> { fastaRecord });
else
peptideToProteins[match].Add(fastaRecord);
continue;
}
}
else
++localResults.ProteinsUnmapped;

matches.AddRange(_peptideToPath[peptideSequence]);
}
});

var peptideAssociationGroup = new PeptideAssociationGroup(matches);

if (broker.IsCanceled)
return null;

if (matches.Count > 0)
{
proteinAssociations[fastaRecord] = peptideAssociationGroup;
++localResults.ProteinsMapped;
localResults.FinalPeptideCount += matches.Count;

foreach (var match in matches)
{
if (!peptideToProteins.ContainsKey(match))
peptideToProteins.Add(match, new List<IProteinRecord> { fastaRecord });
else
peptideToProteins[match].Add(fastaRecord);
}
}
else
++localResults.ProteinsUnmapped;
}

if (broker.IsCanceled)
return null;
Expand All @@ -223,6 +259,22 @@ private Dictionary<IProteinRecord, PeptideAssociationGroup> FindProteinMatches(I
return proteinAssociations;
}

/// <summary>
/// Collects the peptide sequences found inside a single protein record. Each entry pairs
/// the peptide sequence with a flag telling whether that peptide was present in the
/// digest of this protein.
/// </summary>
private class ProteinPeptideMatches
{
    /// <summary>
    /// Creates an empty set of matches for the given protein record.
    /// </summary>
    public ProteinPeptideMatches(IProteinRecord proteinRecord)
    {
        ProteinRecord = proteinRecord;
    }

    /// <summary>The protein record these matches belong to.</summary>
    public IProteinRecord ProteinRecord { get; }

    /// <summary>
    /// (peptide sequence, found in this protein's digest) pairs, in the order they were added.
    /// </summary>
    public List<Tuple<string, bool>> PeptideMatchesList { get; } = new List<Tuple<string, bool>>();

    /// <summary>
    /// Lazily enumerates only those peptide sequences whose flag is true.
    /// </summary>
    public IEnumerable<string> MatchingPeptides =>
        from entry in PeptideMatchesList
        where entry.Item2
        select entry.Item1;
}

[XmlRoot("protein_association")]
public class ParsimonySettings : Immutable, IXmlSerializable, IValidating
{
Expand Down Expand Up @@ -1277,3 +1329,4 @@ public object GetDefaultObject(ObjectInfo<object> info)
}
}
}

117 changes: 117 additions & 0 deletions pwiz_tools/Skyline/Test/ProteinAssociationTest.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
/*
* Original author: Nicholas Shulman <nicksh .at. u.washington.edu>,
* MacCoss Lab, Department of Genome Sciences, UW
*
* Copyright 2024 University of Washington - Seattle, WA
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
using System.Windows.Forms;
using Microsoft.VisualStudio.TestTools.UnitTesting;
using pwiz.Skyline.Model;
using pwiz.Skyline.Model.DocSettings;
using pwiz.Skyline.Model.Proteome;
using pwiz.Skyline.Properties;
using pwiz.Skyline.Util;
using pwiz.SkylineTestUtil;

namespace pwiz.SkylineTest
{
[TestClass]
public class ProteinAssociationTest : AbstractUnitTest
{
    /// <summary>
    /// Verifies how peptides are associated with proteins under different enzymes:
    /// a peptide that is not a digestion product of a protein is left out of that protein
    /// when some other protein can produce it by digestion, but is still associated when
    /// no protein in the FASTA file can produce it.
    /// </summary>
    [TestMethod]
    public void TestTrypticProteinAssociation()
    {
        TestFilesDir = new TestFilesDir(TestContext, @"Test\ProteinAssociationTest.data");
        var peptides = new[]
        {
            "ADLINNLGTIAK", "ELISNASDALDKIR", "FAFQAEVNR", "IDIIPNPQER", "LIINSLYK",
            "LISLTDENALSGNEELTVK", "NKEIFLR", "NLLHVTDTGVGMTR", "SGTSEFLNK", "TDDEVVQREEEAIQLDGLNASQIR",
            "KYSQFINFPIYVWSSK"
        };
        var document = CreateDocumentWithPeptides(peptides);
        string fastaFilePath = TestFilesDir.GetTestPath("TwoProteins.fasta");

        // Associate proteins using Trypsin. The peptide "NKEIFLR" is only tryptic for the first protein
        var trypsin = EnzymeList.GetDefault();
        var trypsinAssociatedProteins = AssociateProteins(document, fastaFilePath, trypsin);
        CollectionAssert.Contains(trypsinAssociatedProteins["Protein1"], "NKEIFLR");
        CollectionAssert.DoesNotContain(trypsinAssociatedProteins["Protein2"], "NKEIFLR");
        CollectionAssert.AreEquivalent(new[] { "ADLINNLGTIAK", "ELISNASDALDKIR", "IDIIPNPQER" },
            trypsinAssociatedProteins["Protein2"]);

        // Now associate proteins using Chymotrypsin. The peptide "NKEIFLR" is not chymotryptic for either protein,
        // so it is expected to be associated with both of them.
        var chymotrypsin = new Enzyme("Chymotrypsin", "FWYL", "P");
        var chymotrypsinAssociatedProteins = AssociateProteins(document, fastaFilePath, chymotrypsin);
        // These assertions must inspect the chymotrypsin results, not the trypsin results from above.
        CollectionAssert.Contains(chymotrypsinAssociatedProteins["Protein1"], "NKEIFLR");
        CollectionAssert.Contains(chymotrypsinAssociatedProteins["Protein2"], "NKEIFLR");
        CollectionAssert.AreEquivalent(new[] { "ADLINNLGTIAK", "ELISNASDALDKIR", "IDIIPNPQER", "NKEIFLR" },
            chymotrypsinAssociatedProteins["Protein2"]);
    }

    /// <summary>
    /// Runs protein association for the document against the FASTA file, digesting each protein
    /// with the given enzyme, and returns the associated peptide sequences keyed by protein name.
    /// </summary>
    /// <param name="document">Document whose peptides are to be associated with proteins.</param>
    /// <param name="fastaFilePath">Path of the FASTA file supplying the protein sequences.</param>
    /// <param name="enzyme">Enzyme used to digest each protein sequence.</param>
    /// <returns>Map from protein name to the list of peptide sequences associated with it.</returns>
    private static Dictionary<string, List<string>> AssociateProteins(SrmDocument document, string fastaFilePath, Enzyme enzyme)
    {
        var lenientDigestSettings = new DigestSettings(DigestSettings.MAX_MISSED_CLEAVAGES, false);
        var proteinAssociation = new ProteinAssociation(document, new LongWaitBrokerImpl());
        proteinAssociation.UseFastaFile(fastaFilePath, proteinSequence => enzyme.Digest(proteinSequence, lenientDigestSettings), new LongWaitBrokerImpl());
        return proteinAssociation.AssociatedProteins.ToDictionary(
            kvp => kvp.Key.Sequence.Name, kvp => kvp.Value.Peptides.Select(p => p.Peptide.Sequence).ToList());
    }

    /// <summary>
    /// Builds a document with default settings containing a single peptide list named
    /// "Peptide List" that holds one peptide node per supplied sequence.
    /// </summary>
    /// <param name="peptides">Peptide sequences to add to the document.</param>
    private static SrmDocument CreateDocumentWithPeptides(IEnumerable<string> peptides)
    {
        var settings = SrmSettingsList.GetDefault();
        var peptideDocNodes = new List<PeptideDocNode>();
        foreach (var peptideSequence in peptides)
        {
            var peptideDocNode =
                new PeptideDocNode(new Peptide(peptideSequence)).ChangeSettings(settings, SrmSettingsDiff.ALL);
            peptideDocNodes.Add(peptideDocNode);
        }

        var peptideGroupDocNode = new PeptideGroupDocNode(new PeptideGroup(), Annotations.EMPTY, "Peptide List",
            null, peptideDocNodes.ToArray());
        return (SrmDocument)new SrmDocument(settings).ChangeChildren(new DocNode[] { peptideGroupDocNode });
    }

    /// <summary>
    /// Minimal <see cref="ILongWaitBroker"/> for tests: never reports cancellation, ignores
    /// progress updates, and does not support showing dialogs.
    /// </summary>
    private class LongWaitBrokerImpl : ILongWaitBroker
    {
        public bool IsCanceled
        {
            get { return false; }
        }
        public int ProgressValue { get; set; }
        public string Message { get; set; }
        public bool IsDocumentChanged(SrmDocument docOrig)
        {
            return false;
        }

        public DialogResult ShowDialog(Func<IWin32Window, DialogResult> show)
        {
            // Showing UI is not expected during this test.
            throw new InvalidOperationException();
        }

        public void SetProgressCheckCancel(int step, int totalSteps)
        {
        }

        public CancellationToken CancellationToken => CancellationToken.None;
    }
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
>Protein1
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we want a pattern of many unit tests having their own .data directory with small test data files in them? I think you could just write this out to a temp file directly from the code to keep the repo tidier. I love the new ability to keep things in .data dir instead of a .zip file, but think we should still use it judiciously. Like .sky files. Those would be a pain to write directly from test code (as xml I mean; the settings can be generated programmatically). As would any big DSV file.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only way I know to create and clean up a temporary directory is by using "TestFilesDir".
Do you know of an easier way to do that which does not require either a .zip file or .data folder?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about:
using var testDir = new TemporaryDirectory(Path.Combine(TestContext.TestRunDirectory, TestContext.TestName));
If it works we could definitely have a shortcut for that in AbstractUnitTest. :)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, that works. Thanks!
I am probably going to add a method to ProteinAssociation which takes a ProteinSource so that it can be used without any file on disk.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, TestContext.TestRunDirectory is null when running from TestRunner.
I added a method ProteinAssociation.UseProteinSource so I do not need a file on disk.

MRALWVLGLCCVLLTFGSVRADDEVDVDGTVEEDLGKSREGSRTDDEVVQREEEAIQLDG
LNASQIRELREKSEKFAFQAEVNRMMKLIINSLYKNKEIFLRELISNASDALDKIRLISL
TDENALSGNEELTVKIKCDKEKNLLHVTDTGVGMTREELVKNLGTIAKSGTSEFLNKMTE
>Protein2
MPEEVHHGEEEVETFAFQAEIAQLMSLIINTFYSNKEIFLRELISNASDALDKIRYESLT
DPSKLDSGKELKIDIIPNPQERTLTLVDTGIGMTKADLINNLGTIAKSGTKAFMEALQAG
ADISMIGQFGVGFYSAYLVAEKVVVITKHNDDEQYAWESSAGGSFTVRADHGEPIGRGTK
2 changes: 2 additions & 0 deletions pwiz_tools/Skyline/Test/Test.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@
<Compile Include="FeatureTooltipTest.cs" />
<Compile Include="CommandLinePeakBoundaryTest.cs" />
<Compile Include="PeptideTest.cs" />
<Compile Include="ProteinAssociationTest.cs" />
<Compile Include="ProtocolBuffers\GeneratedCode\LegacySkylineDocumentProto.cs" />
<Compile Include="ProtocolBuffers\LegacyProtocolBufferFormatsTest.cs" />
<Compile Include="ReferenceValueTest.cs" />
Expand Down Expand Up @@ -370,6 +371,7 @@
<None Include="OnDemandFeatureCalculatorTest.zip" />
<None Include="PeakBoundaryTest.zip" />
<None Include="PeakGroupIntegratorTest.zip" />
<None Include="ProteinAssociationTest.data\TwoProteins.fasta" />
<None Include="Proteome\ProteomeDbTest.zip" />
<EmbeddedResource Include="Reporting\HeavyLabeledLeucine.sky" />
<EmbeddedResource Include="Reporting\PivotIsotopeLabel.skyr" />
Expand Down