Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
d21d2c5
Allow non-tryptic peptides to be associated with a protein so long as…
nickshulman Sep 24, 2024
2fffb19
Fix problem where results were being updated at wrong spot in the loop
nickshulman Sep 24, 2024
56373de
Fix incorrect use of "Parallel" instead of "ParallelEx"
nickshulman Sep 24, 2024
3a84298
Merge branch 'master' into Skyline/work/20240923_AssociateProteinsLes…
nickshulman Oct 4, 2024
b7a47f5
Merge branch 'master' into Skyline/work/20240923_AssociateProteinsLes…
nickshulman Oct 8, 2024
10f2de0
Fix intermittent failure in TestHugeAssociateProteins
nickshulman Oct 9, 2024
7b0e4e4
Use Tuple<string, bool> in "ProteinPeptideMatches"
nickshulman Oct 9, 2024
71c918b
Add "ProteinAssociationTest"
nickshulman Oct 9, 2024
a67a923
Delete "TwoProteins.fasta" and use TemporaryDirectory instead
nickshulman Oct 9, 2024
2b433d1
Use ParallelEx.For to enumerate over ProteinPeptideMatches objects
nickshulman Oct 9, 2024
6a0ae1b
Add method "ProteinAssociation.UseProteinSource"
nickshulman Oct 9, 2024
da5516b
Merge remote-tracking branch 'remotes/origin/master' into Skyline/wor…
nickshulman Oct 9, 2024
1a71b69
Change "AssociateProteins" to take an Enzyme instead of passing in a …
nickshulman Oct 9, 2024
4235938
Remove inaccurate comment
nickshulman Oct 9, 2024
e27a2f8
Use Enzyme from the document instead of passing it in.
nickshulman Oct 9, 2024
234c1b0
Fix TestAssociateProteins
nickshulman Oct 10, 2024
fb7a62c
Merge remote-tracking branch 'remotes/origin/master' into Skyline/wor…
nickshulman Oct 10, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 1 addition & 9 deletions pwiz_tools/Skyline/CommandLine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2274,14 +2274,6 @@ public bool MinimizeResults(CommandArgs commandArgs)
return true;
}

/// <summary>
/// Digests a protein sequence into peptides using the enzyme and the digest settings
/// taken from the peptide settings of <c>Document</c>.
/// </summary>
/// <param name="sequence">The protein sequence to digest.</param>
/// <returns>The peptides returned by the enzyme's <c>Digest</c> call.</returns>
private IEnumerable<Peptide> DigestProteinToPeptides(FastaSequence sequence)
{
    // CONSIDER: should the peptide length filters (Filter.MaxPeptideLength / Filter.MinPeptideLength)
    //           also be applied here? The old PeptidePerProteinDlg doesn't seem to use them.
    var settings = Document.Settings.PeptideSettings;
    var enzyme = settings.Enzyme;
    return enzyme.Digest(sequence, settings.DigestSettings);
}

private bool AssociateProteins(CommandArgs commandArgs)
{
return HandleExceptions(commandArgs, () =>
Expand All @@ -2292,7 +2284,7 @@ private bool AssociateProteins(CommandArgs commandArgs)
_out.WriteLine(Resources.CommandLine_AssociateProteins_Associating_peptides_with_proteins_from_FASTA_file__0_, Path.GetFileName(fastaPath));
var progressMonitor = new CommandProgressMonitor(_out, new ProgressStatus(String.Empty));
var proteinAssociation = new ProteinAssociation(Document, progressMonitor);
proteinAssociation.UseFastaFile(fastaPath, DigestProteinToPeptides, progressMonitor);
proteinAssociation.UseFastaFile(fastaPath, progressMonitor);
proteinAssociation.ApplyParsimonyOptions(commandArgs.AssociateProteinsGroupProteins.GetValueOrDefault(),
commandArgs.AssociateProteinsGeneLevelParsimony.GetValueOrDefault(),
commandArgs.AssociateProteinsFindMinimalProteinList.GetValueOrDefault(),
Expand Down
13 changes: 3 additions & 10 deletions pwiz_tools/Skyline/EditUI/AssociateProteinsDlg.cs
Original file line number Diff line number Diff line change
Expand Up @@ -393,14 +393,6 @@ private void numMinPeptides_ValueChanged(object sender, EventArgs e)
UpdateParsimonyResults();
}

/// <summary>
/// Digests a protein sequence into peptides using the enzyme and the digest settings
/// taken from the peptide settings of <c>_document</c>.
/// </summary>
/// <param name="sequence">The protein sequence to digest.</param>
/// <returns>The peptides returned by the enzyme's <c>Digest</c> call.</returns>
private IEnumerable<Peptide> DigestProteinToPeptides(FastaSequence sequence)
{
    // CONSIDER: should AssociateProteinsDlg apply the peptide length filters
    //           (Filter.MaxPeptideLength / Filter.MinPeptideLength)? The old PeptidePerProteinDlg doesn't seem to.
    var settings = _document.Settings.PeptideSettings;
    var enzyme = settings.Enzyme;
    return enzyme.Digest(sequence, settings.DigestSettings);
}
// find matches using the background proteome
public void UseBackgroundProteome()
{
Expand All @@ -420,7 +412,8 @@ public void UseBackgroundProteome()
{
using (var longWaitDlg = new LongWaitDlg())
{
longWaitDlg.PerformWork(this, 1000, broker => _proteinAssociation.UseBackgroundProteome(backgroundProteome, DigestProteinToPeptides, broker));
longWaitDlg.PerformWork(this, 1000, broker =>
_proteinAssociation.UseBackgroundProteome(backgroundProteome, broker));
if (longWaitDlg.IsCanceled)
return;
}
Expand Down Expand Up @@ -498,7 +491,7 @@ public void UseFastaFile(string file)
{
using (var longWaitDlg = new LongWaitDlg())
{
longWaitDlg.PerformWork(this, 1000, broker => _proteinAssociation.UseFastaFile(file, DigestProteinToPeptides, broker));
longWaitDlg.PerformWork(this, 1000, broker => _proteinAssociation.UseFastaFile(file, broker));
if (longWaitDlg.IsCanceled)
return;
}
Expand Down
149 changes: 97 additions & 52 deletions pwiz_tools/Skyline/Model/Proteome/ProteinAssociation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
using pwiz.ProteomeDatabase.DataModel;
using pwiz.ProteomeDatabase.Fasta;
using pwiz.Skyline.Model.AuditLog;
using pwiz.Skyline.Model.DocSettings;
using pwiz.Skyline.Properties;
using pwiz.Skyline.Util;
using pwiz.Skyline.Util.Extensions;
Expand Down Expand Up @@ -102,77 +103,54 @@ private void ResetMapping()
ParsimoniousProteins = null;
}

public void UseFastaFile(string file, Func<FastaSequence, IEnumerable<Peptide>> digestProteinToPeptides, ILongWaitBroker broker)
public void UseFastaFile(string file, ILongWaitBroker broker)
{
if (!File.Exists(file))
return;

ResetMapping();
using var stream = File.Open(file, FileMode.Open, FileAccess.Read, FileShare.Read);
var fastaSource = new FastaSource(stream);
var proteinAssociations = FindProteinMatches(fastaSource, digestProteinToPeptides, broker);
if (proteinAssociations != null)
{
AssociatedProteins = proteinAssociations;
}
UseProteinSource(fastaSource, _document.Settings.PeptideSettings.Enzyme, broker);
}

// find matches using the background proteome
public void UseBackgroundProteome(BackgroundProteome backgroundProteome, Func<FastaSequence, IEnumerable<Peptide>> digestProteinToPeptides, ILongWaitBroker broker)
public void UseBackgroundProteome(BackgroundProteome backgroundProteome, ILongWaitBroker broker)
{
if (backgroundProteome.Equals(BackgroundProteome.NONE))
throw new InvalidOperationException(Resources.AssociateProteinsDlg_UseBackgroundProteome_No_background_proteome_defined);

ResetMapping();
var proteome = backgroundProteome;
var proteinSource = new BackgroundProteomeSource(broker.CancellationToken, proteome);
var proteinAssociations = FindProteinMatches(proteinSource, digestProteinToPeptides, broker);
UseProteinSource(new BackgroundProteomeSource(broker.CancellationToken, proteome), _document.Settings.PeptideSettings.Enzyme, broker);
}

/// <summary>
/// Clears the current mapping and then looks for matches between the document's peptides
/// and the proteins supplied by <paramref name="proteinSource"/>.
/// </summary>
/// <param name="proteinSource">Provider of the protein records to search.</param>
/// <param name="enzyme">Enzyme handed to the matching step.</param>
/// <param name="broker">Progress/cancellation broker handed to the matching step.</param>
public void UseProteinSource(IProteinSource proteinSource, Enzyme enzyme, ILongWaitBroker broker)
{
    ResetMapping();

    var matchesByProtein = FindProteinMatches(proteinSource, enzyme, broker);
    if (matchesByProtein == null)
    {
        // No result was produced (FindProteinMatches returns null on cancel);
        // leave AssociatedProteins as ResetMapping left it.
        return;
    }

    AssociatedProteins = matchesByProtein;
}

private Dictionary<IProteinRecord, PeptideAssociationGroup> FindProteinMatches(IProteinSource proteinSource, Func<FastaSequence, IEnumerable<Peptide>> digestProteinToPeptides, ILongWaitBroker broker)
private Dictionary<IProteinRecord, PeptideAssociationGroup> FindProteinMatches(IProteinSource proteinSource, Enzyme enzyme, ILongWaitBroker broker)
{
var localResults = new MappingResultsInternal();
var peptideToProteins = new Dictionary<ReferenceValue<PeptideDocNode>, List<IProteinRecord>>();

var proteinAssociations = new Dictionary<IProteinRecord, PeptideAssociationGroup>();
int maxProgressValue = 0;
broker.Message = ProteomeResources.AssociateProteinsDlg_FindProteinMatchesWithFasta_Finding_peptides_in_FASTA_file;
var proteinPeptideMatchesDictionary = new Dictionary<int, ProteinPeptideMatches>();
var allEnzymaticPeptides = new HashSet<string>();

ParallelEx.ForEach(proteinSource.Proteins, fastaRecord =>
ParallelEx.ForEach(proteinSource.Proteins.Select(Tuple.Create<IProteinRecord, int>), fastaRecordIndex =>
{
var fastaRecord = fastaRecordIndex.Item1;
int progressValue = fastaRecord.Progress;
var fasta = fastaRecord.Sequence;
var trieResults = _peptideTrie.FindAll(fasta.Sequence);
var matches = new List<PeptideDocNode>();

// don't count the same peptide twice in a protein
var peptidesMatched = new HashSet<string>();

IList<Peptide> digestedPeptides = null;

foreach (var result in trieResults)
{
if (broker.IsCanceled)
{
break;
}
if (!peptidesMatched.Add(result.Keyword))
continue;

// check that peptide is in the digest of the protein (if the result is non-empty)
digestedPeptides ??= digestProteinToPeptides(fastaRecord.Sequence).ToList();
if (!digestedPeptides.Contains(p => p.Sequence == result.Keyword))
continue;

matches.AddRange(_peptideToPath[result.Keyword]);
}

var peptideAssociationGroup = new PeptideAssociationGroup(matches);

ProteinPeptideMatches proteinPeptideMatches = new ProteinPeptideMatches(fastaRecord, enzyme,
_peptideTrie.FindAll(fasta.Sequence).Select(result => result.Keyword).Distinct());
lock (localResults)
{
if (broker.IsCanceled)
Expand All @@ -184,25 +162,66 @@ private Dictionary<IProteinRecord, PeptideAssociationGroup> FindProteinMatches(I
maxProgressValue = Math.Max(maxProgressValue, progressValue);
}

if (matches.Count > 0)
{
proteinAssociations[fastaRecord] = peptideAssociationGroup;
++localResults.ProteinsMapped;
localResults.FinalPeptideCount += matches.Count;
proteinPeptideMatchesDictionary.Add(fastaRecordIndex.Item2, proteinPeptideMatches);
allEnzymaticPeptides.UnionWith(proteinPeptideMatches.EnzymaticPeptides);
}
});

var proteinPeptideMatchesList = proteinPeptideMatchesDictionary
.OrderBy(kvp => kvp.Key).Select(kvp => kvp.Value).ToList();
var peptideAssociationGroups = new PeptideAssociationGroup[proteinPeptideMatchesList.Count];
ParallelEx.For(0, proteinPeptideMatchesList.Count, iProtein =>
{
var proteinPeptideMatches = proteinPeptideMatchesList[iProtein];
var matches = new List<PeptideDocNode>();

foreach (var match in matches)
foreach (var peptideSequence in proteinPeptideMatches.CandidatePeptides)
{
if (!proteinPeptideMatches.EnzymaticPeptides.Contains(peptideSequence))
{
// The peptide could not have been digested by the enzyme from this protein sequence.
// Only skip it if there is at least one protein that could produce the digested peptide
if (allEnzymaticPeptides.Contains(peptideSequence))
{
if (!peptideToProteins.ContainsKey(match))
peptideToProteins.Add(match, new List<IProteinRecord> { fastaRecord });
else
peptideToProteins[match].Add(fastaRecord);
continue;
}
}
else
++localResults.ProteinsUnmapped;

matches.AddRange(_peptideToPath[peptideSequence]);
}
if (matches.Count > 0)
{
peptideAssociationGroups[iProtein] = new PeptideAssociationGroup(matches);
}
});

for (int iProtein = 0; iProtein < proteinPeptideMatchesList.Count; iProtein++)
{
var fastaRecord = proteinPeptideMatchesList[iProtein].ProteinRecord;
var peptideAssociationGroup = peptideAssociationGroups[iProtein];
if (peptideAssociationGroup == null)
{
++localResults.ProteinsUnmapped;
}
else
{
++localResults.ProteinsMapped;
localResults.FinalPeptideCount += peptideAssociationGroup.Peptides.Count;
proteinAssociations[fastaRecord] = peptideAssociationGroup;
foreach (var match in peptideAssociationGroup.Peptides)
{
if (peptideToProteins.TryGetValue(match, out var list))
{
list.Add(fastaRecord);
}
else
{
peptideToProteins.Add(match, new List<IProteinRecord>{fastaRecord});
}
}
}
}

if (broker.IsCanceled)
return null;

Expand All @@ -223,6 +242,31 @@ private Dictionary<IProteinRecord, PeptideAssociationGroup> FindProteinMatches(I
return proteinAssociations;
}

/// <summary>
/// For a single protein record, holds the candidate peptide sequences supplied by the caller
/// and the subset of those candidates that also appear in the enzyme's digest of the protein.
/// </summary>
private class ProteinPeptideMatches
{
    // Settings used for the digest below.
    // NOTE(review): presumably (max missed cleavages, exclude ragged ends) -- confirm against DigestSettings.
    private static readonly DigestSettings lenientDigestSettings = new DigestSettings(int.MaxValue, false);

    /// <summary>
    /// Stores the record and the candidates, then computes <see cref="EnzymaticPeptides"/>.
    /// </summary>
    /// <param name="proteinRecord">The protein whose sequence is digested.</param>
    /// <param name="enzyme">The enzyme used to digest the protein sequence.</param>
    /// <param name="candidatePeptides">Peptide sequences to test against the digest.</param>
    public ProteinPeptideMatches(IProteinRecord proteinRecord, Enzyme enzyme, IEnumerable<string> candidatePeptides)
    {
        ProteinRecord = proteinRecord;
        CandidatePeptides = ImmutableList.ValueOf(candidatePeptides);

        if (CandidatePeptides.Count == 0)
        {
            // Nothing to check, so the digest is skipped entirely.
            EnzymaticPeptides = Array.Empty<string>();
            return;
        }

        // The length of the longest candidate is passed to Digest as its third argument.
        var longestCandidate = CandidatePeptides.Max(sequence => sequence.Length);
        var digestedSequences = enzyme
            .Digest(proteinRecord.Sequence, lenientDigestSettings, longestCandidate)
            .Select(digested => digested.Sequence);
        EnzymaticPeptides = digestedSequences.Intersect(CandidatePeptides).ToHashSet();
    }

    /// <summary>The protein record passed to the constructor.</summary>
    public IProteinRecord ProteinRecord { get; }

    /// <summary>The candidate peptide sequences passed to the constructor.</summary>
    public ImmutableList<string> CandidatePeptides { get; }

    /// <summary>
    /// The candidates that are also produced by digesting the protein with the enzyme;
    /// empty when there are no candidates.
    /// </summary>
    public ICollection<string> EnzymaticPeptides { get; }
}

[XmlRoot("protein_association")]
public class ParsimonySettings : Immutable, IXmlSerializable, IValidating
{
Expand Down Expand Up @@ -1277,3 +1321,4 @@ public object GetDefaultObject(ObjectInfo<object> info)
}
}
}

Loading