Skip to content
Merged
Changes from 4 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
d21d2c5
Allow non-tryptic peptides to be associated with a protein so long as…
nickshulman Sep 24, 2024
2fffb19
Fix problem where results were being updated at wrong spot in the loop
nickshulman Sep 24, 2024
56373de
Fix incorrect use of "Parallel" instead of "ParallelEx"
nickshulman Sep 24, 2024
3a84298
Merge branch 'master' into Skyline/work/20240923_AssociateProteinsLes…
nickshulman Oct 4, 2024
b7a47f5
Merge branch 'master' into Skyline/work/20240923_AssociateProteinsLes…
nickshulman Oct 8, 2024
10f2de0
Fix intermittent failure in TestHugeAssociateProteins
nickshulman Oct 9, 2024
7b0e4e4
Use Tuple<string, bool> in "ProteinPeptideMatches"
nickshulman Oct 9, 2024
71c918b
Add "ProteinAssociationTest"
nickshulman Oct 9, 2024
a67a923
Delete "TwoProteins.fasta" and use TemporaryDirectory instead
nickshulman Oct 9, 2024
2b433d1
Use ParallelEx.For to enumerate over ProteinPeptideMatches objects
nickshulman Oct 9, 2024
6a0ae1b
Add method "ProteinAssociation.UseProteinSource"
nickshulman Oct 9, 2024
da5516b
Merge remote-tracking branch 'remotes/origin/master' into Skyline/wor…
nickshulman Oct 9, 2024
1a71b69
Change "AssociateProteins" to take an Enzyme instead of passing in a …
nickshulman Oct 9, 2024
4235938
Remove inaccurate comment
nickshulman Oct 9, 2024
e27a2f8
Use Enzyme from the document instead of passing it in.
nickshulman Oct 9, 2024
234c1b0
Fix TestAssociateProteins
nickshulman Oct 10, 2024
fb7a62c
Merge remote-tracking branch 'remotes/origin/master' into Skyline/wor…
nickshulman Oct 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 95 additions & 15 deletions pwiz_tools/Skyline/Model/Proteome/ProteinAssociation.cs
Original file line number Diff line number Diff line change
Expand Up @@ -141,38 +141,34 @@ private Dictionary<IProteinRecord, PeptideAssociationGroup> FindProteinMatches(I
var proteinAssociations = new Dictionary<IProteinRecord, PeptideAssociationGroup>();
int maxProgressValue = 0;
broker.Message = ProteomeResources.AssociateProteinsDlg_FindProteinMatchesWithFasta_Finding_peptides_in_FASTA_file;
var proteinPeptideMatchesList = new List<ProteinPeptideMatches>();

ParallelEx.ForEach(proteinSource.Proteins, fastaRecord =>
{
int progressValue = fastaRecord.Progress;
var proteinPeptideMatches = new ProteinPeptideMatches(fastaRecord);
var peptideHashSet = new HashSet<string>();
HashSet<string> digestedPeptides = null;
var fasta = fastaRecord.Sequence;
var trieResults = _peptideTrie.FindAll(fasta.Sequence);
var matches = new List<PeptideDocNode>();

// don't count the same peptide twice in a protein
var peptidesMatched = new HashSet<string>();

IList<Peptide> digestedPeptides = null;

foreach (var result in trieResults)
{
if (broker.IsCanceled)
{
break;
}
if (!peptidesMatched.Add(result.Keyword))
continue;

// check that peptide is in the digest of the protein (if the result is non-empty)
digestedPeptides ??= digestProteinToPeptides(fastaRecord.Sequence).ToList();
if (!digestedPeptides.Contains(p => p.Sequence == result.Keyword))
var peptideSequence = result.Keyword;
if (!peptideHashSet.Add(peptideSequence))
continue;

matches.AddRange(_peptideToPath[result.Keyword]);
// check whether peptide is in the digest of the protein (if the result is non-empty)
digestedPeptides ??= digestProteinToPeptides(fastaRecord.Sequence).Select(p => p.Sequence)
.ToHashSet();
bool matchesDigestSettings = digestedPeptides.Contains(peptideSequence);
proteinPeptideMatches.AddPeptideSequence(peptideSequence, matchesDigestSettings);
}

var peptideAssociationGroup = new PeptideAssociationGroup(matches);

lock (localResults)
{
if (broker.IsCanceled)
Expand All @@ -184,6 +180,48 @@ private Dictionary<IProteinRecord, PeptideAssociationGroup> FindProteinMatches(I
maxProgressValue = Math.Max(maxProgressValue, progressValue);
}

if (proteinPeptideMatches.PeptideSequenceCount > 0)
{
proteinPeptideMatchesList.Add(proteinPeptideMatches);
}
else
{
localResults.ProteinsUnmapped++;
}
}
});

// Make a HashSet of all peptide sequences which match the digest settings in any protein sequence
var peptidesThatMatchDigestSettingsForAnyProtein = proteinPeptideMatchesList
.SelectMany(matches => matches.PeptidesMatchingDigestSettings).ToHashSet();

ParallelEx.ForEach(proteinPeptideMatchesList, proteinPeptideMatches =>
{
var fastaRecord = proteinPeptideMatches.ProteinRecord;
var matches = new List<PeptideDocNode>();

for (int iPeptide = 0; iPeptide < proteinPeptideMatches.PeptideSequenceCount; iPeptide++)
{
var peptideSequence = proteinPeptideMatches.GetPeptideSequence(iPeptide);
if (!proteinPeptideMatches.MatchesDigestSettings(iPeptide))
{
// The peptide could not have been digested by the enzyme from this protein sequence
// Only skip it if there is at least one protein that could produce the digested peptide
if (peptidesThatMatchDigestSettingsForAnyProtein.Contains(peptideSequence))
{
continue;
}
}
matches.AddRange(_peptideToPath[peptideSequence]);
}

var peptideAssociationGroup = new PeptideAssociationGroup(matches);

lock (localResults)
{
if (broker.IsCanceled)
return;

if (matches.Count > 0)
{
proteinAssociations[fastaRecord] = peptideAssociationGroup;
Expand Down Expand Up @@ -223,6 +261,47 @@ private Dictionary<IProteinRecord, PeptideAssociationGroup> FindProteinMatches(I
return proteinAssociations;
}

/// <summary>
/// Records, for a single protein record, the peptide sequences that were added for it
/// together with a flag per sequence saying whether that sequence matched the digest settings.
/// The two lists are kept in step: entry <c>i</c> of one describes entry <c>i</c> of the other.
/// </summary>
private class ProteinPeptideMatches
{
    // Parallel lists; both are only ever appended to, and always together (see AddPeptideSequence).
    private readonly List<string> _sequences = new List<string>();
    private readonly List<bool> _digestFlags = new List<bool>();

    /// <summary>
    /// Creates an empty set of matches for the given protein record.
    /// </summary>
    /// <param name="proteinRecord">The protein record these peptide sequences belong to.</param>
    public ProteinPeptideMatches(IProteinRecord proteinRecord)
    {
        ProteinRecord = proteinRecord;
    }

    /// <summary>
    /// The protein record supplied to the constructor.
    /// </summary>
    public IProteinRecord ProteinRecord { get; }

    /// <summary>
    /// Number of peptide sequences that have been added so far.
    /// </summary>
    public int PeptideSequenceCount => _sequences.Count;

    /// <summary>
    /// Returns the peptide sequence stored at position <paramref name="i"/>.
    /// </summary>
    /// <param name="i">Zero-based index in insertion order.</param>
    public string GetPeptideSequence(int i) => _sequences[i];

    /// <summary>
    /// Returns the digest-settings flag that was stored alongside the sequence at position <paramref name="i"/>.
    /// </summary>
    /// <param name="i">Zero-based index in insertion order.</param>
    public bool MatchesDigestSettings(int i) => _digestFlags[i];

    /// <summary>
    /// Appends a peptide sequence and its digest-settings flag.
    /// </summary>
    /// <param name="sequence">The peptide sequence to record.</param>
    /// <param name="matchesDigestSettings">Flag to store for this sequence, as decided by the caller.</param>
    public void AddPeptideSequence(string sequence, bool matchesDigestSettings)
    {
        _sequences.Add(sequence);
        _digestFlags.Add(matchesDigestSettings);
    }

    /// <summary>
    /// The stored peptide sequences whose digest-settings flag is true, in insertion order.
    /// The index range is fixed when the property is read; the filtering itself is deferred
    /// until the returned sequence is enumerated.
    /// </summary>
    public IEnumerable<string> PeptidesMatchingDigestSettings
    {
        get
        {
            return from index in Enumerable.Range(0, _sequences.Count)
                   where _digestFlags[index]
                   select _sequences[index];
        }
    }
}

[XmlRoot("protein_association")]
public class ParsimonySettings : Immutable, IXmlSerializable, IValidating
{
Expand Down Expand Up @@ -1277,3 +1356,4 @@ public object GetDefaultObject(ObjectInfo<object> info)
}
}
}