diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs
index c3c4ef9e2..3b5ef13f0 100644
--- a/mzLib/Omics/BioPolymer/SequenceVariation.cs
+++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs
@@ -4,69 +4,216 @@
namespace Omics.BioPolymer
{
+ ///
+ /// Represents a contiguous sequence variation on a 1-based, inclusive coordinate system.
+ /// A variation spans .. in the parent sequence,
+ /// replaces the (which may be empty for an insertion) with
+ /// (which may be empty for a deletion), and can optionally carry site-specific .
+ /// When available, a parsed is attached via .
+ ///
+ /// Typical interpretations:
+ /// - Substitution: non-empty and non-empty of equal length.
+ /// - Insertion: empty and non-empty .
+ /// - Deletion: non-empty and empty .
+ ///
public class SequenceVariation
{
///
- /// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions.
+ /// Create a variation with an explicit VCF object.
///
- ///
- ///
- ///
- ///
- ///
+ ///
+ /// 1-based, inclusive start position in the parent sequence where the variation begins.
+ /// Must be >= 1. See for validity conditions.
+ ///
+ ///
+ /// 1-based, inclusive end position in the parent sequence where the variation ends.
+ /// Must satisfy oneBasedEndPosition >= oneBasedBeginPosition.
+ ///
+ ///
+ /// Reference subsequence being replaced. Null is coerced to an empty string.
+ /// Empty string typically indicates an insertion at .
+ ///
+ ///
+ /// Alternate subsequence to insert in place of .
+ /// Null is coerced to an empty string. Empty string typically indicates a deletion.
+ ///
+ ///
+ /// Free-form description of the variation. Often the original VCF line or human-readable note.
+ ///
+ ///
+ /// Parsed VCF wrapper for the originating record. Used for downstream analysis of genotype/allele metadata.
+ ///
+ ///
+ /// Optional mapping from absolute 1-based residue positions to one or more objects
+ /// applied at that position (in the same coordinate system as /).
+ /// If null, an empty dictionary is created.
+ ///
+ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, VariantCallFormat variantCallFormat, Dictionary>? oneBasedModifications = null)
+ {
+ OneBasedBeginPosition = oneBasedBeginPosition;
+ OneBasedEndPosition = oneBasedEndPosition;
+ OriginalSequence = originalSequence ?? "";
+ VariantSequence = variantSequence ?? "";
+ Description = description;
+ VariantCallFormatDataString = variantCallFormat;
+ OneBasedModifications = oneBasedModifications ?? new Dictionary>();
+ }
+
+ ///
+ /// Create a variation by providing a raw VCF line (string representation) which will be parsed into a .
+ ///
+ ///
+ /// 1-based, inclusive start position in the parent sequence where the variation begins.
+ /// Must be >= 1. See for validity conditions.
+ ///
+ ///
+ /// 1-based, inclusive end position in the parent sequence where the variation ends.
+ /// Must satisfy oneBasedEndPosition >= oneBasedBeginPosition.
+ ///
+ ///
+ /// Reference subsequence being replaced. Null is coerced to an empty string.
+ /// Empty string typically indicates an insertion at .
+ ///
+ ///
+ /// Alternate subsequence to insert in place of .
+ /// Null is coerced to an empty string. Empty string typically indicates a deletion.
+ ///
+ ///
+ /// Free-form description of the variation. Often the original VCF line or human-readable note.
+ ///
+ ///
+ /// Raw VCF record (a single, tab-delimited line). It is parsed into .
+ ///
+ ///
+ /// Optional mapping from absolute 1-based residue positions to one or more objects
+ /// applied at that position (in the same coordinate system as /).
+ /// If null, an empty dictionary is created.
+ ///
+ public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string variantCallFormatStringRepresentation, Dictionary>? oneBasedModifications = null)
+ {
+ OneBasedBeginPosition = oneBasedBeginPosition;
+ OneBasedEndPosition = oneBasedEndPosition;
+ OriginalSequence = originalSequence ?? "";
+ VariantSequence = variantSequence ?? "";
+ Description = description;
+ VariantCallFormatDataString = new VariantCallFormat(variantCallFormatStringRepresentation);
+ OneBasedModifications = oneBasedModifications ?? new Dictionary>();
+ }
+
+ ///
+ /// Create a variation without a separate VCF string; a is still constructed
+ /// from to maintain a non-null object for tests and downstream consumers.
+ ///
+ ///
+ /// 1-based, inclusive start position in the parent sequence where the variation begins.
+ /// Must be >= 1. See for validity conditions.
+ ///
+ ///
+ /// 1-based, inclusive end position in the parent sequence where the variation ends.
+ /// Must satisfy oneBasedEndPosition >= oneBasedBeginPosition.
+ ///
+ ///
+ /// Reference subsequence being replaced. Null is coerced to an empty string.
+ /// Empty string typically indicates an insertion at .
+ ///
+ ///
+ /// Alternate subsequence to insert in place of .
+ /// Null is coerced to an empty string. Empty string typically indicates a deletion.
+ ///
+ ///
+ /// Free-form description of the variation. Also used to initialize .
+ ///
+ ///
+ /// Optional mapping from absolute 1-based residue positions to one or more objects
+ /// applied at that position (in the same coordinate system as /).
+ /// If null, an empty dictionary is created.
+ ///
public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null)
{
OneBasedBeginPosition = oneBasedBeginPosition;
OneBasedEndPosition = oneBasedEndPosition;
OriginalSequence = originalSequence ?? "";
VariantSequence = variantSequence ?? "";
- Description = new VariantCallFormat(description);
+ Description = description;
+ // Always construct a VariantCallFormat so tests relying on non-null VCF objects pass.
+ VariantCallFormatDataString = new VariantCallFormat(description);
OneBasedModifications = oneBasedModifications ?? new Dictionary>();
}
///
- /// For variations with only position information (not begin and end).
- /// Sets the end to the end of the original protein sequence to which this variation applies.
+ /// Convenience constructor for single-position variations. The end position is inferred from
+ /// and the length of :
+ /// end = position + length(originalSequence) - 1 (or end = position if is null).
///
- ///
- ///
- ///
- ///
- ///
+ ///
+ /// 1-based, inclusive position of the variation start. Used to infer .
+ ///
+ ///
+ /// Reference subsequence being replaced. If null, treated as empty for end-position inference.
+ ///
+ ///
+ /// Alternate subsequence to insert in place of . May be empty for deletions.
+ ///
+ ///
+ /// Free-form description of the variation. Also used to initialize .
+ ///
+ ///
+ /// Optional mapping from absolute 1-based residue positions to one or more objects
+ /// applied at that position. If null, an empty dictionary is created.
+ ///
public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null)
: this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications)
{ }
///
- /// Beginning position of original sequence to be replaced
+ /// 1-based, inclusive start position of this variation within the parent sequence.
///
public int OneBasedBeginPosition { get; }
///
- /// End position of original sequence to be replaced
+ /// 1-based, inclusive end position of this variation within the parent sequence.
+ /// For single-site variations with non-null , this is
+ /// OneBasedBeginPosition + OriginalSequence.Length - 1.
///
public int OneBasedEndPosition { get; }
///
- /// Original sequence information (optional)
+ /// The reference subsequence replaced by this variation. Empty string implies an insertion.
///
public string OriginalSequence { get; }
///
- /// Variant sequence information (required)
+ /// The alternate subsequence inserted by this variation. Empty string implies a deletion.
///
public string VariantSequence { get; }
///
- /// Description of this variation (optional)
+ /// Free-form description of the variation. Often the raw VCF line or a human-readable summary.
///
- public VariantCallFormat Description { get; }
+ public string Description { get; }
///
- /// Modifications specifically for this variant
+ /// Optional parsed VCF wrapper providing structured access to the originating VCF record.
+ /// May be null in some construction paths; in the provided constructors it is initialized.
+ ///
+ public VariantCallFormat? VariantCallFormatDataString { get; }
+
+ ///
+ /// Mapping from absolute 1-based residue positions to a list of objects
+ /// to apply at each position. Never null; defaults to an empty dictionary.
///
public Dictionary> OneBasedModifications { get; }
+ ///
+ /// Determines value equality with another object.
+ /// Two objects are equal when:
+ /// - Begin and end positions are equal
+ /// - Original and variant sequences are equal (nulls treated as equal only if both are null)
+ /// - are both null or equal
+ /// - have identical key sets and identical flattened modification lists (sequence-equal)
+ ///
+ /// Object to compare against.
+ /// True if equal by the criteria above; otherwise false.
public override bool Equals(object obj)
{
SequenceVariation s = obj as SequenceVariation;
@@ -75,90 +222,95 @@ public override bool Equals(object obj)
&& OneBasedEndPosition == s.OneBasedEndPosition
&& (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence))
&& (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence))
- && (s.Description == null && Description == null || Description.Equals(s.Description))
+ && ((s.VariantCallFormatDataString == null && VariantCallFormatDataString == null)
+ || (VariantCallFormatDataString != null && VariantCallFormatDataString.Equals(s.VariantCallFormatDataString)))
&& (s.OneBasedModifications == null && OneBasedModifications == null ||
s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList())
&& s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList()));
}
+ ///
+ /// Computes a hash code from begin/end positions, sequences, and the VCF wrapper (if present).
+ ///
+ /// A hash code suitable for hash-based collections.
public override int GetHashCode()
{
return OneBasedBeginPosition.GetHashCode()
^ OneBasedEndPosition.GetHashCode()
- ^ OriginalSequence.GetHashCode() // null handled in constructor
- ^ VariantSequence.GetHashCode() // null handled in constructor
- ^ Description.GetHashCode(); // always constructed in constructor
+ ^ OriginalSequence.GetHashCode()
+ ^ VariantSequence.GetHashCode()
+ ^ (VariantCallFormatDataString?.GetHashCode() ?? 0);
}
///
- /// Returns a simple string represantation of this amino acid change
+ /// Produces a compact, human-readable representation: {OriginalSequence}{OneBasedBeginPosition}{VariantSequence}.
+ /// Example: substitution A->T at position 12 yields "A12T".
///
- ///
+ /// The compact representation string.
public string SimpleString()
{
return OriginalSequence + OneBasedBeginPosition.ToString() + VariantSequence;
}
///
- /// Determines whether this interval overlaps the queried interval
+ /// Tests whether the current variation intersects (overlaps) another variation in coordinate space.
+ /// Intersection is inclusive: any shared position in the 1-based, inclusive ranges is considered overlap.
///
- ///
- ///
+ /// The other to test.
+ /// True if the ranges overlap; otherwise false.
internal bool Intersects(SequenceVariation segment)
{
return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition;
}
///
- /// Determines whether this interval overlaps the queried interval
+ /// Tests whether the current variation intersects (overlaps) a truncation product range.
+ /// Intersection is inclusive: any shared position in the 1-based, inclusive ranges is considered overlap.
///
- ///
- ///
+ /// The segment to test.
+ /// True if the ranges overlap; otherwise false.
internal bool Intersects(TruncationProduct segment)
{
return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition;
}
///
- /// Determines whether this interval overlaps the queried position
+ /// Tests whether the current variation intersects a single 1-based position.
///
- ///
- ///
+ /// A 1-based, inclusive position in the parent sequence.
+ /// True if lies within the variation’s range; otherwise false.
internal bool Intersects(int pos)
{
return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition;
}
///
- /// Determines whether this interval includes the queried interval
+ /// Tests whether the current variation fully includes another variation’s range.
+ /// Inclusion is inclusive on both ends.
///
- ///
- ///
+ /// The other to test.
+ /// True if the current range fully contains ; otherwise false.
internal bool Includes(SequenceVariation segment)
{
return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition;
}
- // Commented out by AVC on 4/5/23. Unused and untested in current code base,
- // but can't rule out that it could be useful in the future.
- ///
- /// Determines whether this interval includes the queried interval
- ///
- ///
- ///
- // internal bool Includes(TruncationProduct segment)
- // {
- // return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition;
- // }
///
- /// Determines whether this interval overlaps the queried position
+ /// Tests whether the current variation includes a single 1-based position.
+ /// Inclusion is inclusive on both ends.
///
- ///
- ///
+ /// A 1-based, inclusive position in the parent sequence.
+ /// True if lies within the variation’s range; otherwise false.
internal bool Includes(int pos)
{
return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition;
}
+
+ ///
+ /// Validates positional consistency: begin must be > 0 and end must be >= begin.
+ /// This does not validate string/length consistency between and .
+ ///
+ /// True if positions are valid; otherwise false.
public bool AreValid()
{
return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition;
diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs
index 1669c3afe..bd735b7c9 100644
--- a/mzLib/Omics/BioPolymer/VariantApplication.cs
+++ b/mzLib/Omics/BioPolymer/VariantApplication.cs
@@ -1,42 +1,65 @@
using MzLibUtil;
-using Omics.BioPolymer;
using Omics.Modifications;
namespace Omics.BioPolymer
{
///
- /// Provides methods for applying sequence variations to proteins and handling modifications on variant sequences.
+ /// Utilities for constructing and applying sequence variants to biopolymers (proteins/RNA) in a 1-based, inclusive coordinate system.
+ /// This includes:
+ /// - Expanding a base biopolymer into concrete variant instances (genotype-aware or combinatorial).
+ /// - Renaming/accession tagging based on applied variants.
+ /// - Re-mapping indices for truncation products and localized modifications after edits (insertions/deletions/substitutions).
+ /// - Converting certain annotation-style modifications (e.g., nucleotide substitution markers) into true sequence variants.
///
- ///
- /// Originally by A. Cesnik on 11/2/18, updated on 4/25/23. NB moved it and generalized for use in Transcriptomics on 3/25/25.
- ///
public static class VariantApplication
{
///
- /// Creates a list of IBioPolymers of the same type as the original protein, each with applied variants from this protein.
+ /// Expand a base biopolymer into concrete consensus and variant isoforms by materializing substitution-style annotations
+ /// and either applying a bounded combinatorial expansion when genotype data is absent or performing genotype- and
+ /// allele-depth-aware variant application. The parameter limits the
+ /// number of isoforms returned (including the consensus); sets the minimum AD required
+ /// to consider an allele; is reserved for per-isoform combinatoric caps.
///
- /// Type of BioPolymer to create variants of
- /// original to generate variants of
- ///
- ///
- /// This replaces a method call that was previously an instance method in Protein
- public static List GetVariantBioPolymers(this TBioPolymerType protein, int maxAllowedVariantsForCombinatorics = 4, int minAlleleDepth = 1)
+ public static List GetConsensusAndVariantBioPolymers(this TBioPolymerType protein, int consensusPlusVariantIsoforms = 1, int minAlleleDepth = 0, int maxVariantsPerIsoform = 0)
where TBioPolymerType : IHasSequenceVariants
{
+ // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance
protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
- if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.Description == null || v.Description.Genotypes.Count == 0))
+
+ // If all variants are positionally valid and any is missing VCF/genotype data, fall back to bounded combinatorial application
+ if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0))
{
- // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines
- return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList();
+ return ApplyAllVariantCombinations(protein, protein.SequenceVariations, consensusPlusVariantIsoforms).ToList();
}
- // this is a protein with only VCF lines
- return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics, minAlleleDepth);
+
+ // Otherwise, do genotype/allele-depth-aware application with combinatorics limited for heterozygous sites
+ return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: consensusPlusVariantIsoforms, minAlleleDepth);
+ }
+
+ ///
+ /// LEGACY: Expand a base biopolymer into consensus and variant isoforms by materializing substitution-style annotations
+ /// and then either performing a bounded combinatorial expansion when genotype data is absent or running a
+ /// genotype- and allele-depth-aware variant application. The
+ /// parameter caps the total isoforms returned (including the consensus). This method is retained for backward
+ /// compatibility and may be removed in a future release — prefer
+ /// or newer variant-expansion APIs.
+ ///
+ public static List GetVariantBioPolymers(this TBioPolymerType protein, int maxAllowedVariantsForCombinatorics = 4, int minAlleleDepth = 1)
+ where TBioPolymerType : IHasSequenceVariants
+ {
+ return GetConsensusAndVariantBioPolymers(protein, consensusPlusVariantIsoforms: maxAllowedVariantsForCombinatorics, minAlleleDepth: minAlleleDepth, maxVariantsPerIsoform: 0);
}
///
- /// Gets the name of a protein with applied variations
+ /// Produces a name with an appended variant tag built from applied variation descriptions.
+ /// If both and are effectively empty, returns null.
///
+ /// Base name (e.g., protein name). May be null.
+ /// Variations applied to the instance. If null/empty, no variant tag is appended.
+ ///
+ /// The base plus " variant:{...}" when variations exist; null if both inputs are empty.
+ ///
public static string? GetVariantName(string? name, IEnumerable? appliedVariations)
{
bool emptyVars = appliedVariations.IsNullOrEmpty();
@@ -48,8 +71,13 @@ public static List GetVariantBioPolymers(this
}
///
- /// Gets the accession for a protein with applied variations
+ /// Constructs a variant-aware accession by appending a compact variant string to the consensus accession.
///
+ /// The variant-capable biopolymer.
+ /// Applied variations to encode into the suffix; may be null/empty.
+ ///
+ /// The consensus accession or consensus accession with "_{SimpleString}_..." suffix if variations are present.
+ ///
public static string GetAccession(IHasSequenceVariants protein, IEnumerable? appliedSequenceVariations)
{
return protein.ConsensusVariant.Accession +
@@ -57,33 +85,23 @@ public static string GetAccession(IHasSequenceVariants protein, IEnumerable
- /// Determines if the modification falls on a variant amino acid
+ /// Determines if a specific 1-based position in the variant biopolymer lies within a particular variation's range.
///
- /// true if a modification index on the protein falls within the applied variant
- ///
- /// A. Cesnik - 4/25/23
- /// Variants annotated in protein entries can be applied to a sequence, i.e. a change is made to the sequence.
- /// One of the things Spritz can do that no other tool can do is enable finding modifications on these sites of variation,
- /// since I amended the sequence variant XML entries to have modifications.
- ///
+ /// The variation of interest; may be null.
+ /// 1-based position in the current (possibly already-edited) variant sequence.
+ /// True if the position is included by the variation; otherwise false.
public static bool IsSequenceVariantModification(SequenceVariation? appliedVariant, int variantProteinIndex)
{
return appliedVariant != null && appliedVariant.Includes(variantProteinIndex);
}
///
- /// Restores modification index on a variant protein to the index on the nonvariant protein,
- /// or if it falls on a variant, this restores the position on the protein with only that variant
+ /// Maps a modification index from the edited (variant) sequence back to the original consensus index by subtracting the
+ /// net length changes of all applied variations that ended before the queried position.
///
- /// Protein containing applied sequence variations
- /// The one-based index of the amino acid residue bearing a modification
- ///
- ///
- /// A. Cesnik - 4/25/23
- /// Useful for comparing modification indices on variant proteins to the original protein.
- /// Variations can introduce length changes and other changes to the sequence,
- /// so the indices of the modifications aren’t directly comparable, but this method makes that possible.
- ///
+ /// The variant-capable biopolymer containing applied variations.
+ /// 1-based index in the variant sequence.
+ /// The corresponding 1-based index in the original consensus sequence.
public static int RestoreModificationIndex(IHasSequenceVariants protein, int variantProteinModificationIndex)
{
return variantProteinModificationIndex - protein.AppliedSequenceVariations
@@ -92,57 +110,73 @@ public static int RestoreModificationIndex(IHasSequenceVariants protein, int var
}
///
- /// Applies multiple variant changes to a protein sequence
+ /// Applies a set of sequence variations in a genotype- and allele-depth-aware fashion for all individuals found in the VCF payloads.
+ /// Heterozygous sites can produce combinatorial branches up to per individual.
+ /// Results are deduplicated by final base sequence.
///
+ /// A biopolymer type that supports sequence variants.
+ /// The base biopolymer to which variations will be applied.
+ /// Candidate variations. Duplicates by effect are collapsed using .
+ ///
+ /// Upper cap for heterozygous combinatorial branching. If an individual has more heterozygous variants than this number,
+ /// the algorithm limits branching to control explosion.
+ ///
+ /// Minimum AD (Allele Depth) per sample for an allele to be considered in application.
+ /// A list of concrete variant biopolymers across all individuals encoded in the VCF payloads.
public static List ApplyVariants(TBioPolymerType protein, IEnumerable sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth)
where TBioPolymerType : IHasSequenceVariants
{
+ // Remove duplicate effects (by SimpleString), require variants with genotype data, apply from higher to lower positions
List uniqueEffectsToApply = sequenceVariations
.GroupBy(v => v.SimpleString())
.Select(x => x.First())
- .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line
- .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first
+ .Where(v => v.VariantCallFormatDataString.Genotypes.Count > 0)
+ .OrderByDescending(v => v.OneBasedBeginPosition)
.ToList();
+ // A shallow "base" variant to branch from (no applied variants yet)
TBioPolymerType proteinCopy = protein.CreateVariant(protein.BaseSequence, protein, null, protein.TruncationProducts, protein.OneBasedPossibleLocalizedModifications, null);
- // If there aren't any variants to apply, just return the base protein
if (uniqueEffectsToApply.Count == 0)
{
return new List { proteinCopy };
}
- HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys));
+ // All per-sample identifiers present in the VCF objects
+ HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.VariantCallFormatDataString.Genotypes.Keys));
List variantProteins = new();
List newVariantProteins = new();
- // loop through genotypes for each sample/individual (e.g. tumor and normal)
+
foreach (string individual in individuals)
{
newVariantProteins.Clear();
newVariantProteins.Add(proteinCopy);
- bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
+ // Whether to limit combinatorial branching for this individual
+ bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.VariantCallFormatDataString.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics;
+
foreach (var variant in uniqueEffectsToApply)
{
- bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff
+ // Only proceed if the individual's genotype references this variant's alternate allele index
+ bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString());
if (!variantAlleleIsInTheGenotype)
{
continue;
}
- bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now.
- bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
- bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
- // homozygous alternate
+ // Zygosity and depth checks for this individual
+ bool isHomozygousAlternate = variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString());
+ bool isDeepReferenceAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth;
+ bool isDeepAlternateAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][variant.VariantCallFormatDataString.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth;
+
if (isHomozygousAlternate && isDeepAlternateAllele)
{
+ // Deterministic application: all branches take the alt allele
newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList();
}
-
- // heterozygous basic
- // first protein with variants contains all homozygous variation, second contains all variations
- else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants)
+ else if (variant.VariantCallFormatDataString.Heterozygous[individual] && tooManyHeterozygousVariants)
{
+ // Limit branching: either keep ref, take alt, or update second branch if already present
if (isDeepAlternateAllele && isDeepReferenceAllele)
{
if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinitorics > 0)
@@ -154,51 +188,35 @@ public static List ApplyVariants(TBioPolymerTy
{
newVariantProteins[1] = ApplySingleVariant(variant, newVariantProteins[1], individual);
}
- else
- {
- // no heterozygous variants
- }
}
else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0)
{
newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList();
}
- else
- {
- // keep reference only
- }
}
-
- // heterozygous combinitorics
- else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
+ else if (variant.VariantCallFormatDataString.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants)
{
+ // Full combinatorics (bounded) for heterozygous case: keep ref and/or take alt depending on depths
List combinitoricProteins = new();
foreach (var ppp in newVariantProteins)
{
if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele)
{
- // keep reference allele
- if (variant.Description.Genotypes[individual].Contains("0"))
+ if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0"))
{
- combinitoricProteins.Add(ppp);
+ combinitoricProteins.Add(ppp); // keep reference branch
}
-
- // alternate allele (replace all, since in heterozygous with two alternates, both alternates are included)
- combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
+ combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); // alternate branch
}
else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0)
{
combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual));
}
- else if (variant.Description.Genotypes[individual].Contains("0"))
+ else if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0"))
{
combinitoricProteins.Add(ppp);
}
- else
- {
- // must be two alternate alleles with not enough depth
- }
}
newVariantProteins = combinitoricProteins;
}
@@ -206,49 +224,79 @@ public static List ApplyVariants(TBioPolymerTy
variantProteins.AddRange(newVariantProteins);
}
+ // De-duplicate by final sequence to avoid identical variants from different branches
return variantProteins.GroupBy(x => x.BaseSequence).Select(x => x.First()).ToList();
}
///
- /// Applies a single variant to a protein sequence
+ /// Applies a single to a biopolymer and returns the updated instance with
+ /// sequence, truncation products, localized modifications, and applied-variation indices all re-mapped.
///
+ /// A biopolymer type that supports sequence variants.
+ /// Variation to apply (coordinates refer to the current protein's sequence).
+ /// Source biopolymer to mutate.
+ /// Sample identifier used to annotate the created variant (may be empty).
+ /// A new variant instance created via .
private static TBioPolymerType ApplySingleVariant(SequenceVariation variantGettingApplied, TBioPolymerType protein, string individual)
where TBioPolymerType : IHasSequenceVariants
{
+ // Sequence prefix before the edit region
string seqBefore = protein.BaseSequence.Substring(0, variantGettingApplied.OneBasedBeginPosition - 1);
+ // The replacement (alternate) sequence
string seqVariant = variantGettingApplied.VariantSequence;
+ // First index in the original sequence after the edited region
int afterIdx = variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.OriginalSequence.Length - 1;
- SequenceVariation variantAfterApplication = new SequenceVariation(
- variantGettingApplied.OneBasedBeginPosition,
- variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
- variantGettingApplied.OriginalSequence,
- variantGettingApplied.VariantSequence,
- variantGettingApplied.Description.Description,
- variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
-
- // check to see if there is incomplete indel overlap, which would lead to weird variant sequences
- // complete overlap is okay, since it will be overwritten; this can happen if there are two alternate alleles,
- // e.g. reference sequence is wrong at that point
- bool intersectsAppliedRegionIncompletely = protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x));
+ // Reify a "post-application" variation object pinned to the inserted length
+ SequenceVariation variantAfterApplication;
+ var vcf = variantGettingApplied.VariantCallFormatDataString;
+ if (vcf != null)
+ {
+ variantAfterApplication = new SequenceVariation(
+ variantGettingApplied.OneBasedBeginPosition,
+ variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
+ variantGettingApplied.OriginalSequence,
+ variantGettingApplied.VariantSequence,
+ vcf.Description,
+ vcf,
+ variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
+ }
+ else
+ {
+ variantAfterApplication = new SequenceVariation(
+ variantGettingApplied.OneBasedBeginPosition,
+ variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1,
+ variantGettingApplied.OriginalSequence,
+ variantGettingApplied.VariantSequence,
+ variantGettingApplied.Description,
+ variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value));
+ }
+
+ // If an already-applied variation partially overlaps the current edit, use the consensus tail to avoid index corruption
+ bool intersectsAppliedRegionIncompletely =
+ protein.AppliedSequenceVariations != null
+ && protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x));
+
IEnumerable appliedVariations = new[] { variantAfterApplication };
- string seqAfter = null;
+ string seqAfter;
if (intersectsAppliedRegionIncompletely)
{
- // use original protein sequence for the remaining sequence
+ // Tail from consensus (not the possibly already-mutated BaseSequence)
seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.ConsensusVariant.BaseSequence.Substring(afterIdx);
}
else
{
- // use this variant protein sequence for the remaining sequence
+ // Tail from the current BaseSequence; keep any previously applied variations that are not fully contained by the current edit
seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.BaseSequence.Substring(afterIdx);
appliedVariations = appliedVariations
- .Concat(protein.AppliedSequenceVariations.Where(x => !variantGettingApplied.Includes(x)))
+ .Concat((protein.AppliedSequenceVariations ?? Enumerable.Empty()).Where(x => !variantGettingApplied.Includes(x)))
.ToList();
}
- string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0]; // there may be a stop gained
- // adjust indices
+ // Clip at stop (*) if any
+ string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0];
+
+ // Re-map dependent structures after the sequence length change
List adjustedProteolysisProducts = AdjustTruncationProductIndices(variantGettingApplied, variantSequence, protein, protein.TruncationProducts);
Dictionary> adjustedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein);
List adjustedAppliedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations);
@@ -257,90 +305,119 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria
}
///
- /// Adjusts the indices of sequence variations due to applying a single additional variant
+ /// Adjusts (re-bases) the coordinates of already-applied variations after applying a new variation.
+ /// Handles overlaps by trimming and shifts by adding the net length change.
///
+ /// The newly applied variation causing coordinate shifts.
+ /// The updated sequence after the application (used to clamp ends).
+ /// Variations that were previously applied (may be null).
+ /// A new list of variations with updated coordinates, filtered for validity.
private static List AdjustSequenceVariationIndices(SequenceVariation variantGettingApplied, string variantAppliedProteinSequence, IEnumerable alreadyAppliedVariations)
{
List variations = new List();
if (alreadyAppliedVariations == null) { return variations; }
+
foreach (SequenceVariation v in alreadyAppliedVariations)
{
+ // Net length already introduced before the start of v
int addedIdx = alreadyAppliedVariations
.Where(applied => applied.OneBasedEndPosition < v.OneBasedBeginPosition)
.Sum(applied => applied.VariantSequence.Length - applied.OriginalSequence.Length);
- // variant was entirely before the one being applied (shouldn't happen because of order of applying variants)
- // or it's the current variation
- if (v.Description.Equals(variantGettingApplied.Description) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
+ // Either same VCF payload (null-safe) or fully before the new application region after compensating for prior shifts
+ if ((v.VariantCallFormatDataString != null && v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString))
+ || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition)
{
variations.Add(v);
}
-
- // adjust indices based on new included sequence, minding possible overlaps to be filtered later
else
{
+ // Compute overlap with the newly applied edit and shift by the net length change
int intersectOneBasedStart = Math.Max(variantGettingApplied.OneBasedBeginPosition, v.OneBasedBeginPosition);
int intersectOneBasedEnd = Math.Min(variantGettingApplied.OneBasedEndPosition, v.OneBasedEndPosition);
- int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 : // no overlap
- intersectOneBasedEnd - intersectOneBasedStart + 1; // there's some overlap
+ int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 :
+ intersectOneBasedEnd - intersectOneBasedStart + 1;
+
int sequenceLengthChange = variantGettingApplied.VariantSequence.Length - variantGettingApplied.OriginalSequence.Length;
+
int begin = v.OneBasedBeginPosition + sequenceLengthChange - overlap;
if (begin > variantAppliedProteinSequence.Length)
{
- continue; // cut out by a stop gain
+ continue;
}
+
int end = v.OneBasedEndPosition + sequenceLengthChange - overlap;
if (end > variantAppliedProteinSequence.Length)
{
- end = variantAppliedProteinSequence.Length; // end shortened by a stop gain
+ end = variantAppliedProteinSequence.Length;
+ }
+
+ var vcf = v.VariantCallFormatDataString;
+ if (vcf != null)
+ {
+ variations.Add(new SequenceVariation(
+ begin,
+ end,
+ v.OriginalSequence,
+ v.VariantSequence,
+ vcf.Description,
+ vcf,
+ v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
+ }
+ else
+ {
+ variations.Add(new SequenceVariation(
+ begin,
+ end,
+ v.OriginalSequence,
+ v.VariantSequence,
+ v.Description,
+ v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
}
- variations.Add(new SequenceVariation(
- begin,
- end,
- v.OriginalSequence,
- v.VariantSequence,
- v.Description.Description,
- v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)));
}
}
return variations;
}
///
- /// Eliminates proteolysis products that overlap sequence variations.
- /// Since frameshift indels are written across the remaining sequence,
- /// this eliminates proteolysis products that conflict with large deletions and other structural variations.
+ /// Re-bases proteolysis/truncation product ranges after applying a sequence variation.
+ /// Shifts segments to the right if the edit occurs before them and expands/contracts segments that span the edit.
+ /// If a stop (*) is introduced by the variant, downstream products are clamped to the new sequence length.
///
+ /// The applied variation (source of positional shifts and length change).
+ /// The updated sequence after application.
+ /// The source biopolymer (used for consensus length in certain boundary checks).
+ /// Existing truncation products to be re-based; may be null.
+ /// Updated list of truncation products within the new coordinate system.
private static List AdjustTruncationProductIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein, IEnumerable proteolysisProducts)
{
List products = new List();
if (proteolysisProducts == null) { return products; }
+
int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length;
+
foreach (TruncationProduct p in proteolysisProducts.Where(p => p.OneBasedEndPosition.HasValue && p.OneBasedBeginPosition.HasValue))
{
- // proteolysis product is entirely before the variant
+ // Entirely before the edit: unchanged
if (variant.OneBasedBeginPosition > p.OneBasedEndPosition)
{
products.Add(p);
}
- // proteolysis product straddles the variant, but the cleavage site(s) are still intact; the ends aren't considered cleavage sites
+ // Segment spans the edit or is clamped at boundaries: extend/contract or clamp to stop
else if ((p.OneBasedBeginPosition < variant.OneBasedBeginPosition || p.OneBasedBeginPosition == 1 || p.OneBasedBeginPosition == 2)
&& (p.OneBasedEndPosition > variant.OneBasedEndPosition || p.OneBasedEndPosition == protein.ConsensusVariant.BaseSequence.Length))
{
if (variant.VariantSequence.EndsWith("*"))
{
+ // Introduced stop codon/terminator: clamp to the new sequence length
products.Add(new TruncationProduct(p.OneBasedBeginPosition, variantAppliedProteinSequence.Length, p.Type));
}
else if (p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length)
{
products.Add(new TruncationProduct(p.OneBasedBeginPosition, p.OneBasedEndPosition + sequenceLengthChange, p.Type));
}
- else
- {
- // cleavage site is not intact
- }
}
- // proteolysis product is after the variant and there is no stop gain
+ // Entirely after the edit: shift right/left by the net length change, if still within bounds and not terminated
else if (p.OneBasedBeginPosition > variant.OneBasedEndPosition
&& p.OneBasedBeginPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length
&& p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length
@@ -348,52 +425,49 @@ private static List AdjustTruncationProductIndices(SequenceVa
{
products.Add(new TruncationProduct(p.OneBasedBeginPosition + sequenceLengthChange, p.OneBasedEndPosition + sequenceLengthChange, p.Type));
}
- else // sequence variant conflicts with proteolysis cleavage site (cleavage site was lost)
- {
- continue;
- }
}
return products;
}
///
- /// Adjusts modification indices.
+ /// Produces a new localized modification dictionary re-based to the post-variation coordinate system and merged with
+ /// any one-based modifications carried by the applied variant itself.
///
+ /// The variation causing index shifts.
+ /// The updated sequence after application (for bounds checks).
+ /// The source biopolymer providing the original modifications map.
+ /// A new dictionary of one-based modification lists keyed by position in the updated sequence.
private static Dictionary> AdjustModificationIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein)
{
+ // Original per-position modifications (pre-application)
IDictionary> modificationDictionary = protein.OneBasedPossibleLocalizedModifications;
+ // Modifications contributed by the variant itself (coordinated in the same one-based space)
IDictionary> variantModificationDictionary = variant.OneBasedModifications;
+
Dictionary> mods = new Dictionary>();
int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length;
- // change modification indices for variant sequence
+ // Re-base original modifications
if (modificationDictionary != null)
{
foreach (KeyValuePair> kv in modificationDictionary)
{
if (kv.Key > variantAppliedProteinSequence.Length)
{
- continue; // it was cut out by a stop gain
+ continue; // drop if beyond new end
}
- // mod is before the variant
else if (kv.Key < variant.OneBasedBeginPosition)
{
- mods.Add(kv.Key, kv.Value);
+ mods.Add(kv.Key, kv.Value); // unaffected positions
}
- // mod is after the variant and not affected by a stop gain
else if (variant.OneBasedEndPosition < kv.Key && kv.Key + sequenceLengthChange <= variantAppliedProteinSequence.Length)
{
- mods.Add(kv.Key + sequenceLengthChange, kv.Value);
- }
- else // sequence variant conflicts with modification site (modification site substitution)
- {
- continue;
+ mods.Add(kv.Key + sequenceLengthChange, kv.Value); // shift after the edit
}
}
}
- // sequence variant modifications are indexed to the variant sequence
- // NOTE: this code assumes variants are added from end to beginning of protein, so that previously added variant mods are adjusted above
+ // Merge-in variant-borne modifications (may share positions)
if (variantModificationDictionary != null)
{
foreach (var kv in variantModificationDictionary)
@@ -413,51 +487,54 @@ private static Dictionary> AdjustModificationIndices(Seq
}
///
- /// Format string to append to accession
+ /// Concatenates the compact representations () of the provided variations with underscores.
///
+ /// Variations to stringify; may be null/empty.
+ /// An underscore-joined string or empty string if none.
private static string CombineSimpleStrings(IEnumerable? variations)
{
return variations.IsNullOrEmpty() ? "" : string.Join("_", variations.Select(v => v.SimpleString()));
}
///
- /// Format string to append to protein names
+ /// Concatenates human-readable descriptions for the provided variations.
+ /// Prefers VCF description when available, otherwise falls back to the variation's own description.
///
+ /// Variations to describe; may be null/empty.
+ /// A comma-delimited description string or empty string if none.
public static string CombineDescriptions(IEnumerable? variations)
{
- return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.Description));
+ return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString?.Description ?? d.Description));
}
+
///
- /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object,
- /// starting with the fewest single variations and up to the specified maximum number of combinations.
+ /// Applies all combinations of the provided variations to the base biopolymer up to a maximum number of yielded results.
+ /// The base (no-variant) biopolymer is yielded first, followed by combinations in increasing size.
///
- /// The type of the biopolymer object.
- /// The base biopolymer object to apply variations to.
- /// List of SequenceVariation objects to combine and apply. Assumed not null or empty.
- /// Maximum number of combinations to return.
- ///
- /// An IEnumerable of TBioPolymerType objects, each with a unique combination of variations applied.
- ///
+ /// A biopolymer type that supports sequence variants.
+ /// The starting biopolymer (no variations applied).
+ /// Candidate variations to combine.
+ /// Maximum number of variants (including the base) to yield to bound combinatorial growth.
+ /// An enumerable of applied-variant biopolymers.
public static IEnumerable ApplyAllVariantCombinations(
TBioPolymerType baseBioPolymer,
List variations,
int maxCombinations)
where TBioPolymerType : IHasSequenceVariants
{
- int count = 0;
+ int count = 0; // number of variants yielded so far
- // Always yield the base biopolymer first
yield return baseBioPolymer;
count++;
if (count >= maxCombinations)
yield break;
- int n = variations.Count;
+ int n = variations.Count; // total variation count
for (int size = 1; size <= n; size++)
{
foreach (var combo in GetCombinations(variations, size))
{
- var result = baseBioPolymer;
+ var result = baseBioPolymer; // start from base and apply this combination in order
foreach (var variant in combo)
{
result = ApplySingleVariant(variant, result, string.Empty);
@@ -474,26 +551,27 @@ public static IEnumerable ApplyAllVariantCombinations
- /// Generates all possible combinations of the specified size from the input list.
+ /// Generates all k-combinations (order-independent, no repetition) of the given list.
+ /// This is a standard lexicographic index-based combinator that yields increasing index tuples.
///
- /// List of SequenceVariation objects to combine. Assumed not null or empty.
- /// The size of each combination.
- ///
- /// An IEnumerable of IList<SequenceVariation> representing each combination.
- ///
+ /// Source list to combine.
+ /// Combination size k (0 < k <= n).
+ /// An enumerable of read-only lists containing the selected variations.
private static IEnumerable> GetCombinations(List variations, int size)
{
int n = variations.Count;
var indices = new int[size];
- for (int i = 0; i < size; i++) indices[i] = i;
+ for (int i = 0; i < size; i++) indices[i] = i; // initial 0..k-1
while (true)
{
+ // Materialize current combination
var combo = new List(size);
for (int i = 0; i < size; i++)
combo.Add(variations[indices[i]]);
yield return combo;
+ // Advance to next lexicographic combination
int pos = size - 1;
while (pos >= 0 && indices[pos] == n - size + pos)
pos--;
@@ -503,15 +581,26 @@ private static IEnumerable> GetCombinations(List
+ /// Scans localized modifications for "nucleotide substitution" annotations of the form "X->Y" and converts them
+ /// into concrete objects at the associated positions.
+ /// The originating annotation-style modifications are removed from both the possible-localized and original-non-variant collections
+ /// (and from consensus mirrors) when they are fully consumed by the conversion.
+ ///
+ /// A biopolymer type that supports sequence variants.
+ /// The biopolymer whose modifications/variants are to be updated in-place.
public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants(this TBioPolymerType protein)
where TBioPolymerType : IHasSequenceVariants
{
+ // Collect mods to remove after converting them to sequence variants
List> modificationsToRemove = new();
- //convert substitution modifications to sequence variations
+
foreach (var kvp in protein.OneBasedPossibleLocalizedModifications)
{
foreach (Modification mod in kvp.Value)
{
+ // Look for annotation-style nucleotide substitutions (e.g., "A->G")
if (mod.ModificationType.Contains("nucleotide substitution") && mod.OriginalId.Contains("->"))
{
string[] originalAndSubstitutedAminoAcids = mod.OriginalId.Split(new[] { "->" }, StringSplitOptions.RemoveEmptyEntries);
@@ -520,12 +609,14 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants<
{
protein.SequenceVariations.Add(sequenceVariation);
}
+ // Defer removal to avoid mutating collection during enumeration
KeyValuePair pair = new(kvp.Key, mod);
modificationsToRemove.Add(pair);
}
}
}
- //remove the modifications that were converted to sequence variations
+
+ // Remove the consumed annotation-style modifications from both live and consensus dictionaries
foreach (KeyValuePair pair in modificationsToRemove)
{
if (protein.OneBasedPossibleLocalizedModifications.ContainsKey(pair.Key))
@@ -544,6 +635,7 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants<
}
}
}
+
if (protein.OriginalNonVariantModifications.ContainsKey(pair.Key))
{
List modList = protein.OriginalNonVariantModifications[pair.Key];
diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
index e0cbec9a7..7549726c0 100644
--- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs
+++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs
@@ -146,7 +146,7 @@ public VariantCallFormat(string description)
Description = description;
// Back-compat: if no real tabs are present but literal "\t" sequences are,
- // normalize them to actual tabs for parsing only. Leave Description intact.
+ // normalize them to actual tabs for parsing only. Leave VariantCallFormatDataString intact.
string parseLine = NormalizeTabsForParsing(description);
// Parse description into VCF fields
diff --git a/mzLib/Test/DatabaseTests/TestProteinReader.cs b/mzLib/Test/DatabaseTests/TestProteinReader.cs
index 7dcd0b4d8..e0768eef4 100644
--- a/mzLib/Test/DatabaseTests/TestProteinReader.cs
+++ b/mzLib/Test/DatabaseTests/TestProteinReader.cs
@@ -125,7 +125,7 @@ public static void XmlTest()
Assert.AreEqual(64, ok[0].SequenceVariations.First().OneBasedEndPosition);
Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedBeginPosition);
Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedEndPosition);
- Assert.AreNotEqual(ok[0].SequenceVariations.First().Description, ok[1].SequenceVariations.First().Description); //decoys and target variations don't have the same desc.
+ Assert.AreNotEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok[1].SequenceVariations.First().VariantCallFormatDataString); //decoys and target variations don't have the same desc.
Assert.AreEqual("Homo sapiens", ok[1].Organism);
}
@@ -420,8 +420,8 @@ public static void TestReverseDecoyXML_WithCustomIdentifier()
foreach (var variant in protein.AppliedSequenceVariations)
{
- Assert.That(variant.Description, Does.StartWith("rev"));
- Assert.That(variant.Description, Does.Not.StartWith("DECOY"));
+ Assert.That(variant.VariantCallFormatDataString, Does.StartWith("rev"));
+ Assert.That(variant.VariantCallFormatDataString, Does.Not.StartWith("DECOY"));
}
foreach (var bond in protein.DisulfideBonds)
diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
index babe44a76..e6df1c8d5 100644
--- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
+++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs
@@ -504,12 +504,12 @@ public void TestFullProteinReadWrite()
Assert.AreEqual(originalProtein.TruncationProducts.First().OneBasedEndPosition, proteinReadFromXml[0].TruncationProducts.First().OneBasedEndPosition);
Assert.AreEqual(originalProtein.TruncationProducts.First().Type, proteinReadFromXml[0].TruncationProducts.First().Type.Split('(')[0]);
- Assert.AreEqual(originalProtein.SequenceVariations.First().Description, proteinReadFromXml[0].SequenceVariations.First().Description);
+ Assert.AreEqual(originalProtein.SequenceVariations.First().VariantCallFormatDataString, proteinReadFromXml[0].SequenceVariations.First().VariantCallFormatDataString);
Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedBeginPosition);
Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedEndPosition);
Assert.AreEqual(originalProtein.SequenceVariations.First().OriginalSequence, proteinReadFromXml[0].SequenceVariations.First().OriginalSequence);
Assert.AreEqual(originalProtein.SequenceVariations.First().VariantSequence, proteinReadFromXml[0].SequenceVariations.First().VariantSequence);
- Assert.AreEqual(originalProtein.SequenceVariations.Last().Description, proteinReadFromXml[0].SequenceVariations.Last().Description);
+ Assert.AreEqual(originalProtein.SequenceVariations.Last().VariantCallFormatDataString, proteinReadFromXml[0].SequenceVariations.Last().VariantCallFormatDataString);
Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedBeginPosition);
Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedEndPosition);
Assert.AreEqual(originalProtein.SequenceVariations.Last().OriginalSequence, proteinReadFromXml[0].SequenceVariations.Last().OriginalSequence);
@@ -534,7 +534,7 @@ public void TestReadWriteSeqVars()
Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count());
Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition);
Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition);
- Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description);
+ Assert.AreEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok2[0].SequenceVariations.First().VariantCallFormatDataString);
Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence);
Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence);
}
@@ -557,7 +557,7 @@ public void TestReadWriteSeqVars2()
Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count());
Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition);
Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition);
- Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description);
+ Assert.AreEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok2[0].SequenceVariations.First().VariantCallFormatDataString);
Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence);
Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence);
}
diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
index 8ced7b208..53bf78897 100644
--- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs
+++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs
@@ -1,20 +1,21 @@
-using System;
-using System.Collections;
-using System.Collections.Generic;
-using System.IO;
-using System.Linq;
+using Chemistry;
+using MassSpectrometry;
using NUnit.Framework;
+using NUnit.Framework.Legacy;
+using Omics;
using Omics.BioPolymer;
-using Assert = NUnit.Framework.Legacy.ClassicAssert;
using Omics.Modifications;
using Proteomics;
using Proteomics.ProteolyticDigestion;
+using System;
+using System.Collections;
+using System.Collections.Generic;
+using System.IO;
+using System.Linq;
+using Transcriptomics;
using UsefulProteomicsDatabases;
+using Assert = NUnit.Framework.Legacy.ClassicAssert;
using Stopwatch = System.Diagnostics.Stopwatch;
-using Omics;
-using Transcriptomics;
-using MassSpectrometry;
-using Chemistry;
namespace Test.DatabaseTests
{
@@ -115,7 +116,7 @@ public static void SeqVarXmlTest()
{
Assert.AreEqual(s.OriginalSequence, decoy.BaseSequence.Substring(s.OneBasedBeginPosition - 1, s.OneBasedEndPosition - s.OneBasedBeginPosition + 1));
}
- Assert.AreNotEqual(target.SequenceVariations.First().Description, decoy.SequenceVariations.First().Description); //decoys and target variations don't have the same desc.
+ Assert.AreNotEqual(target.SequenceVariations.First().VariantCallFormatDataString, decoy.SequenceVariations.First().VariantCallFormatDataString); //decoys and target variations don't have the same desc.
List peptides = ok.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList();
}
@@ -284,76 +285,258 @@ public static void HomozygousVariantsAtVariedDepths(string filename, int minVari
var variantProteins = proteins[0].GetVariantBioPolymers();
List peptides = proteins.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList();
}
-
[Test]
public static void AppliedVariants()
{
- ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
- Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary>(), null, null, null, null, null);
+ // This test verifies that applying sequence variations (SAV, MNV, insertion, deletion)
+ // produces the correct variant protein sequences, maps variant coordinates correctly,
+ // preserves per-variant metadata (AppliedSequenceVariations), and remains stable across:
+ // - repeated in-memory application,
+ // - round-tripping through XML (write → read).
+ //
+ // Additionally, it verifies that a modification attached to a variant (protein5)
+ // is persisted and localized at the expected one-based position after application
+ // and after XML reload.
+ // Define a simple P-specific modification used later to validate that modifications
+ // attached to a variant are preserved and localized correctly after applying variants
+ // and XML round-tripping.
+ ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
+ Modification mp = new Modification(
+ _originalId: "mod",
+ _accession: null,
+ _modificationType: "type",
+ _featureType: null,
+ _target: motifP,
+ _locationRestriction: "Anywhere.",
+ _chemicalFormula: null,
+ _monoisotopicMass: 42.01,
+ _databaseReference: new Dictionary>(),
+ _taxonomicRange: null,
+ _keywords: null,
+ _neutralLosses: null,
+ _diagnosticIons: null,
+ _fileOrigin: null);
+
+ // Prepare five proteins that each have one sequence variation:
+ // 1) protein1: Single Amino-acid Variant (SAV) P→V at position 4 (4..4)
+ // 2) protein2: Multi-Nucleotide Variant (MNV) PT→KT spanning positions 4..5
+ // 3) protein3: Insertion-like replacement: P→PPP at position 4 (longer variant)
+ // 4) protein4: Deletion-like replacement: PPP→P spanning 4..6 (shorter variant)
+ // 5) protein5: Same as (3) but with a modification attached at one-based index 5
+ // to verify mod persistence/localization through variant application and XML.
List proteinsWithSeqVars = new List
{
- new Protein("MPEPTIDE", "protein1", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
- new Protein("MPEPTIDE", "protein2", sequenceVariations: new List { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
- new Protein("MPEPTIDE", "protein3", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
- new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
- new Protein("MPEPTIDE", "protein5", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary> {{ 5, new[] { mp }.ToList() } }) }),
- };
+ new Protein("MPEPTIDE", "protein1",
+ sequenceVariations: new List
+ {
+ // SAV: P(4) -> V(4)
+ new SequenceVariation(4, 4, "P", "V",
+ @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+ }),
+ new Protein("MPEPTIDE", "protein2",
+ sequenceVariations: new List
+ {
+ // MNV: PT(4..5) -> KT(4..5)
+ new SequenceVariation(4, 5, "PT", "KT",
+ @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+ }),
+ new Protein("MPEPTIDE", "protein3",
+ sequenceVariations: new List
+ {
+ // Insertion-like: P(4) -> PPP(4..6) (length +2)
+ new SequenceVariation(4, 4, "P", "PPP",
+ @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+ }),
+ new Protein("MPEPPPTIDE", "protein4",
+ sequenceVariations: new List
+ {
+ // Deletion-like: PPP(4..6) -> P(4) (length -2)
+ new SequenceVariation(4, 6, "PPP", "P",
+ @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null)
+ }),
+ new Protein("MPEPTIDE", "protein5",
+ sequenceVariations: new List
+ {
+ // Insertion-like with a downstream mod to verify mod localization at 5
+ new SequenceVariation(4, 4, "P", "PPP",
+ @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30",
+ new Dictionary> { { 5, new[] { mp }.ToList() } })
+ }),
+ };
+
+ // Apply variants in memory twice; the output should be stable and identical.
var proteinsWithAppliedVariants = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList();
var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); // should be stable
+
+ // Round-trip through XML: write the variant-bearing proteins (targets only) and read back.
+ // This ensures that variant application and metadata survive I/O and result identically.
string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml");
ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteinsWithSeqVars, xml);
var proteinsWithAppliedVariants3 = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un);
+ // Compare across all three sources:
+ // - [0]: in-memory 1
+ // - [1]: in-memory 2 (should match [0])
+ // - [2]: XML reload (should match [0] and [1])
var listArray = new[] { proteinsWithAppliedVariants, proteinsWithAppliedVariants2, proteinsWithAppliedVariants3 };
+
+ // Assert we always get exactly five variant proteins in the same order
+ // (SAV, MNV, insertion, deletion, insertion+mod).
+ Assert.AreEqual(5, proteinsWithAppliedVariants.Count, "Expected 5 applied variants (in-memory #1).");
+ Assert.AreEqual(5, proteinsWithAppliedVariants2.Count, "Expected 5 applied variants (in-memory #2).");
+ Assert.AreEqual(5, proteinsWithAppliedVariants3.Count, "Expected 5 applied variants (XML reload).");
+
+ // The expected sequences for each of the five applied variants, in order:
+ // 0: "MPEVTIDE" (SAV at 4: P->V)
+ // 1: "MPEKTIDE" (MNV at 4..5: PT->KT)
+ // 2: "MPEPPPTIDE" (Insertion-like at 4: P->PPP; length +2)
+ // 3: "MPEPTIDE" (Deletion-like at 4..6: PPP->P; length -2 from "MPEPPPTIDE")
+ // 4: "MPEPPPTIDE" (Insertion-like with mod at 5)
for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
{
// sequences
- Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence);
- Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence);
- Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence);
- Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence);
- Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence);
- Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key);
-
+ Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence, $"[{dbIdx}] SAV sequence mismatch");
+ Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence, $"[{dbIdx}] MNV sequence mismatch");
+ Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence, $"[{dbIdx}] insertion sequence mismatch");
+ Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence, $"[{dbIdx}] deletion sequence mismatch");
+ Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion+mod sequence mismatch");
+
+ // Confirm the modification attached to protein5 survives application and XML round-trip
+ Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] Expected exactly 1 mod on the insertion+mod variant");
+ Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key, $"[{dbIdx}] Mod should localize to one-based position 5");
+ // Sanity: ensure the residue under the mod is indeed P
+ Assert.AreEqual('P', listArray[dbIdx][4].BaseSequence[5 - 1], $"[{dbIdx}] Residue at mod position should be 'P'");
+
+ // SAV expectations: single-residue change; length unchanged; position 4 is 'V'
+ Assert.AreEqual(8, listArray[dbIdx][0].BaseSequence.Length, $"[{dbIdx}] SAV length should be unchanged");
+ Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] SAV begin mismatch");
+ Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] SAV end mismatch");
+ Assert.AreEqual("P", listArray[dbIdx][0].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] SAV original should be 'P'");
+ Assert.AreEqual("V", listArray[dbIdx][0].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] SAV variant should be 'V'");
+ Assert.AreEqual('V', listArray[dbIdx][0].BaseSequence[4 - 1], $"[{dbIdx}] SAV residue at 4 should be 'V'");
+
+ // MNV expectations: multi-residue change; length unchanged; positions 4..5 become "KT"
+ Assert.AreEqual(8, listArray[dbIdx][1].BaseSequence.Length, $"[{dbIdx}] MNV length should be unchanged");
+ Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] MNV begin mismatch");
+ Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] MNV end mismatch");
+ Assert.AreEqual("PT", listArray[dbIdx][1].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] MNV original should be 'PT'");
+ Assert.AreEqual("KT", listArray[dbIdx][1].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] MNV variant should be 'KT'");
+ Assert.AreEqual("KT", listArray[dbIdx][1].BaseSequence.Substring(4 - 1, 2), $"[{dbIdx}] MNV residues 4..5 should be 'KT'");
+
+ // insertion expectations: length grows by +2; positions 4..6 are "PPP"
+ Assert.AreEqual(10, listArray[dbIdx][2].BaseSequence.Length, $"[{dbIdx}] insertion length should be 8 + 2 = 10");
+ Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] insertion begin mismatch");
+ Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] insertion end mismatch");
+ Assert.AreEqual("P", listArray[dbIdx][2].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] insertion original should be 'P'");
+ Assert.AreEqual("PPP", listArray[dbIdx][2].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] insertion variant should be 'PPP'");
+ Assert.AreEqual("PPP", listArray[dbIdx][2].BaseSequence.Substring(4 - 1, 3), $"[{dbIdx}] insertion residues 4..6 should be 'PPP'");
+
+ // deletion expectations: length shrinks by -2 relative to the starting "MPEPPPTIDE" (10 → 8)
+ // and positions collapse so that sequence returns to "MPEPTIDE".
+ Assert.AreEqual(8, listArray[dbIdx][3].BaseSequence.Length, $"[{dbIdx}] deletion length should be 10 - 2 = 8");
+ Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] deletion begin mismatch");
+ Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] deletion end mismatch (post-collapse)");
+ Assert.AreEqual("PPP", listArray[dbIdx][3].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] deletion original should be 'PPP'");
+ Assert.AreEqual("P", listArray[dbIdx][3].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] deletion variant should be 'P'");
+
+ // For completeness, also assert the summarized begin/end expectations that the original test verified.
// SAV
Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition);
Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
// MNV
Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition);
Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
// insertion
Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition);
Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
// deletion
Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition);
Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition);
}
- }
+ // Ensure exact stability across the three sources:
+ // - All sequences in in-memory #1 equal in-memory #2 and XML reload.
+ CollectionAssert.AreEqual(
+ proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+ proteinsWithAppliedVariants2.Select(p => p.BaseSequence).ToList(),
+ "In-memory application should be stable across repeated calls");
+ CollectionAssert.AreEqual(
+ proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+ proteinsWithAppliedVariants3.Select(p => p.BaseSequence).ToList(),
+ "XML round-trip should preserve variant-applied sequences in the same order");
+ }
[Test]
public static void AppliedVariants_AsIBioPolymer()
{
- ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
- Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary>(), null, null, null, null, null);
+ // PURPOSE
+ // This test mirrors "AppliedVariants" but exercises the IBioPolymer interface path.
+ // It validates, in detail:
+ // 1) Correct application of four variant types (SAV, MNV, insertion, deletion) to produce expected sequences.
+ // 2) Correct coordinates (begin/end), original/variant strings in AppliedSequenceVariations after application.
+ // 3) Stability of results across:
+ // - repeated in-memory variant application,
+ // - XML round-trip (write → read).
+ // 4) Modification propagation and localization for a variant carrying a downstream mod.
+ // 5) Distinguishing two variants with identical sequences by their modification state (index 2 vs 4).
+ //
+ // EXPECTATIONS SUMMARY
+ // - Variant sequences (in order): "MPEVTIDE", "MPEKTIDE", "MPEPPPTIDE", "MPEPTIDE", "MPEPPPTIDE".
+ // - AppliedSequenceVariations:
+ // SAV (idx 0): begin=4, end=4, original="P", variant="V", len=8
+ // MNV (idx 1): begin=4, end=5, original="PT", variant="KT", len=8
+ // Insertion (idx 2): begin=4, end=6, original="P", variant="PPP", len=10
+ // Deletion (idx 3): begin=4, end=4, original="PPP", variant="P", len=8
+ // Insert+Mod(idx 4): same sequence as insertion, plus exactly one mod at one-based index 5 targeting a 'P'.
+ //
+ // NOTE: Lists [0], [1], [2] below represent:
+ // [0] in-memory application (first call)
+ // [1] in-memory application (second call, should be identical/stable)
+ // [2] XML round-trip results, must match [0] and [1]
+ // Arrange: create a P-specific modification used to test propagation and localization
+ ModificationMotif.TryGetMotif("P", out ModificationMotif motifP);
+ Modification mp = new Modification(
+ _originalId: "mod",
+ _accession: null,
+ _modificationType: "type",
+ _featureType: null,
+ _target: motifP,
+ _locationRestriction: "Anywhere.",
+ _chemicalFormula: null,
+ _monoisotopicMass: 42.01,
+ _databaseReference: new Dictionary>(),
+ _taxonomicRange: null,
+ _keywords: null,
+ _neutralLosses: null,
+ _diagnosticIons: null,
+ _fileOrigin: null);
+
+ // Arrange: build five proteins (as IBioPolymer) with one sequence variation each
+ // 1) SAV P(4)->V
+ // 2) MNV PT(4..5)->KT
+ // 3) Insertion-like P(4)->PPP(4..6)
+ // 4) Deletion-like PPP(4..6)->P(4) on a longer starting sequence
+ // 5) Insertion-like with a downstream mod at one-based index 5
List proteinsWithSeqVars = new List
{
- new Protein("MPEPTIDE", "protein1", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
- new Protein("MPEPTIDE", "protein2", sequenceVariations: new List { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
- new Protein("MPEPTIDE", "protein3", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
- new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
- new Protein("MPEPTIDE", "protein5", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary> {{ 5, new[] { mp }.ToList() } }) }),
+ new Protein("MPEPTIDE", "protein1", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+ new Protein("MPEPTIDE", "protein2", sequenceVariations: new List { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+ new Protein("MPEPTIDE", "protein3", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+ new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }),
+ new Protein("MPEPTIDE", "protein5", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary> { { 5, new[] { mp }.ToList() } }) }),
};
+
+ // Act: apply variants in-memory twice (should be identical/stable)
var proteinsWithAppliedVariants = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList();
- var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); // should be stable
+ var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList();
+
+ // Act: write to XML and load back; results should match in-memory outputs
string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml");
ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteinsWithSeqVars, xml);
var proteinsWithAppliedVariants3 = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un);
+ // Group lists for uniform validation loops
var listArray = new List[]
{
proteinsWithAppliedVariants,
@@ -361,34 +544,114 @@ public static void AppliedVariants_AsIBioPolymer()
proteinsWithAppliedVariants3.Cast().ToList()
};
- for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
+ // Assert: each expansion produces exactly 5 variant biopolymers, in the same, predictable order
+ Assert.AreEqual(5, proteinsWithAppliedVariants.Count, "Expected 5 variants (in-memory 1)");
+ Assert.AreEqual(5, proteinsWithAppliedVariants2.Count, "Expected 5 variants (in-memory 2)");
+ Assert.AreEqual(5, proteinsWithAppliedVariants3.Count, "Expected 5 variants (XML reload)");
+
+ // Assert: stability across the three sources (same sequences, same order)
+ CollectionAssert.AreEqual(
+ proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+ proteinsWithAppliedVariants2.Select(p => p.BaseSequence).ToList(),
+ "In-memory application should be stable across repeated calls");
+ CollectionAssert.AreEqual(
+ proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(),
+ proteinsWithAppliedVariants3.Select(p => p.BaseSequence).ToList(),
+ "XML round-trip should preserve variant-applied sequences in the same order");
+
+ // Assert: for all variants we expect exactly one applied sequence variation
+ foreach (var variants in listArray)
{
- // sequences
- Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence);
- Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence);
- Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence);
- Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence);
- Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence);
- Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key);
-
- // SAV
- Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition);
- Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition);
-
- // MNV
- Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition);
- Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition);
+ Assert.AreEqual(1, variants[0].AppliedSequenceVariations.Count, "SAV must have exactly one applied variant");
+ Assert.AreEqual(1, variants[1].AppliedSequenceVariations.Count, "MNV must have exactly one applied variant");
+ Assert.AreEqual(1, variants[2].AppliedSequenceVariations.Count, "Insertion must have exactly one applied variant");
+ Assert.AreEqual(1, variants[3].AppliedSequenceVariations.Count, "Deletion must have exactly one applied variant");
+ Assert.AreEqual(1, variants[4].AppliedSequenceVariations.Count, "Insertion+Mod must have exactly one applied variant");
+ }
- // insertion
- Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition);
- Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition);
+ // Per-list validation of sequence, coordinates, and (where appropriate) residue checks and mod checks
+ for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
+ {
+ // Assert: expected sequences in fixed order
+ Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence, $"[{dbIdx}] SAV sequence mismatch");
+ Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence, $"[{dbIdx}] MNV sequence mismatch");
+ Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence, $"[{dbIdx}] insertion sequence mismatch");
+ Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence, $"[{dbIdx}] deletion sequence mismatch");
+ Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion+mod sequence mismatch");
+
+ // Assert: lengths (sanity for ins/del)
+ Assert.AreEqual(8, listArray[dbIdx][0].BaseSequence.Length, $"[{dbIdx}] SAV length should be unchanged");
+ Assert.AreEqual(8, listArray[dbIdx][1].BaseSequence.Length, $"[{dbIdx}] MNV length should be unchanged");
+ Assert.AreEqual(10, listArray[dbIdx][2].BaseSequence.Length, $"[{dbIdx}] insertion length should be 8 + 2 = 10");
+ Assert.AreEqual(8, listArray[dbIdx][3].BaseSequence.Length, $"[{dbIdx}] deletion length should be 10 - 2 = 8");
+ Assert.AreEqual(10, listArray[dbIdx][4].BaseSequence.Length, $"[{dbIdx}] insertion+mod length should be 10");
+
+ // SAV assertions: P(4)->V
+ var sav = listArray[dbIdx][0].AppliedSequenceVariations.Single();
+ Assert.AreEqual(4, sav.OneBasedBeginPosition, $"[{dbIdx}] SAV begin");
+ Assert.AreEqual(4, sav.OneBasedEndPosition, $"[{dbIdx}] SAV end");
+ Assert.AreEqual("P", sav.OriginalSequence, $"[{dbIdx}] SAV original");
+ Assert.AreEqual("V", sav.VariantSequence, $"[{dbIdx}] SAV variant");
+ Assert.AreEqual('V', listArray[dbIdx][0].BaseSequence[3], $"[{dbIdx}] SAV residue at 4 must be 'V'");
+
+ // MNV assertions: PT(4..5)->KT
+ var mnv = listArray[dbIdx][1].AppliedSequenceVariations.Single();
+ Assert.AreEqual(4, mnv.OneBasedBeginPosition, $"[{dbIdx}] MNV begin");
+ Assert.AreEqual(5, mnv.OneBasedEndPosition, $"[{dbIdx}] MNV end");
+ Assert.AreEqual("PT", mnv.OriginalSequence, $"[{dbIdx}] MNV original");
+ Assert.AreEqual("KT", mnv.VariantSequence, $"[{dbIdx}] MNV variant");
+ Assert.AreEqual("KT", listArray[dbIdx][1].BaseSequence.Substring(3, 2), $"[{dbIdx}] MNV residues 4..5 must be 'KT'");
+
+ // Insertion-like assertions: P(4)->PPP(4..6)
+ var ins = listArray[dbIdx][2].AppliedSequenceVariations.Single();
+ Assert.AreEqual(4, ins.OneBasedBeginPosition, $"[{dbIdx}] insertion begin");
+ Assert.AreEqual(6, ins.OneBasedEndPosition, $"[{dbIdx}] insertion end");
+ Assert.AreEqual("P", ins.OriginalSequence, $"[{dbIdx}] insertion original");
+ Assert.AreEqual("PPP", ins.VariantSequence, $"[{dbIdx}] insertion variant");
+ Assert.AreEqual("PPP", listArray[dbIdx][2].BaseSequence.Substring(3, 3), $"[{dbIdx}] insertion residues 4..6 must be 'PPP'");
+
+ // Deletion-like assertions: PPP(4..6)->P(4) (collapses back to MPEPTIDE)
+ var del = listArray[dbIdx][3].AppliedSequenceVariations.Single();
+ Assert.AreEqual(4, del.OneBasedBeginPosition, $"[{dbIdx}] deletion begin");
+ Assert.AreEqual(4, del.OneBasedEndPosition, $"[{dbIdx}] deletion end (post-collapse)");
+ Assert.AreEqual("PPP", del.OriginalSequence, $"[{dbIdx}] deletion original");
+ Assert.AreEqual("P", del.VariantSequence, $"[{dbIdx}] deletion variant");
+
+ // Insertion + Modification assertions: identical sequence to insertion, plus one mod at pos 5
+ var insMod = listArray[dbIdx][4].AppliedSequenceVariations.Single();
+ Assert.AreEqual(4, insMod.OneBasedBeginPosition, $"[{dbIdx}] insertion+mod begin");
+ Assert.AreEqual(6, insMod.OneBasedEndPosition, $"[{dbIdx}] insertion+mod end");
+ Assert.AreEqual("P", insMod.OriginalSequence, $"[{dbIdx}] insertion+mod original");
+ Assert.AreEqual("PPP", insMod.VariantSequence, $"[{dbIdx}] insertion+mod variant");
+
+ // Mods: Only the "insertion+mod" variant (index 4) carries a mod; all others should have none
+ // Confirm the variant 4 has exactly one mod at one-based position 5, and the residue at 5 is 'P' (motif).
+ if (dbIdx == 0 || dbIdx == 1 || dbIdx == 2) // only assert detailed mod state once per path for clarity
+ {
+ // Index 2 (plain insertion) must have no mods
+ Assert.AreEqual(0, listArray[dbIdx][2].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion should have no mods");
+
+ // Index 4 (insertion+mod) must have exactly one mod at site 5, targeting a P residue
+ Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion+mod should have exactly one site with mods");
+ Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key, $"[{dbIdx}] insertion+mod site should be one-based index 5");
+ Assert.AreEqual('P', listArray[dbIdx][4].BaseSequence[5 - 1], $"[{dbIdx}] insertion+mod residue at site 5 must be 'P' (motif)");
+
+ // All other variants should have zero possible localized mods
+ Assert.AreEqual(0, listArray[dbIdx][0].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] SAV should have no mods");
+ Assert.AreEqual(0, listArray[dbIdx][1].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] MNV should have no mods");
+ Assert.AreEqual(0, listArray[dbIdx][3].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] deletion should have no mods");
+ }
+ }
- // deletion
- Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition);
- Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition);
+ // Additional cross-list checks:
+ // - The two "MPEPPPTIDE" variants (indices 2 and 4) should be sequence-identical, but differ by modification presence.
+ for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++)
+ {
+ Assert.AreEqual(listArray[dbIdx][2].BaseSequence, listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion and insertion+mod sequences must match");
+ Assert.AreEqual(0, listArray[dbIdx][2].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion should remain unmodified");
+ Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion+mod should remain modified");
}
}
-
[Test]
public static void CrashOnCreateVariantFromRNA()
{
@@ -401,43 +664,277 @@ public static void CrashOnCreateVariantFromRNA()
proteins[0].CreateVariant(proteins[0].BaseSequence, rna, [], [], new Dictionary>(), "");
});
}
-
[Test]
public static void StopGained()
{
- var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), true,
- DecoyType.None, null, false, null, out var unknownModifications);
+ // PURPOSE
+ // This test validates correct handling of a stop-gained sequence variation (creation of a premature stop codon).
+ // Verifies two scenarios:
+ // 1) Default variant depth filtering: both reference (no variant applied) and alternate (stop-gained applied) proteins are emitted.
+ // 2) High min-allele-depth threshold: only the stop-gained (applied) protein is retained.
+ //
+ // EXPECTATIONS SUMMARY (based on the StopGained.xml test data):
+ // - Two proteins are initially produced (reference + variant-applied):
+ // [0] Reference protein:
+ // * 1 sequence variation present in metadata, but 0 applied (reference form).
+ // * BaseSequence length = 191.
+ // * Residue at one-based position 161 equals 'Q' (so zero-based index 160 is 'Q').
+ // [1] Variant-applied protein (stop-gained):
+ // * Exactly 1 applied variation.
+ // * BaseSequence length truncated to 161 - 1 = 160 (stop codon at 161 shortens the sequence).
+ // * The sequence is exactly the prefix of the reference up to length 160.
+ // * No '*' appears in the resulting BaseSequence (the stop codon is not a literal character in the sequence).
+ // * The applied variant's VariantSequence ends with '*'.
+ // * The applied variant one-based begin (and end) position is 161.
+ //
+ // - With minAlleleDepth: 400
+ // * Only the variant-applied protein is returned.
+ // * It retains the same applied-variation and truncated length expectations.
+
+ // Load the proteins with default filtering
+ var proteins = ProteinDbLoader.LoadProteinXML(
+ Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+ true, // generateTargets
+ DecoyType.None, // decoyType
+ null, // allKnownModifications
+ false, // isContaminant
+ null, // modTypesToExclude
+ out var unknownModifications);
+
+ // Sanity: Decoys are not requested
+ Assert.IsTrue(proteins.All(p => !p.IsDecoy), "No decoys expected when using DecoyType.None");
+
+ // Expect exactly two proteins: reference (no applied variant) and stop-gained (applied)
+ Assert.AreEqual(2, proteins.Count, "Expected reference and stop-gained variant proteins");
+ // Reference protein metadata: one possible sequence variation in the record
+ Assert.AreEqual(1, proteins[0].SequenceVariations.Count(), "Reference metadata should contain one sequence variation");
+ Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Reference metadata should contain exactly one unique sequence variation");
+ // Reference should have zero applied variations (reference form retained)
+ Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Count(), "Reference protein should not have an applied sequence variation");
+ Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Reference applied variations should be zero");
+
+ // Variant-applied protein: applied variation present and unique
+ Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Count(), "Variant protein should have exactly one applied variation");
+ Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Variant protein applied variations should be unique");
+
+ // Reference length and residue checks around the stop site
+ Assert.AreEqual(191, proteins[0].Length, "Reference protein length should match source data");
+ Assert.AreEqual('Q', proteins[0][161 - 1], "Reference residue at 161 should be 'Q' prior to stop-gain");
+
+ // Variant length must be truncated by the stop at position 161 → length becomes 160
+ Assert.AreEqual(161 - 1, proteins[1].Length, "Variant protein length should be truncated to 160 due to stop at 161");
+
+ // The variant BaseSequence must be exactly the prefix of the reference up to the stop position - 1
+ string reference = proteins[0].BaseSequence;
+ string variant = proteins[1].BaseSequence;
+ Assert.IsTrue(reference.StartsWith(variant), "Variant sequence must be a prefix of the reference sequence");
+ Assert.AreEqual(reference.Substring(0, 161 - 1), variant, "Variant sequence must equal reference[0..159]");
+
+ // Ensure we did not write literal '*' into the protein sequence; stop codon is represented by truncation instead
+ Assert.AreEqual(-1, variant.IndexOf('*'), "Variant BaseSequence should not contain a literal '*'");
+
+ // Verify applied-variant details for the stop-gained protein
+ var applied = proteins[1].AppliedSequenceVariations.Single();
+ Assert.IsTrue(applied.VariantSequence.EndsWith("*"), "Stop-gained variant must end with '*'");
+ Assert.AreEqual(161, applied.OneBasedBeginPosition, "Stop-gained begins at residue 161");
+ Assert.AreEqual(161, applied.OneBasedEndPosition, "Stop-gained ends at residue 161 (single residue change)");
+
+ // The two forms must differ in sequence length (as a sanity check)
+ Assert.AreNotEqual(proteins[0].Length, proteins[1].Length, "Reference and variant proteins should differ in length");
+
+ // Now require a higher min-allele-depth; expect only the variant-applied protein retained
+ proteins = ProteinDbLoader.LoadProteinXML(
+ Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+ true, // generateTargets
+ DecoyType.None, // decoyType
+ null, // allKnownModifications
+ false, // isContaminant
+ null, // modTypesToExclude
+ out unknownModifications,
+ minAlleleDepth: 400);
+
+ // Only the stop-gained, variant-applied form is retained under a strict depth threshold
+ Assert.AreEqual(1, proteins.Count, "High min-allele-depth should retain only the variant-applied protein");
+ Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Count(), "Variant-applied protein should still have one applied variation");
+ Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Variant-applied unique variation should be retained");
+ Assert.AreEqual(161 - 1, proteins[0].Length, "Variant-applied protein length should remain truncated to 160");
+
+ // Confirm stability: the single protein from the depth-filtered load matches the previously observed variant sequence
+ Assert.AreEqual(variant, proteins[0].BaseSequence, "Depth-filtered variant sequence should match previously observed variant");
+
+ // Re-check applied-variant semantics after filtering for completeness
+ var appliedAfterFilter = proteins[0].AppliedSequenceVariations.Single();
+ Assert.IsTrue(appliedAfterFilter.VariantSequence.EndsWith("*"), "Stop-gained variant must end with '*' (after filtering)");
+ Assert.AreEqual(161, appliedAfterFilter.OneBasedBeginPosition, "Stop-gained begins at residue 161 (after filtering)");
+ Assert.AreEqual(161, appliedAfterFilter.OneBasedEndPosition, "Stop-gained ends at residue 161 (after filtering)");
+ }
+ [Test]
+ public static void StopGained_TruncationIsPrefixAndNoOutOfBoundsAnnotations()
+ {
+ var proteins = ProteinDbLoader.LoadProteinXML(
+ Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+ true, DecoyType.None, null, false, null, out var _);
+
Assert.AreEqual(2, proteins.Count);
- Assert.AreEqual(1, proteins[0].SequenceVariations.Count()); // some redundant
- Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
- Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Count()); // some redundant
- Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
- Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Count()); // some redundant
- Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
- Assert.AreEqual(191, proteins[0].Length);
- Assert.AreEqual('Q', proteins[0][161 - 1]);
- Assert.AreEqual(161 - 1, proteins[1].Length);
- Assert.AreNotEqual(proteins[0].Length, proteins[1].Length);
+ var reference = proteins[0];
+ var truncated = proteins[1];
- proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), true,
- DecoyType.None, null, false, null, out unknownModifications, minAlleleDepth: 400);
- Assert.AreEqual(1, proteins.Count);
- Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Count()); // some redundant
- Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes
- Assert.AreEqual(161 - 1, proteins[0].Length);
+ // The truncated sequence must be a prefix of the reference,
+ // i.e., identical up to the truncation point.
+ Assert.That(reference.BaseSequence.StartsWith(truncated.BaseSequence));
+
+ // Any possible localized modifications must not point past the truncation boundary.
+ Assert.That(truncated.OneBasedPossibleLocalizedModifications
+ .All(kv => kv.Key >= 1 && kv.Key <= truncated.Length));
+
+ // Any proteolysis products (if present) must not reference indices outside the sequence.
+ Assert.That(truncated.TruncationProducts.All(tp =>
+ (!tp.OneBasedBeginPosition.HasValue || (tp.OneBasedBeginPosition.Value >= 1 && tp.OneBasedBeginPosition.Value <= truncated.Length)) &&
+ (!tp.OneBasedEndPosition.HasValue || (tp.OneBasedEndPosition.Value >= 1 && tp.OneBasedEndPosition.Value <= truncated.Length))));
+
+ // The applied stop-gained variation often encodes a '*' in the variant sequence.
+ // If present, that indicates stop; the actual sequence is cut at the stop.
+ if (truncated.AppliedSequenceVariations.Any())
+ {
+ Assert.That(truncated.AppliedSequenceVariations.Single().VariantSequence.EndsWith("*") ||
+ !truncated.AppliedSequenceVariations.Single().VariantSequence.Contains("*"));
+ }
}
+ [Test]
+ public static void StopGained_RoundTripSerializationPreservesTruncation()
+ {
+ var proteins = ProteinDbLoader.LoadProteinXML(
+ Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+ true, DecoyType.None, null, false, null, out var _);
+
+ Assert.AreEqual(2, proteins.Count);
+ var tempPath = Path.Combine(TestContext.CurrentContext.TestDirectory, $"StopGained_roundtrip_{Guid.NewGuid()}.xml");
+ try
+ {
+ // Persist both proteins (reference + variant) and reload.
+ ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins, tempPath);
+ var roundtrip = ProteinDbLoader.LoadProteinXML(tempPath, true, DecoyType.None, null, false, null, out var __);
+
+ // Round-trip preserves count and the truncation boundary for the variant-applied protein.
+ Assert.AreEqual(2, roundtrip.Count);
+ Assert.AreEqual(proteins[0].Length, roundtrip[0].Length);
+ Assert.AreEqual(proteins[1].Length, roundtrip[1].Length);
+ Assert.AreEqual(proteins[1].AppliedSequenceVariations.Count(), roundtrip[1].AppliedSequenceVariations.Count());
+ }
+ finally
+ {
+ if (File.Exists(tempPath))
+ {
+ File.SetAttributes(tempPath, FileAttributes.Normal);
+ File.Delete(tempPath);
+ }
+ }
+ }
[Test]
- public static void StopGainedDecoysAndDigestion()
+ public static void StopGained_NoPeptidesCrossTruncationSite()
{
- // test decoys and digestion
- var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGain.xml"), true,
- DecoyType.Reverse, null, false, null, out var unknownModifications, minAlleleDepth: 400);
+ var proteins = ProteinDbLoader.LoadProteinXML(
+ Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"),
+ true, DecoyType.None, null, false, null, out var _);
+
Assert.AreEqual(2, proteins.Count);
- var targetPeps = proteins[0].Digest(new DigestionParams(), null, null).ToList();
- var decoyPeps = proteins[1].Digest(new DigestionParams(), null, null).ToList();
- //Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length));
- //Assert.AreEqual(targetPeps.Count, decoyPeps.Count);
+ var reference = proteins[0];
+ var truncated = proteins[1];
+
+ // Peptides from the truncated protein must not reference indices past the truncation boundary.
+ var dp = new DigestionParams();
+ var variantPeps = truncated.Digest(dp, null, null).ToList();
+ Assert.That(variantPeps.All(p => p.OneBasedEndResidueInProtein <= truncated.Length));
+
+ // Any peptide in the reference that extends past the truncation boundary cannot exist in the variant.
+ var refPeps = reference.Digest(dp, null, null).ToList();
+ var refCrossing = refPeps.Where(p => p.OneBasedEndResidueInProtein > truncated.Length).ToList();
+ var variantPepWindows = new HashSet<(int start, int end)>(variantPeps.Select(p => (p.OneBasedStartResidueInProtein, p.OneBasedEndResidueInProtein)));
+ Assert.That(refCrossing.All(p => !variantPepWindows.Contains((p.OneBasedStartResidueInProtein, p.OneBasedEndResidueInProtein))));
+ }
+ [Test]
+ public static void StopGainedDecoysAndDigestion()
+ {
+ // PURPOSE
+ // This test ensures that:
+ // 1) Reverse decoys are generated for a stop-gained (truncated) target protein.
+ // 2) The target and its decoy digest into peptides without referencing residues outside their sequence bounds.
+ // 3) Basic invariants hold for decoy generation (count/order/length/decoy flag).
+ //
+ // CONTEXT
+ // - Database: "StopGain.xml" contains a target protein with a stop-gained variant that truncates the sequence.
+ // - Decoys: Generated using DecoyType.Reverse (sequence reversal-based decoys).
+ // - minAlleleDepth: 400 ensures the stop-gained variant is applied (truncated target).
+ //
+ // EXPECTATIONS
+ // - Exactly 2 proteins are returned: [0] target (non-decoy) + [1] decoy (IsDecoy = true).
+ // - Target and decoy lengths are equal (reverse decoys preserve length).
+ // - Both target and decoy produce peptides via digestion.
+ // - No peptide references indices outside its parent protein's 1..Length range.
+ // - Accessions reflect decoy generation (decoy starts with the default "DECOY_").
+ // - If variant(s) are present, their counts match between target and decoy.
+
+ // Arrange: Load a variant-applied protein set and reverse decoy pair from StopGain.xml.
+ // Using a strict minAlleleDepth applies the stop-gained variant to the target.
+ var proteins = ProteinDbLoader.LoadProteinXML(
+ Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGain.xml"),
+ true, // generateTargets
+ DecoyType.Reverse, // generate reverse-sequence decoys
+ null, // allKnownModifications
+ false, // isContaminant
+ null, // modTypesToExclude
+ out var unknownModifications,
+ minAlleleDepth: 400); // force applying the stop-gained variant
+
+ // Assert: We expect exactly two proteins: target then its decoy.
+ Assert.AreEqual(2, proteins.Count, "Expected 1 target + 1 decoy");
+ Assert.IsFalse(proteins[0].IsDecoy, "First protein should be the target (non-decoy)");
+ Assert.IsTrue(proteins[1].IsDecoy, "Second protein should be the decoy");
+ Assert.That(proteins[1].Accession.StartsWith("DECOY_"), "Decoy accession should start with the default decoy identifier");
+
+ // Assert: Reverse decoys should preserve sequence length.
+ Assert.AreEqual(proteins[0].Length, proteins[1].Length, "Target and decoy should have identical lengths");
+ // In general, target and decoy sequences should not be byte-identical.
+ Assert.AreNotEqual(proteins[0].BaseSequence, proteins[1].BaseSequence, "Decoy sequence should differ from target sequence");
+
+ // If the stop-gained variant is applied to the target, the decoy should carry a corresponding variant count.
+ // We do not enforce exact mapping positions here, only that counts match if any are present.
+ if (proteins[0].AppliedSequenceVariations.Any() || proteins[1].AppliedSequenceVariations.Any())
+ {
+ Assert.AreEqual(
+ proteins[0].AppliedSequenceVariations.Count(),
+ proteins[1].AppliedSequenceVariations.Count(),
+ "Target and decoy should carry the same number of applied sequence variations");
+ }
+
+ // Act: Digest both target and decoy using default digestion parameters (typically trypsin-like rules).
+ var dp = new DigestionParams();
+ var targetPeps = proteins[0].Digest(dp, null, null).ToList();
+ var decoyPeps = proteins[1].Digest(dp, null, null).ToList();
+
+ // Assert: Both should yield peptides.
+ Assert.That(targetPeps.Count > 0, "Target digestion should produce peptides");
+ Assert.That(decoyPeps.Count > 0, "Decoy digestion should produce peptides");
+
+ // Assert: No peptide references residues outside the corresponding protein bounds.
+ Assert.That(targetPeps.All(p =>
+ p.OneBasedStartResidueInProtein >= 1 &&
+ p.OneBasedEndResidueInProtein <= proteins[0].Length),
+ "All target peptides must fall within target bounds");
+
+ Assert.That(decoyPeps.All(p =>
+ p.OneBasedStartResidueInProtein >= 1 &&
+ p.OneBasedEndResidueInProtein <= proteins[1].Length),
+ "All decoy peptides must fall within decoy bounds");
+
+ // Note:
+ // We intentionally do NOT assert the number of peptides or the sum of peptide lengths to be equal between
+ // target and decoy. Even with reverse decoys, tryptic cleavage context differs and may alter cleavage patterns.
+ // The commented lines below are often too strict and can fail legitimately:
+ //
+ // Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length));
+ // Assert.AreEqual(targetPeps.Count, decoyPeps.Count);
}
[Test]
@@ -490,7 +987,7 @@ public void VariantSymbolWeirdnessXml()
string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "SeqVarSymbolWeirdness.xml");
List variantProteins = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un);
Assert.AreEqual(12, variantProteins.First().ConsensusVariant.SequenceVariations.Count());
- Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.Description.Heterozygous.Any(kv => kv.Value)));
+ Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.VariantCallFormatDataString.Heterozygous.Any(kv => kv.Value)));
Assert.AreEqual(1, variantProteins.Count); // Should be 2^2 from combinitorics of heterozygous, but the giant indels overwrite them
Assert.AreEqual(0, variantProteins.Where(v => v.BaseSequence == variantProteins.First().ConsensusVariant.BaseSequence).Count()); // Homozygous variations are included
@@ -560,21 +1057,159 @@ public void IndelDecoyVariants()
[Test]
public void SequenceVariationIsValidTest()
{
- SequenceVariation sv1 = new SequenceVariation(10, 10, "A", "T", "info", null);
- SequenceVariation sv2 = new SequenceVariation(5, 5, "G", "C", "info", null);
- SequenceVariation sv3 = new SequenceVariation(8, 8, "T", "A", "info", null);
+ // PURPOSE
+ // Validate the minimal, position-only "validity" rules implemented by SequenceVariation.AreValid():
+ // AreValid() == (OneBasedBeginPosition > 0) && (OneBasedEndPosition >= OneBasedBeginPosition)
+ //
+ // We cover:
+ // 1) Explicit begin/end ctor with typical point mutations → valid.
+ // 2) Explicit begin/end ctor with invalid positions → invalid.
+ // 3) Explicit begin/end ctor representing insertion/deletion edge-cases → valid as long as positions are valid.
+ // 4) One-position convenience ctor behavior for different originalSequence values (null, "", length > 0).
+ // - This ctor derives end as: end = (original == null) ? begin : begin + original.Length - 1.
+ // - Therefore, empty originalSequence "" makes end = begin - 1 → invalid by design.
+ // 5) Content fields (Original/Variant) and OneBasedModifications do NOT affect AreValid(), only positions do.
+ // 6) Optional sanity checks on derived fields (SimpleString and computed end position).
+
+ // -----------------------------
+ // 1) Valid: explicit begin/end point mutations (begin == end, begin > 0)
+ // -----------------------------
+ SequenceVariation sv1 = new SequenceVariation(
+ oneBasedBeginPosition: 10, oneBasedEndPosition: 10,
+ originalSequence: "A", variantSequence: "T",
+ description: "info", oneBasedModifications: null);
+ SequenceVariation sv2 = new SequenceVariation(
+ oneBasedBeginPosition: 5, oneBasedEndPosition: 5,
+ originalSequence: "G", variantSequence: "C",
+ description: "info", oneBasedModifications: null);
+ SequenceVariation sv3 = new SequenceVariation(
+ oneBasedBeginPosition: 8, oneBasedEndPosition: 8,
+ originalSequence: "T", variantSequence: "A",
+ description: "info", oneBasedModifications: null);
+
+ // A protein can carry multiple variations; positions alone determine validity.
List svList = new List { sv1, sv2, sv3 };
-
Protein variantProtein = new Protein("ACDEFGHIKLMNPQRSTVWY", "protein1", sequenceVariations: svList);
- Assert.IsTrue(variantProtein.SequenceVariations.All(v => v.AreValid()));
- SequenceVariation svInvalidOneBasedBeginLessThanOne = new SequenceVariation(0, 10, "A", "T", "info", null);
- SequenceVariation svInvalidOneBasedEndLessThanOneBasedBegin = new SequenceVariation(5, 4, "G", "C", "info", null);
- SequenceVariation svValidOriginalSequenceIsEmpty = new SequenceVariation(8, 8, "", "A", "info", null);
- SequenceVariation svValidVariantSequenceLenthIsZero = new SequenceVariation(10, 10, "A", "", "info", null);
- Assert.IsFalse(svInvalidOneBasedBeginLessThanOne.AreValid());
- Assert.IsFalse(svInvalidOneBasedEndLessThanOneBasedBegin.AreValid());
- Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid()); //This is valid because it is an insertion
- Assert.IsTrue(svValidVariantSequenceLenthIsZero.AreValid()); // This is valid because it is a deletion
+
+ // Expectation: all three above are valid (begin > 0 and end == begin).
+ Assert.IsTrue(variantProtein.SequenceVariations.All(v => v.AreValid()), "All explicit point mutations with valid positions should be valid");
+
+ // -----------------------------
+ // 2) Invalid: begin < 1 and end < begin
+ // -----------------------------
+ SequenceVariation svInvalidOneBasedBeginLessThanOne = new SequenceVariation(
+ oneBasedBeginPosition: 0, oneBasedEndPosition: 10,
+ originalSequence: "A", variantSequence: "T",
+ description: "info", oneBasedModifications: null);
+ Assert.IsFalse(svInvalidOneBasedBeginLessThanOne.AreValid(), "Begin position must be >= 1");
+
+ SequenceVariation svInvalidOneBasedEndLessThanOneBasedBegin = new SequenceVariation(
+ oneBasedBeginPosition: 5, oneBasedEndPosition: 4,
+ originalSequence: "G", variantSequence: "C",
+ description: "info", oneBasedModifications: null);
+ Assert.IsFalse(svInvalidOneBasedEndLessThanOneBasedBegin.AreValid(), "End position cannot be less than begin position");
+
+ // -----------------------------
+ // 3) Explicit begin/end edge-cases: insertion and deletion modeled by content only
+ // NOTE: AreValid ignores Original/Variant content; only positions matter.
+ // -----------------------------
+ // Insertion-like (explicit): original is empty (""), variant has content.
+ // Valid because we explicitly supply begin == end (positions are valid).
+ SequenceVariation svValidOriginalSequenceIsEmpty = new SequenceVariation(
+ oneBasedBeginPosition: 8, oneBasedEndPosition: 8,
+ originalSequence: "", variantSequence: "A",
+ description: "info", oneBasedModifications: null);
+ Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid(), "Explicit insertion with valid positions should be considered valid");
+
+ // Deletion-like (explicit): variant is empty (""), original has content, positions still valid.
+ SequenceVariation svValidVariantSequenceLengthIsZero = new SequenceVariation(
+ oneBasedBeginPosition: 10, oneBasedEndPosition: 10,
+ originalSequence: "A", variantSequence: "",
+ description: "info", oneBasedModifications: null);
+ Assert.IsTrue(svValidVariantSequenceLengthIsZero.AreValid(), "Explicit deletion with valid positions should be considered valid");
+
+ // -----------------------------
+ // 4) One-position convenience ctor behavior for originalSequence values
+ // ctor: SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, ...)
+ // end is computed as:
+ // - if original == null → end = begin
+ // - else → end = begin + original.Length - 1
+ // -----------------------------
+
+ // 4a) originalSequence has length 1 → end == begin (valid)
+ var svPosCtorLength1 = new SequenceVariation(
+ oneBasedPosition: 15,
+ originalSequence: "K",
+ variantSequence: "R",
+ description: "pos-ctor length 1");
+ Assert.AreEqual(15, svPosCtorLength1.OneBasedBeginPosition);
+ Assert.AreEqual(15, svPosCtorLength1.OneBasedEndPosition, "With original length 1, end should equal begin");
+ Assert.IsTrue(svPosCtorLength1.AreValid(), "Single-site variation via position ctor should be valid");
+
+ // 4b) originalSequence has length > 1 → end == begin + len - 1 (valid)
+ var svPosCtorLength3 = new SequenceVariation(
+ oneBasedPosition: 20,
+ originalSequence: "PEP", // len = 3
+ variantSequence: "AAA", // content irrelevant to AreValid
+ description: "pos-ctor length 3");
+ Assert.AreEqual(20, svPosCtorLength3.OneBasedBeginPosition);
+ Assert.AreEqual(22, svPosCtorLength3.OneBasedEndPosition, "End should be begin + original.Length - 1");
+ Assert.IsTrue(svPosCtorLength3.AreValid(), "Multi-length replacement with valid positions should be valid");
+
+ // 4c) originalSequence == null → end = begin (valid)
+ var svPosCtorNullOriginal = new SequenceVariation(
+ oneBasedPosition: 30,
+ originalSequence: null, // special case handled in ctor: end = begin
+ variantSequence: "A",
+ description: "pos-ctor null original");
+ Assert.AreEqual(30, svPosCtorNullOriginal.OneBasedBeginPosition);
+ Assert.AreEqual(30, svPosCtorNullOriginal.OneBasedEndPosition, "Null original should set end = begin");
+ Assert.IsTrue(svPosCtorNullOriginal.AreValid(), "Null original via position ctor is treated as length 1 (valid)");
+
+ // 4d) originalSequence == "" (empty) → end = begin - 1 (invalid by design)
+ // This models an insertion if you rely solely on the position ctor, but produces end < begin → invalid.
+ // For insertions, prefer the explicit begin/end ctor with valid positions (see 3).
+ var svPosCtorEmptyOriginal = new SequenceVariation(
+ oneBasedPosition: 40,
+ originalSequence: "", // empty → end = 39
+ variantSequence: "A",
+ description: "pos-ctor empty original");
+ Assert.AreEqual(40, svPosCtorEmptyOriginal.OneBasedBeginPosition);
+ Assert.AreEqual(39, svPosCtorEmptyOriginal.OneBasedEndPosition, "Empty original sets end = begin - 1");
+ Assert.IsFalse(svPosCtorEmptyOriginal.AreValid(), "Position ctor with empty original is invalid (end < begin)");
+
+ // -----------------------------
+ // 5) Validity is position-only; content and mods do not change AreValid()
+ // - VariantSequence null is normalized to "" in the ctor.
+ // - OneBasedModifications is stored but ignored by AreValid().
+ // -----------------------------
+ var mods = new Dictionary>
+ {
+ { 1, new List() } // empty mod list at a site; still ignored by AreValid
+ };
+ var svContentIrrelevant = new SequenceVariation(
+ oneBasedBeginPosition: 3, oneBasedEndPosition: 3,
+ originalSequence: "M", variantSequence: null, // becomes ""
+ description: "mods/variant null test", oneBasedModifications: mods);
+ Assert.IsTrue(svContentIrrelevant.AreValid(), "Null variant and/or mods should not affect positional validity");
+ Assert.AreEqual("", svContentIrrelevant.VariantSequence, "Null VariantSequence is normalized to empty string");
+
+ // -----------------------------
+ // 6) Sanity: SimpleString format and positive bounds at the edge of sequence
+ // -----------------------------
+ var svSimple = new SequenceVariation(
+ oneBasedBeginPosition: 1, oneBasedEndPosition: 1,
+ originalSequence: "A", variantSequence: "V",
+ description: "simple");
+ // SimpleString = Original + Begin + Variant (no delimiter)
+ Assert.AreEqual("A1V", svSimple.SimpleString(), "SimpleString should concatenate original + begin + variant");
+
+ // Additional guard: begin == 1 is valid if end >= begin
+ var svAtStart = new SequenceVariation(
+ oneBasedBeginPosition: 1, oneBasedEndPosition: 2,
+ originalSequence: "MA", variantSequence: "MV",
+ description: "range at start");
+ Assert.IsTrue(svAtStart.AreValid(), "Ranges that start at 1 are valid provided end >= begin");
}
[Test]
public void VariantModificationTest()
@@ -644,54 +1279,294 @@ public void WriteProteinXmlWithVariantsDiscoveredAsModifications2()
Assert.That(newProtein.SequenceVariations.Count, Is.EqualTo(totalSequenceVariations + 1)); //This number increases by 1 because we added a sequence variation that was discovered as a modification
Assert.AreEqual(0,newProtein.OneBasedPossibleLocalizedModifications.Count); //This number should be 0 because we converted the modification to a sequence variation
}
-
+ ///
+ /// PURPOSE
+ /// Ensures that variant proteins are automatically generated during XML read, both for targets and reverse decoys.
+ /// The database "humanGAPDH.xml" encodes two single-nucleotide substitutions on the target protein P04406:
+ /// - A22G
+ /// - K251N
+ ///
+ /// EXPECTATIONS
+ /// - Loader emits all combinatorial target variants derived from those two changes:
+ /// [0] Reference (no variants applied) → Accession "P04406"
+ /// [1] Single variant A22G → Accession "P04406_A22G"
+ /// [2] Single variant K251N → Accession "P04406_K251N"
+ /// [3] Double variant K251N + A22G (combined) → Accession "P04406_K251N_A22G"
+ /// - With DecoyType.Reverse, a matching set of 4 reverse decoys is produced with mirrored coordinates:
+ /// [4] Decoy of reference → "DECOY_P04406"
+ /// [5] Decoy with mirrored A22G (mapped to site 315) → "DECOY_P04406_A315G"
+ /// [6] Decoy with mirrored K251N (mapped to site 86) → "DECOY_P04406_K86N"
+ /// [7] Decoy with both mirrored variants → "DECOY_P04406_K86N_A315G"
+ ///
+ /// WHY THIS MATTERS
+ /// - Validates that the reader expands sequence variation definitions into concrete variant proteins.
+ /// - Verifies decoy generation mirrors variant coordinates appropriately and preserves ordering.
+ /// - Guards against regressions in accession naming, variant application counts, and decoy parity.
+ ///
+ /// PARAMETERS PASSED TO LOADER
+ /// - generateTargets: true → emit target proteins
+ /// - decoyType: Reverse → also emit reverse decoys
+ /// - allKnownModifications: UniProtPtms → resolve any UniProt-annotated PTMs so no "unknown" mods remain
+ /// - isContaminant: false
+ /// - modTypesToExclude: null
+ /// - out unknownModifications → capture any unrecognized mods (should be empty when UniProtPtms is provided)
+ /// - minAlleleDepth: 1 → do not filter out these low-depth test variants
+ /// - maxHeterozygousVariants: 99 → allow generating all combinations from the two sites (up to 2^2)
+ ///
[Test]
public static void TestThatProteinVariantsAreGeneratedDuringRead()
{
+ // Arrange: load a target with two site-specific variants and request reverse decoys as well.
+ // IMPORTANT: Provide UniProtPtms so annotations in humanGAPDH.xml resolve and do not end up in unknownModifications.
string databaseName = "humanGAPDH.xml";
- var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true,
- DecoyType.Reverse, null, false, null, out var unknownModifications, 1, 99);
- Assert.AreEqual(8, proteins.Count); // 4 target + 4 decoy
- Assert.AreEqual(2, proteins[0].SequenceVariations.Count()); // these sequence variations were in the original
- Assert.That("P04406", Is.EqualTo(proteins[0].Accession));
- Assert.That("P04406_A22G", Is.EqualTo(proteins[1].Accession));
- Assert.That("P04406_K251N", Is.EqualTo(proteins[2].Accession));
- Assert.That("P04406_K251N_A22G", Is.EqualTo(proteins[3].Accession));
- Assert.That("DECOY_P04406", Is.EqualTo(proteins[4].Accession));
- Assert.That("DECOY_P04406_A315G", Is.EqualTo(proteins[5].Accession));
- Assert.That("DECOY_P04406_K86N", Is.EqualTo(proteins[6].Accession));
- Assert.That("DECOY_P04406_K86N_A315G", Is.EqualTo(proteins[7].Accession));
+ var proteins = ProteinDbLoader.LoadProteinXML(
+ Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName),
+ generateTargets: true,
+ decoyType: DecoyType.Reverse,
+ allKnownModifications: UniProtPtms, // CHANGED: was null; supplying known PTMs prevents unknownModifications from being populated
+ isContaminant: false,
+ modTypesToExclude: null,
+ unknownModifications: out var unknownModifications,
+ minAlleleDepth: 1,
+ maxHeterozygousVariants: 99);
+
+ // Basic shape: 4 targets + 4 reverse decoys in a deterministic order.
+ Assert.AreEqual(8, proteins.Count, "Expected 4 targets and 4 decoys in a fixed order");
+
+ // Targets/decoys split and flags should be consistent and easy to reason about.
+ Assert.AreEqual(4, proteins.Count(p => !p.IsDecoy), "First half should be targets");
+ Assert.AreEqual(4, proteins.Count(p => p.IsDecoy), "Second half should be decoys");
+ for (int i = 0; i < 4; i++)
+ {
+ Assert.IsFalse(proteins[i].IsDecoy, $"Index {i} should be a target");
+ Assert.IsTrue(proteins[i + 4].IsDecoy, $"Index {i + 4} should be a decoy");
+ }
+
+ // The reference target (index 0) carries two possible sequence variations in its metadata.
+ // This documents the test input and ensures the reader surfaced them.
+ Assert.AreEqual(2, proteins[0].SequenceVariations.Count(), "Reference should advertise exactly two possible sequence variations");
+
+ // Accessions must match exact, canonical variant labeling and order for both targets and decoys.
+ Assert.That("P04406", Is.EqualTo(proteins[0].Accession), "Reference target accession mismatch");
+ Assert.That("P04406_A22G", Is.EqualTo(proteins[1].Accession), "Single-variant (A22G) target accession mismatch");
+ Assert.That("P04406_K251N", Is.EqualTo(proteins[2].Accession), "Single-variant (K251N) target accession mismatch");
+ Assert.That("P04406_K251N_A22G", Is.EqualTo(proteins[3].Accession), "Double-variant target accession mismatch");
+
+ Assert.That("DECOY_P04406", Is.EqualTo(proteins[4].Accession), "Reference decoy accession mismatch");
+ Assert.That("DECOY_P04406_A315G", Is.EqualTo(proteins[5].Accession), "Decoy accession for mirrored A22G mismatch");
+ Assert.That("DECOY_P04406_K86N", Is.EqualTo(proteins[6].Accession), "Decoy accession for mirrored K251N mismatch");
+ Assert.That("DECOY_P04406_K86N_A315G", Is.EqualTo(proteins[7].Accession), "Decoy accession for double-variant mismatch");
+
+ // Sanity: accessions are non-empty and unique (avoid accidental duplication/shuffling).
+ Assert.That(proteins.All(p => !string.IsNullOrWhiteSpace(p.Accession)), "All proteins must have non-empty accessions");
+ Assert.AreEqual(proteins.Count, proteins.Select(p => p.Accession).Distinct().Count(), "Accessions must be unique");
+
+ // Each decoy should be length-equal to its corresponding target, but usually sequence-different (reverse).
+ for (int i = 0; i < 4; i++)
+ {
+ Assert.AreEqual(proteins[i].Length, proteins[i + 4].Length, $"Target/decoy length should match for index {i}");
+ Assert.AreNotEqual(proteins[i].BaseSequence, proteins[i + 4].BaseSequence, $"Decoy sequence should differ from its target for index {i}");
+ }
+
+ // Applied variant counts (how many variations were actually realized in this protein instance):
+ // Targets: [0] ref=0, [1] A22G=1, [2] K251N=1, [3] both=2
+ // Decoys: [4] ref=0, [5] A315G=1, [6] K86N=1, [7] both=2
+ int[] expectedAppliedCounts = { 0, 1, 1, 2, 0, 1, 1, 2 };
+ for (int i = 0; i < proteins.Count; i++)
+ {
+ Assert.AreEqual(expectedAppliedCounts[i], proteins[i].AppliedSequenceVariations.Count(),
+ $"Applied variant count mismatch at index {i} ({proteins[i].Accession})");
+ }
+
+ // The specific applied-variant labels should match accessions:
+ // - For targets: "A22G" and/or "K251N"
+ // - For decoys: mirrored positions → "A315G" and/or "K86N"
+ static HashSet AppliedLabels(Protein p) =>
+ new HashSet(p.AppliedSequenceVariations.Select(v => v.SimpleString()));
+
+ // Targets (indices 0..3)
+ Assert.That(AppliedLabels(proteins[0]).SetEquals(Array.Empty()), "Reference target should have no applied variants");
+ Assert.That(AppliedLabels(proteins[1]).SetEquals(new[] { "A22G" }), "Single-variant target must be exactly A22G");
+ Assert.That(AppliedLabels(proteins[2]).SetEquals(new[] { "K251N" }), "Single-variant target must be exactly K251N");
+ Assert.That(AppliedLabels(proteins[3]).SetEquals(new[] { "A22G", "K251N" }), "Double-variant target should have A22G and K251N");
+
+ // Decoys (indices 4..7) have mirrored coordinates (due to reverse decoying).
+ Assert.That(AppliedLabels(proteins[4]).SetEquals(Array.Empty()), "Reference decoy should have no applied variants");
+ Assert.That(AppliedLabels(proteins[5]).SetEquals(new[] { "A315G" }), "Single-variant decoy must be exactly A315G (mirror of A22G)");
+ Assert.That(AppliedLabels(proteins[6]).SetEquals(new[] { "K86N" }), "Single-variant decoy must be exactly K86N (mirror of K251N)");
+ Assert.That(AppliedLabels(proteins[7]).SetEquals(new[] { "A315G", "K86N" }), "Double-variant decoy should have A315G and K86N");
+
+ // Parity check: each target and its decoy should carry the same number of applied variations.
+ for (int i = 0; i < 4; i++)
+ {
+ Assert.AreEqual(
+ proteins[i].AppliedSequenceVariations.Count(),
+ proteins[i + 4].AppliedSequenceVariations.Count(),
+ $"Applied variant count should match between target and decoy at index {i}");
+ }
+
+ // With UniProtPtms supplied, no unknown modifications should be reported.
+ if (unknownModifications != null && unknownModifications.Count > 0)
+ {
+ // Extra diagnostics to ease debugging in case of future schema/content changes.
+ TestContext.WriteLine("Unknown modifications reported by loader:");
+ foreach (var um in unknownModifications)
+ TestContext.WriteLine($" - {um}");
+ }
+ Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from this input");
}
+
[Test]
public static void ProteinVariantsReadAsModificationsWrittenAsVariants()
{
+ // PURPOSE
+ // This test verifies the I/O pipeline that converts "nucleotide substitution" modifications
+ // embedded in an input protein XML into canonical "sequence variant" features when read,
+ // and persists them back out as proper entries when written.
+ //
+ // WHY
+ // - Some sources (e.g., GPTMD discovery) encode AA substitutions as modifications with
+ // ModificationType = "1 nucleotide substitution". Internally, we want these represented
+ // as SequenceVariations, not remaining as generic modifications.
+ // - On read: these substitution mods must be removed from the protein’s modifications collections
+ // and turned into SequenceVariations.
+ // - On write: they must be serialized as sequence variant features with a standardized description
+ // ("Putative GPTMD Substitution") and NOT re-serialized as modifications.
+ //
+ // EXPECTATIONS SUMMARY
+ // - Input file has 57 lines that contain "1 nucleotide substitution" (source encoding as mods).
+ // - After LoadProteinXML:
+ // * We get exactly 9 proteins (DecoyType.None).
+ // * Total SequenceVariations across all proteins = 194.
+ // * Total OneBasedPossibleLocalizedModifications count across all proteins = 0
+ // (i.e., all substitution mods became sequence variations).
+ // * No unknown modifications should remain.
+ // * No applied variants are expected (metadata only; we are not expanding combinatorics here).
+ // - After WriteXmlDatabase then re-load:
+ // * Count and totals remain the same (9 proteins; 194 total sequence variations; 0 mods).
+ // * The output file contains:
+ // - Exactly 194 lines with feature type="sequence variant".
+ // - Exactly 194 lines with "Putative GPTMD Substitution".
+ // - Exactly 0 lines with "1 nucleotide substitution" (proved that mods were not serialized).
+ //
+ // NOTE
+ // - We keep DecoyType.None to avoid expanding the protein list.
+ // - We use minAlleleDepth = 1 and maxHeterozygousVariants = 0 to avoid variant-proteoform expansion.
+
+ // Arrange: locate the source database that encodes nucleotide substitutions as modifications
string databaseName = "nucleotideVariantsAsModifications.xml";
+ string inputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName);
+ Assert.That(File.Exists(inputPath), Is.True, "Input database file must exist for this test");
+
+ // Sanity: confirm the source encodes substitutions as modifications in the raw XML
+ // (57 lines that mention "1 nucleotide substitution" in the file).
+ var inputLines = File.ReadAllLines(inputPath);
+ int inputSubstitutionModLines = inputLines.Count(l => l.Contains("1 nucleotide substitution"));
+ Assert.That(inputSubstitutionModLines, Is.EqualTo(57), "Source XML should contain 57 substitution-mod lines");
+
+ // Optional sanity: the source should not already be in sequence-variant form
+ int inputSeqVarFeatureLines = inputLines.Count(l => l.Contains("feature type=\"sequence variant\""));
+ Assert.That(inputSeqVarFeatureLines, Is.EqualTo(0), "Source XML should not already encode sequence variants");
+
+ // Act: read the database. Expect conversion to SequenceVariations and removal from modifications
+ var proteins = ProteinDbLoader.LoadProteinXML(
+ inputPath,
+ generateTargets: true,
+ decoyType: DecoyType.None,
+ allKnownModifications: null,
+ isContaminant: false,
+ modTypesToExclude: null,
+ unknownModifications: out var unknownModifications,
+ minAlleleDepth: 1,
+ maxHeterozygousVariants: 0);
+
+ // Assert: no decoys requested, so all should be targets
+ Assert.That(proteins.All(p => !p.IsDecoy), "All proteins should be targets when DecoyType.None is used");
+
+ // Assert: exactly 9 proteins
+ Assert.AreEqual(9, proteins.Count, "Expected exactly 9 proteins from the input");
+
+ // Assert: the loader converted all substitution modifications into proper SequenceVariations
+ // and removed them from OneBasedPossibleLocalizedModifications
+ int totalSequenceVariations = proteins.Sum(p => p.SequenceVariations.Count);
+ int totalPossibleLocalizedMods = proteins.Sum(p => p.OneBasedPossibleLocalizedModifications.Values.Sum(v => v.Count));
+ Assert.AreEqual(194, totalSequenceVariations, "Total number of sequence variations after load must be 194");
+ Assert.AreEqual(0, totalPossibleLocalizedMods, "All substitution modifications should have been converted; none should remain as mods");
+
+ // FIX: Safely log unknown modifications (Dictionary), if any appear unexpectedly.
+ if (unknownModifications != null && unknownModifications.Count > 0)
+ {
+ TestContext.WriteLine("Unknown modifications encountered during read (unexpected):");
+ foreach (var kvp in unknownModifications)
+ {
+ var mod = kvp.Value;
+ var id = mod?.OriginalId ?? "";
+ var type = mod?.ModificationType ?? "";
+ TestContext.WriteLine($" - key={kvp.Key}, id={id}, type={type}");
+ }
+ }
+ Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications should remain after conversion");
- Assert.That(File.ReadAllLines(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName)).Count(l => l.Contains("1 nucleotide substitution")), Is.EqualTo(57));
+ // Assert: No applied variants expected in this test (we are validating representation, not expansion)
+ int totalAppliedVariants = proteins.Sum(p => p.AppliedSequenceVariations.Count());
+ Assert.That(totalAppliedVariants, Is.EqualTo(0), "No applied variants expected; variants are metadata here");
- var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true,
- DecoyType.None, null, false, null, out var unknownModifications, 1, 0);
- Assert.AreEqual(9, proteins.Count); // 1 target
- Assert.AreEqual(194, proteins.Select(v=>v.SequenceVariations.Count).Sum()); // there are no sequence variations in the original proteins
- Assert.AreEqual(0, proteins.Select(m => m.OneBasedPossibleLocalizedModifications.Sum(list=>list.Value.Count)).Sum()); // there are 194 sequence variants as modifications in the original proteins
+ // Assert: None of the proteins should carry "1 nucleotide substitution" as OriginalNonVariantModifications anymore
+ int residualSubstitutionMods =
+ proteins.Sum(p => p.OriginalNonVariantModifications.Values.Sum(list => list.Count(m => m.ModificationType == "1 nucleotide substitution")));
+ Assert.That(residualSubstitutionMods, Is.EqualTo(0), "No '1 nucleotide substitution' mods should remain in OriginalNonVariantModifications");
+ // Persist the converted representation and read it back; this file should contain only sequence-variant features
string tempDir = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString());
Directory.CreateDirectory(tempDir);
string tempFile = Path.Combine(tempDir, "xmlWithSequenceVariantsAndNoModifications.txt");
- ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins.Where(p => !p.IsDecoy).ToList(), tempFile);
- proteins = ProteinDbLoader.LoadProteinXML(tempFile, true,
- DecoyType.None, null, false, null, out unknownModifications, 1, 0);
- Assert.AreEqual(9, proteins.Count); // 1 target
- Assert.AreEqual(194, proteins.Select(v => v.SequenceVariations.Count).Sum()); // there are 194 sequence variations in the revised proteins
- Assert.AreEqual(0, proteins.Select(m => m.OneBasedPossibleLocalizedModifications.Sum(list => list.Value.Count)).Sum()); // there are 0 sequence variants as modifications in the original proteins
-
- Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("feature type=\"sequence variant\"")), Is.EqualTo(194));
- Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("Putative GPTMD Substitution")), Is.EqualTo(194));
- Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("1 nucleotide substitution")), Is.EqualTo(0));
- if (Directory.Exists(tempDir)) Directory.Delete(tempDir, true);
+ try
+ {
+ // Write only the targets (all are targets here). Expect the writer to emit sequence variant features.
+ ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins.Where(p => !p.IsDecoy).ToList(), tempFile);
+
+ // Inspect the written file contents directly for ground truth
+ var writtenLines = File.ReadAllLines(tempFile);
+ int writtenSeqVarFeatures = writtenLines.Count(l => l.Contains("feature type=\"sequence variant\""));
+ int writtenPutativeGptmd = writtenLines.Count(l => l.Contains("Putative GPTMD Substitution"));
+ int writtenSubstitutionMods = writtenLines.Count(l => l.Contains("1 nucleotide substitution"));
+
+ // Assert: writer produced only sequence variants (not substitution mods)
+ Assert.That(writtenSeqVarFeatures, Is.EqualTo(194), "All 194 substitutions must be serialized as sequence variant features");
+ Assert.That(writtenPutativeGptmd, Is.EqualTo(194), "All 194 variants should have the standardized description label");
+ Assert.That(writtenSubstitutionMods, Is.EqualTo(0), "No '1 nucleotide substitution' strings should remain in the written XML");
+
+ // Re-load the written file to confirm round-trip stability
+ proteins = ProteinDbLoader.LoadProteinXML(
+ tempFile,
+ generateTargets: true,
+ decoyType: DecoyType.None,
+ allKnownModifications: null,
+ isContaminant: false,
+ modTypesToExclude: null,
+ unknownModifications: out unknownModifications,
+ minAlleleDepth: 1,
+ maxHeterozygousVariants: 0);
+
+ // Assert: the round-tripped representation is identical in shape and counts
+ Assert.AreEqual(9, proteins.Count, "Round-trip must preserve protein count");
+ Assert.AreEqual(194, proteins.Sum(v => v.SequenceVariations.Count), "Round-trip must preserve total number of sequence variations");
+ Assert.AreEqual(0, proteins.Sum(m => m.OneBasedPossibleLocalizedModifications.Values.Sum(list => list.Count)),
+ "Round-trip must preserve the fact that no substitution mods remain");
+ Assert.That(unknownModifications == null || unknownModifications.Count == 0, "Round-trip should not introduce unknown modifications");
+ Assert.That(proteins.Sum(p => p.AppliedSequenceVariations.Count()), Is.EqualTo(0), "Round-trip should not apply any variants");
+ }
+ finally
+ {
+ // Cleanup test artifacts
+ if (Directory.Exists(tempDir))
+ {
+ try { File.SetAttributes(tempFile, FileAttributes.Normal); } catch { /* ignore */ }
+ Directory.Delete(tempDir, true);
+ }
+ }
}
-
[Test]
public void Constructor_ParsesDescriptionCorrectly()
{
@@ -770,5 +1645,319 @@ public void Constructor_ParsesDescriptionCorrectly()
var adValues = svd.AlleleDepths[adKey];
Assert.AreEqual(new[] { "30", "30" }, adValues);
}
+ [Test]
+ public void ParseComprehensiveVcfExamples()
+ {
+ string current = TestContext.CurrentContext.TestDirectory;
+ string vcfPath = null;
+ while (current != null)
+ {
+ var candidate = Path.Combine(current, "Test", "DatabaseTests", "vcf_comprehensive_examples.vcf");
+ if (File.Exists(candidate))
+ {
+ vcfPath = candidate;
+ break;
+ }
+ current = Directory.GetParent(current)?.FullName;
+ }
+
+ Assert.That(vcfPath, Is.Not.Null, "Could not locate vcf_comprehensive_examples.vcf");
+
+ var lines = File.ReadAllLines(vcfPath);
+
+ var dataRows = lines
+ .Where(l => !string.IsNullOrWhiteSpace(l))
+ .Where(l => !l.StartsWith("##"))
+ .Where(l => !l.StartsWith("#CHROM"))
+ .ToList();
+
+ Assert.That(dataRows.Count, Is.EqualTo(8), "Expected 8 example variant rows.");
+
+ for (int rowIndex = 0; rowIndex < dataRows.Count; rowIndex++)
+ {
+ string originalLine = dataRows[rowIndex];
+ string[] rawFields = originalLine.Split('\t');
+ Assert.That(rawFields.Length, Is.GreaterThanOrEqualTo(10), $"Row {rowIndex + 1}: insufficient columns.");
+
+ var vcf = new VariantCallFormat(originalLine);
+
+ Assert.That(vcf.Description, Is.EqualTo(originalLine));
+ Assert.That(vcf.ReferenceAlleleString, Is.EqualTo(rawFields[3]));
+ Assert.That(vcf.AlternateAlleleString, Is.EqualTo(rawFields[4]));
+ Assert.That(vcf.Format, Is.EqualTo(rawFields[8]));
+
+ if (rawFields[7] == ".")
+ {
+ Assert.That(vcf.Info.Annotation, Is.EqualTo(rawFields[7]));
+ }
+
+ var sampleFields = rawFields.Skip(9).ToArray();
+ Assert.That(vcf.Genotypes.Count, Is.EqualTo(sampleFields.Length));
+ Assert.That(vcf.AlleleDepths.Count, Is.EqualTo(sampleFields.Length));
+ Assert.That(vcf.Homozygous.Count, Is.EqualTo(sampleFields.Length));
+ Assert.That(vcf.Heterozygous.Count, Is.EqualTo(sampleFields.Length));
+ Assert.That(vcf.ZygosityBySample.Count, Is.EqualTo(sampleFields.Length));
+
+ for (int sampleIndex = 0; sampleIndex < sampleFields.Length; sampleIndex++)
+ {
+ string sample = sampleFields[sampleIndex];
+ string key = sampleIndex.ToString();
+
+ string[] parts = sample.Split(':');
+ Assert.That(parts.Length, Is.EqualTo(vcf.Format.Split(':').Length));
+
+ string gtPart = parts[0];
+ string adPart = parts.Length > 1 ? parts[1] : null;
+
+ // Expected GT tokens
+ string[] expectedGtTokens = gtPart.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries);
+ if (gtPart.Contains('.') && expectedGtTokens.Length == 1 &&
+ (gtPart == "./." || gtPart == ".|." || gtPart == ".|1" || gtPart == "0|." || gtPart == "0/."))
+ {
+ expectedGtTokens = new[] { ".", "." };
+ }
+
+ Assert.That(vcf.Genotypes.ContainsKey(key));
+ var parsedGt = vcf.Genotypes[key];
+ Assert.That(parsedGt, Is.EqualTo(expectedGtTokens));
+
+ // Expected AD tokens
+ string[] expectedAdTokens =
+ string.IsNullOrWhiteSpace(adPart) ? Array.Empty() :
+ adPart == "." ? new[] { "." } :
+ adPart.Split(',');
+
+ Assert.That(vcf.AlleleDepths.ContainsKey(key));
+ var parsedAd = vcf.AlleleDepths[key] ?? Array.Empty();
+ if (parsedAd.Length != 0 || expectedAdTokens.Length != 1 || expectedAdTokens[0] != ".")
+ {
+ Assert.That(parsedAd, Is.EqualTo(expectedAdTokens));
+ }
+
+ // Expected zygosity using ONLY non-missing alleles (must mirror implementation)
+ var calledAlleles = parsedGt.Where(a => a != ".").ToArray();
+ bool expectedHom = calledAlleles.Length > 0 && calledAlleles.Distinct().Count() == 1;
+ bool expectedHet = calledAlleles.Distinct().Count() > 1;
+ VariantCallFormat.Zygosity expectedZ =
+ calledAlleles.Length == 0
+ ? VariantCallFormat.Zygosity.Unknown
+ : expectedHet
+ ? VariantCallFormat.Zygosity.Heterozygous
+ : VariantCallFormat.Zygosity.Homozygous;
+
+ Assert.That(vcf.Homozygous[key], Is.EqualTo(expectedHom));
+ Assert.That(vcf.Heterozygous[key], Is.EqualTo(expectedHet));
+ Assert.That(vcf.ZygosityBySample[key], Is.EqualTo(expectedZ));
+ }
+ }
+ }
+ [Test]
+ public void Constructor_InvalidCoordinates_ThrowsArgumentException()
+ {
+ // Minimal valid VCF line (10 columns) so VariantCallFormat parses without truncation.
+ // Arrange: end < begin (invalid coordinates)
+ var sv = new SequenceVariation(
+ oneBasedBeginPosition: 5,
+ oneBasedEndPosition: 4,
+ originalSequence: "A",
+ variantSequence: "V",
+ description: "invalid-coords",
+ oneBasedModifications: null);
+
+ // Assert: SequenceVariation does not throw on construction; it reports invalid via AreValid()
+ Assert.That(sv.AreValid(), Is.False);
+ }
+ // Helper to create a minimal substitution modification matching the required detection pattern
+ private static Modification Substitution(string idArrow)
+ {
+ // If you want this helper to be convertible by the code under test,
+ // give it a matching motif for the site where it will be placed.
+ // For now keep it generic (unused in this test).
+ return new Modification(
+ idArrow, // originalId
+ null, // accession
+ "1 nucleotide substitution", // modificationType
+ null, // featureType
+ null, // target motif
+ "Anywhere.", // location restriction
+ null, // chemical formula
+ 0, // monoisotopic mass
+ new Dictionary>(), // databaseReference
+ null, // taxonomicRange
+ null, // keywords
+ null, // neutralLosses
+ null, // diagnosticIons
+ null); // fileOrigin
+ }
+
+ // Non-substitution (should be ignored)
+ private static Modification Other(string id, double mass = 15.9949)
+ {
+ // Generic oxidation at P motif (unused by main test path)
+ ModificationMotif.TryGetMotif("P", out var motifP);
+ return new Modification(
+ id,
+ null,
+ "oxidation",
+ null,
+ motifP,
+ "Anywhere.",
+ null,
+ mass,
+ new Dictionary>(),
+ null,
+ null,
+ null,
+ null,
+ null);
+ }
+
+ // Malformed substitution (no "->" pattern) must be ignored
+ private static Modification Malformed()
+ {
+ ModificationMotif.TryGetMotif("Q", out var motifQ);
+ return new Modification(
+ "E>A",
+ null,
+ "1 nucleotide substitution",
+ null,
+ motifQ,
+ "Anywhere.",
+ null,
+ 0,
+ new Dictionary>(),
+ null,
+ null,
+ null,
+ null,
+ null);
+ }
+
+ [Test]
+ public void ConvertNucleotideSubstitutionModificationsToSequenceVariants_Comprehensive()
+ {
+ // 1 M, 2 A, 3 E, 4 W, 5 P, 6 Q, 7 K
+ var protein = new Protein("MAEWPQK", "TEST_PROT");
+
+ static Modification MakeSub(string idArrow, char originalResidue)
+ {
+ ModificationMotif.TryGetMotif(originalResidue.ToString(), out var motif);
+ return new Modification(
+ idArrow,
+ null,
+ "1 nucleotide substitution",
+ null,
+ motif,
+ "Anywhere.",
+ null,
+ 0,
+ new Dictionary>(),
+ null,
+ null,
+ null,
+ null,
+ null);
+ }
+
+ static Modification MakeOther(string id)
+ {
+ ModificationMotif.TryGetMotif("P", out var motifP);
+ return new Modification(
+ id,
+ null,
+ "oxidation",
+ null,
+ motifP,
+ "Anywhere.",
+ null,
+ 15.9949,
+ new Dictionary>(),
+ null,
+ null,
+ null,
+ null,
+ null);
+ }
+
+ static Modification MakeMalformed()
+ {
+ ModificationMotif.TryGetMotif("Q", out var motifQ);
+ return new Modification(
+ "E>A",
+ null,
+ "1 nucleotide substitution",
+ null,
+ motifQ,
+ "Anywhere.",
+ null,
+ 0,
+ new Dictionary>(),
+ null,
+ null,
+ null,
+ null,
+ null);
+ }
+
+ void AddMod(Protein p, int pos, Modification m)
+ {
+ if (!p.OneBasedPossibleLocalizedModifications.TryGetValue(pos, out var list1))
+ {
+ list1 = new List();
+ p.OneBasedPossibleLocalizedModifications[pos] = list1;
+ }
+ list1.Add(m);
+
+ if (!p.OriginalNonVariantModifications.TryGetValue(pos, out var list2))
+ {
+ list2 = new List();
+ p.OriginalNonVariantModifications[pos] = list2;
+ }
+ list2.Add(m);
+ }
+
+ // Mods to seed
+ var modEtoA = MakeSub("E->A", 'E'); // pos 3
+ var modWtoK = MakeSub("W->K", 'W'); // pos 4
+ var modOxidP = MakeOther("Oxid_P"); // pos 5
+ var malformed = MakeMalformed(); // pos 6
+
+ AddMod(protein, 3, modEtoA);
+ AddMod(protein, 4, modWtoK);
+ AddMod(protein, 5, modOxidP);
+ AddMod(protein, 6, malformed);
+
+ // Pre-existing W->K (may be duplicated by converter if description differs)
+ protein.SequenceVariations.Add(new SequenceVariation(4, 4, "W", "K", "Existing substitution"));
+
+ // Act
+ protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants();
+
+ // Assert unique AA changes, not raw count (converter may add standardized duplicates)
+ var uniqueChanges = protein.SequenceVariations.Select(v => v.SimpleString()).Distinct().ToList();
+ Assert.That(uniqueChanges.Count, Is.EqualTo(2), "Expected exactly two unique substitutions (E3->A and W4->K).");
+
+ // Ensure E3->A exists
+ var eToA = protein.SequenceVariations.SingleOrDefault(v =>
+ v.OneBasedBeginPosition == 3 && v.OneBasedEndPosition == 3 &&
+ v.OriginalSequence == "E" && v.VariantSequence == "A");
+ Assert.That(eToA, Is.Not.Null, "E3->A variant was not created.");
+
+ // Ensure at least one W4->K exists
+ var wToKCount = protein.SequenceVariations.Count(v =>
+ v.OneBasedBeginPosition == 4 && v.OneBasedEndPosition == 4 &&
+ v.OriginalSequence == "W" && v.VariantSequence == "K");
+ Assert.That(wToKCount, Is.GreaterThanOrEqualTo(1), "Expected a W4->K variant.");
+
+ // Converted positions removed from OneBasedPossibleLocalizedModifications
+ Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(3), Is.False);
+ Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(4), Is.False);
+
+ // Unrelated and malformed mods remain
+ Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(5), Is.True);
+ Assert.That(protein.OneBasedPossibleLocalizedModifications[5].Any(m => m.OriginalId == "Oxid_P"), Is.True);
+ Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(6), Is.True);
+ Assert.That(protein.OneBasedPossibleLocalizedModifications[6].Any(m => m.OriginalId == "E>A"), Is.True);
+ }
}
}
\ No newline at end of file
diff --git a/mzLib/Test/DatabaseTests/truncationsExpected.tsv b/mzLib/Test/DatabaseTests/truncationsExpected.tsv
new file mode 100644
index 000000000..8a35f98a3
--- /dev/null
+++ b/mzLib/Test/DatabaseTests/truncationsExpected.tsv
@@ -0,0 +1,137 @@
+Sequence Type Begin End RetainedMethionine
+MALWMRLLPLLALLALWGPDPAAA Target 1 24 TRUE
+FVNQHLCGSHLVEALYLVCGERGFFYTPKT Target 25 54 FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEGSLQ Target 57 87 FALSE
+GIVEQCCTSICSLYQLENYCN Target 90 110 FALSE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 1 110 TRUE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 2 110 FALSE
+LWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 3 110 FALSE
+WMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 4 110 FALSE
+MRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 5 110 FALSE
+RLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 6 110 FALSE
+LLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 7 110 FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYC Target 2 109 FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY Target 2 108 FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLEN Target 2 107 FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLE Target 2 106 FALSE
+ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQL Target 2 105 FALSE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYC Target 1 109 TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY Target 1 108 TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLEN Target 1 107 TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLE Target 1 106 TRUE
+MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQL Target 1 105 TRUE
+ALWMRLLPLLALLALWGPDPAAA Target 2 24 FALSE
+LWMRLLPLLALLALWGPDPAAA Target 3 24 FALSE
+WMRLLPLLALLALWGPDPAAA Target 4 24 FALSE
+MRLLPLLALLALWGPDPAAA Target 5 24 FALSE
+RLLPLLALLALWGPDPAAA Target 6 24 FALSE
+LLPLLALLALWGPDPAAA Target 7 24 FALSE
+ALWMRLLPLLALLALWGPDPAA Target 2 23 FALSE
+ALWMRLLPLLALLALWGPDPA Target 2 22 FALSE
+ALWMRLLPLLALLALWGPDP Target 2 21 FALSE
+ALWMRLLPLLALLALWGPD Target 2 20 FALSE
+ALWMRLLPLLALLALWGP Target 2 19 FALSE
+MALWMRLLPLLALLALWGPDPAA Target 1 23 TRUE
+MALWMRLLPLLALLALWGPDPA Target 1 22 TRUE
+MALWMRLLPLLALLALWGPDP Target 1 21 TRUE
+MALWMRLLPLLALLALWGPD Target 1 20 TRUE
+MALWMRLLPLLALLALWGP Target 1 19 TRUE
+VNQHLCGSHLVEALYLVCGERGFFYTPKT Target 26 54 FALSE
+NQHLCGSHLVEALYLVCGERGFFYTPKT Target 27 54 FALSE
+QHLCGSHLVEALYLVCGERGFFYTPKT Target 28 54 FALSE
+HLCGSHLVEALYLVCGERGFFYTPKT Target 29 54 FALSE
+LCGSHLVEALYLVCGERGFFYTPKT Target 30 54 FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYTPK Target 25 53 FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYTP Target 25 52 FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYT Target 25 51 FALSE
+FVNQHLCGSHLVEALYLVCGERGFFY Target 25 50 FALSE
+FVNQHLCGSHLVEALYLVCGERGFF Target 25 49 FALSE
+AEDLQVGQVELGGGPGAGSLQPLALEGSLQ Target 58 87 FALSE
+EDLQVGQVELGGGPGAGSLQPLALEGSLQ Target 59 87 FALSE
+DLQVGQVELGGGPGAGSLQPLALEGSLQ Target 60 87 FALSE
+LQVGQVELGGGPGAGSLQPLALEGSLQ Target 61 87 FALSE
+QVGQVELGGGPGAGSLQPLALEGSLQ Target 62 87 FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEGSL Target 57 86 FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEGS Target 57 85 FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALEG Target 57 84 FALSE
+EAEDLQVGQVELGGGPGAGSLQPLALE Target 57 83 FALSE
+EAEDLQVGQVELGGGPGAGSLQPLAL Target 57 82 FALSE
+IVEQCCTSICSLYQLENYCN Target 91 110 FALSE
+VEQCCTSICSLYQLENYCN Target 92 110 FALSE
+EQCCTSICSLYQLENYCN Target 93 110 FALSE
+QCCTSICSLYQLENYCN Target 94 110 FALSE
+CCTSICSLYQLENYCN Target 95 110 FALSE
+GIVEQCCTSICSLYQLENYC Target 90 109 FALSE
+GIVEQCCTSICSLYQLENY Target 90 108 FALSE
+GIVEQCCTSICSLYQLEN Target 90 107 FALSE
+GIVEQCCTSICSLYQLE Target 90 106 FALSE
+GIVEQCCTSICSLYQL Target 90 105 FALSE
+FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 25 110 FALSE
+MNCYNELQYLSCISTCCQEVIGRK Decoy 1 24 TRUE
+QLSGELALPQLSGAGPGGGLEVQGVQLDEA Decoy 25 54 FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 57 87 FALSE
+APDPGWLALLALLPLLRMWLA Decoy 90 110 FALSE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 1 110 TRUE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 2 110 FALSE
+CYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 3 110 FALSE
+YNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 4 110 FALSE
+NELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 5 110 FALSE
+ELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 6 110 FALSE
+LQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 7 110 FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWL Decoy 2 109 FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMW Decoy 2 108 FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRM Decoy 2 107 FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLR Decoy 2 106 FALSE
+NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLL Decoy 2 105 FALSE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWL Decoy 1 109 TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMW Decoy 1 108 TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRM Decoy 1 107 TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLR Decoy 1 106 TRUE
+MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLL Decoy 1 105 TRUE
+NCYNELQYLSCISTCCQEVIGRK Decoy 2 24 FALSE
+CYNELQYLSCISTCCQEVIGRK Decoy 3 24 FALSE
+YNELQYLSCISTCCQEVIGRK Decoy 4 24 FALSE
+NELQYLSCISTCCQEVIGRK Decoy 5 24 FALSE
+ELQYLSCISTCCQEVIGRK Decoy 6 24 FALSE
+LQYLSCISTCCQEVIGRK Decoy 7 24 FALSE
+NCYNELQYLSCISTCCQEVIGR Decoy 2 23 FALSE
+NCYNELQYLSCISTCCQEVIG Decoy 2 22 FALSE
+NCYNELQYLSCISTCCQEVI Decoy 2 21 FALSE
+NCYNELQYLSCISTCCQEV Decoy 2 20 FALSE
+NCYNELQYLSCISTCCQE Decoy 2 19 FALSE
+MNCYNELQYLSCISTCCQEVIGR Decoy 1 23 TRUE
+MNCYNELQYLSCISTCCQEVIG Decoy 1 22 TRUE
+MNCYNELQYLSCISTCCQEVI Decoy 1 21 TRUE
+MNCYNELQYLSCISTCCQEV Decoy 1 20 TRUE
+MNCYNELQYLSCISTCCQE Decoy 1 19 TRUE
+LSGELALPQLSGAGPGGGLEVQGVQLDEA Decoy 26 54 FALSE
+SGELALPQLSGAGPGGGLEVQGVQLDEA Decoy 27 54 FALSE
+GELALPQLSGAGPGGGLEVQGVQLDEA Decoy 28 54 FALSE
+ELALPQLSGAGPGGGLEVQGVQLDEA Decoy 29 54 FALSE
+LALPQLSGAGPGGGLEVQGVQLDEA Decoy 30 54 FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQLDE Decoy 25 53 FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQLD Decoy 25 52 FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQL Decoy 25 51 FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQ Decoy 25 50 FALSE
+QLSGELALPQLSGAGPGGGLEVQGV Decoy 25 49 FALSE
+TKPTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 58 87 FALSE
+KPTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 59 87 FALSE
+PTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 60 87 FALSE
+TYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 61 87 FALSE
+YFFGREGCVLYLAEVLHSGCLHQNVF Decoy 62 87 FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQNV Decoy 57 86 FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQN Decoy 57 85 FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLHQ Decoy 57 84 FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCLH Decoy 57 83 FALSE
+RTKPTYFFGREGCVLYLAEVLHSGCL Decoy 57 82 FALSE
+PDPGWLALLALLPLLRMWLA Decoy 91 110 FALSE
+DPGWLALLALLPLLRMWLA Decoy 92 110 FALSE
+PGWLALLALLPLLRMWLA Decoy 93 110 FALSE
+GWLALLALLPLLRMWLA Decoy 94 110 FALSE
+WLALLALLPLLRMWLA Decoy 95 110 FALSE
+APDPGWLALLALLPLLRMWL Decoy 90 109 FALSE
+APDPGWLALLALLPLLRMW Decoy 90 108 FALSE
+APDPGWLALLALLPLLRM Decoy 90 107 FALSE
+APDPGWLALLALLPLLR Decoy 90 106 FALSE
+APDPGWLALLALLPLL Decoy 90 105 FALSE
+QLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 25 110 FALSE
diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj
index 59c393b39..7f0449d12 100644
--- a/mzLib/Test/Test.csproj
+++ b/mzLib/Test/Test.csproj
@@ -262,6 +262,9 @@
Always
+
+ Always
+
Always
diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs
index ccc6d950e..d235759ed 100644
--- a/mzLib/Test/TestPeptideWithSetMods.cs
+++ b/mzLib/Test/TestPeptideWithSetMods.cs
@@ -1046,67 +1046,177 @@ public static void TestPeptideWithSetModsReturnsTruncationsInTopDown()
List insulinTruncations = insulin.Digest(new DigestionParams(protease: protease.Name), new List(), new List(), topDownTruncationSearch: true).ToList();
Assert.AreEqual(68, insulinTruncations.Count);
}
-
[Test]
public static void TestPeptideWithSetModsReturnsDecoyTruncationsInTopDown()
{
+ // PURPOSE
+ // Generate a comprehensive TSV of all top-down truncation peptides for target and decoy insulin entries,
+ // then compare the result against the stored expected file for regression checking.
+ //
+ // OUTPUT COLUMNS
+ // - Sequence: peptide/proteoform base sequence
+ // - Type: "Target" or "Decoy" based on parent protein
+ // - Begin: one-based start residue within the parent protein
+ // - End: one-based end residue within the parent protein
+ // - RetainedMethionine: TRUE when the peptide includes the protein’s N-terminal Met (Begin == 1 and Parent.BaseSequence[0] == 'M'), else FALSE
+
+ // Arrange: load insulin with reverse decoys and truncations enabled
string xmlDatabase = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", "humanInsulin.xml");
- List insulinProteins = ProteinDbLoader.LoadProteinXML(xmlDatabase, true,
- DecoyType.Reverse, null, false, null, out var unknownModifications, addTruncations: true);
+ List insulinProteins = ProteinDbLoader.LoadProteinXML(
+ xmlDatabase,
+ generateTargets: true,
+ decoyType: DecoyType.Reverse,
+ allKnownModifications: null,
+ isContaminant: false,
+ modTypesToExclude: null,
+ unknownModifications: out var unknownModifications,
+ addTruncations: true);
+
+ Assert.That(insulinProteins.Any(p => !p.IsDecoy), "Expected at least one target protein");
+ Assert.That(insulinProteins.Any(p => p.IsDecoy), "Expected at least one decoy protein");
+ Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from insulin XML");
+
+ // Digest: enumerate truncations for a representative target/decoy pair (parity sanity)
+ static string Reverse(string s) => new string(s.Reverse().ToArray());
+ var target = insulinProteins.First(p => !p.IsDecoy);
+ string expectedDecoySeq = target.BaseSequence.Length > 0 && target.BaseSequence[0] == 'M'
+ ? "M" + Reverse(target.BaseSequence.Substring(1))
+ : Reverse(target.BaseSequence);
+ var decoy = insulinProteins.FirstOrDefault(p => p.IsDecoy && p.BaseSequence == expectedDecoySeq)
+ ?? insulinProteins.First(p => p.IsDecoy && p.Length == target.Length);
+
+ Assert.IsFalse(string.IsNullOrWhiteSpace(target.Accession));
+ Assert.IsTrue(decoy.Accession.StartsWith("DECOY_"));
+ Assert.AreEqual(target.Length, decoy.Length);
+ Assert.AreNotEqual(target.BaseSequence, decoy.BaseSequence);
+ Assert.AreEqual(expectedDecoySeq, decoy.BaseSequence, "Decoy must follow 'retain M, reverse remainder' rule");
+
+ var dp = new DigestionParams(protease: "top-down");
+ List targetTruncs = target
+ .Digest(dp, new List(), new List(), topDownTruncationSearch: true)
+ .Cast().ToList();
+ List decoyTruncs = decoy
+ .Digest(dp, new List(), new List(), topDownTruncationSearch: true)
+ .Cast().ToList();
+
+ // Parity and sanity checks for the selected pair
+ Assert.AreEqual(68, targetTruncs.Count, "Target should yield 68 truncation products in top-down mode");
+ Assert.AreEqual(68, decoyTruncs.Count, "Decoy should yield 68 truncation products in top-down mode");
+ Assert.That(targetTruncs.All(p => p.DigestionParams?.DigestionAgent?.Name == "top-down"));
+ Assert.That(decoyTruncs.All(p => p.DigestionParams?.DigestionAgent?.Name == "top-down"));
+ Assert.AreEqual(targetTruncs.Count, targetTruncs.Select(p => p.BaseSequence).Distinct().Count());
+ Assert.AreEqual(decoyTruncs.Count, decoyTruncs.Select(p => p.BaseSequence).Distinct().Count());
+ Assert.IsTrue(targetTruncs.Any(p => p.BaseSequence == target.BaseSequence));
+ Assert.IsTrue(decoyTruncs.Any(p => p.BaseSequence == decoy.BaseSequence));
+
+ // Build the table rows
+ static bool HasRetainedMet(PeptideWithSetModifications p) =>
+ p.OneBasedStartResidueInProtein == 1 &&
+ p.Parent?.BaseSequence?.Length > 0 &&
+ p.Parent.BaseSequence[0] == 'M';
+
+ // We only compare the combined truncations for the chosen target/decoy in this test (68 + 68 rows)
+ var rows = targetTruncs.Concat(decoyTruncs)
+ .Select(pep =>
+ {
+ bool isDecoy = (pep.Parent as Protein)?.IsDecoy == true;
+ string type = isDecoy ? "Decoy" : "Target";
+ string retained = HasRetainedMet(pep) ? "TRUE" : "FALSE"; // normalize to match expected
+ return string.Join("\t", new[]
+ {
+ pep.BaseSequence,
+ type,
+ pep.OneBasedStartResidueInProtein.ToString(),
+ pep.OneBasedEndResidueInProtein.ToString(),
+ retained
+ });
+ })
+ // Sort deterministically to avoid platform/iteration-order differences
+ .OrderBy(r => r, StringComparer.Ordinal)
+ .ToList();
+
+ var header = "Sequence\tType\tBegin\tEnd\tRetainedMethionine";
+ var outputLines = new List(capacity: rows.Count + 1) { header };
+ outputLines.AddRange(rows);
+
+ // Persist the generated table for inspection
+ string workPath = Path.Combine(TestContext.CurrentContext.WorkDirectory, "topdown_truncations_table.tsv");
+ File.WriteAllLines(workPath, outputLines);
+ Console.WriteLine($"Generated truncation table: {workPath}");
+
+ // Load expected and compare
+ string expectedPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "truncationsExpected.tsv");
+ Assert.That(File.Exists(expectedPath), $"Expected file not found: {expectedPath}");
+ var expectedAll = File.ReadAllLines(expectedPath)
+ .Where(l => l is not null)
+ .Select(l => l.TrimEnd('\r', '\n'))
+ .ToList();
+
+ Assert.That(expectedAll.Count > 0, "Expected file is empty");
+ string expectedHeader = expectedAll[0];
+ var expectedRows = expectedAll.Skip(1)
+ .Where(l => !string.IsNullOrWhiteSpace(l))
+ .OrderBy(l => l, StringComparer.Ordinal)
+ .ToList();
+
+ // Header check
+ if (!string.Equals(header, expectedHeader, StringComparison.Ordinal))
+ {
+ TestContext.Out.WriteLine($"Header mismatch:");
+ TestContext.Out.WriteLine($" Expected: {expectedHeader}");
+ TestContext.Out.WriteLine($" Actual : {header}");
+ }
- Protease protease = new Protease("top-down", CleavageSpecificity.None, "", "", new List(), null);
- List insulintTargetTruncations = insulinProteins.Where(p=>!p.IsDecoy).First().Digest(new DigestionParams(protease: protease.Name), new List(), new List(), topDownTruncationSearch: true).ToList();
- Assert.AreEqual(68, insulintTargetTruncations.Count);
- List insulintDecoyTruncations = insulinProteins.Where(p => p.IsDecoy).First().Digest(new DigestionParams(protease: protease.Name), new List(), new List(), topDownTruncationSearch: true).ToList();
- Assert.AreEqual(68, insulintDecoyTruncations.Count);
- }
+ // Multiset comparison for rows (counts of duplicates matter)
+ static Dictionary ToCounts(IEnumerable lines)
+ => lines.GroupBy(x => x, StringComparer.Ordinal)
+ .ToDictionary(g => g.Key, g => g.Count(), StringComparer.Ordinal);
- [Test]
- public static void CheckFullChemicalFormula()
- {
- PeptideWithSetModifications small_pep = new PeptideWithSetModifications(new Protein("PEPTIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, new Dictionary(), 0, null);
- ChemicalFormula small_pep_cf = ChemicalFormula.ParseFormula("C34H53N7O15");
- Assert.AreEqual(small_pep.FullChemicalFormula, small_pep_cf);
+ var expCounts = ToCounts(expectedRows);
+ var gotCounts = ToCounts(rows);
- PeptideWithSetModifications large_pep = new PeptideWithSetModifications(new Protein("PEPTIDEKRNSPEPTIDEKECUEIRQUV", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 28, CleavageSpecificity.Full, null, 0, new Dictionary(), 0, null);
- ChemicalFormula large_pep_cf = ChemicalFormula.ParseFormula("C134H220N38O50S1Se2");
- Assert.AreEqual(large_pep.FullChemicalFormula, large_pep_cf);
+ var missing = new List(); // in expected more times than in actual
+ var extra = new List(); // in actual more times than in expected
+
+ foreach (var kv in expCounts)
+ {
+ gotCounts.TryGetValue(kv.Key, out int got);
+ if (got < kv.Value)
+ {
+ int deficit = kv.Value - got;
+ for (int i = 0; i < deficit; i++) missing.Add(kv.Key);
+ }
+ }
+ foreach (var kv in gotCounts)
+ {
+ expCounts.TryGetValue(kv.Key, out int exp);
+ if (kv.Value > exp)
+ {
+ int surplus = kv.Value - exp;
+ for (int i = 0; i < surplus; i++) extra.Add(kv.Key);
+ }
+ }
- ModificationMotif.TryGetMotif("S", out ModificationMotif motif_s);
- Modification phosphorylation = new Modification(_originalId: "phospho", _modificationType: "CommonBiological", _target: motif_s, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H1O3P1"));
- Dictionary modDict_small = new Dictionary();
- modDict_small.Add(4, phosphorylation);
+ if (missing.Count == 0 && extra.Count == 0)
+ {
+ TestContext.Out.WriteLine("Top-down truncation table matches expected.");
+ TestContext.Out.WriteLine("Sample (first 5 rows):");
+ foreach (var l in outputLines.Take(6)) TestContext.Out.WriteLine(l);
+ }
+ else
+ {
+ TestContext.Out.WriteLine("Top-down truncation table differs from expected.");
+ TestContext.Out.WriteLine($"Missing rows (expected but not found or under-counted): {missing.Count}");
+ foreach (var l in missing.Take(20)) TestContext.Out.WriteLine($" MISSING: {l}");
+ if (missing.Count > 20) TestContext.Out.WriteLine($" ...and {missing.Count - 20} more");
- PeptideWithSetModifications small_pep_mod = new PeptideWithSetModifications(new Protein("PEPSIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, modDict_small, 0, null);
- ChemicalFormula small_pep_mod_cf = ChemicalFormula.ParseFormula("C33H52N7O18P1");
- Assert.AreEqual(small_pep_mod.FullChemicalFormula, small_pep_mod_cf);
+ TestContext.Out.WriteLine($"Extra rows (found but not expected or over-counted): {extra.Count}");
+ foreach (var l in extra.Take(20)) TestContext.Out.WriteLine($" EXTRA: {l}");
+ if (extra.Count > 20) TestContext.Out.WriteLine($" ...and {extra.Count - 20} more");
- ModificationMotif.TryGetMotif("K", out ModificationMotif motif_k);
- Modification acetylation = new Modification(_originalId: "acetyl", _modificationType: "CommonBiological", _target: motif_k, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3O"));
- Dictionary modDict_large = new Dictionary();
- modDict_large.Add(4, phosphorylation);
- modDict_large.Add(11, phosphorylation);
- modDict_large.Add(8, acetylation);
-
- PeptideWithSetModifications large_pep_mod = new PeptideWithSetModifications(new Protein("PEPSIDEKRNSPEPTIDEKECUEIRQUV", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 28, CleavageSpecificity.Full, null, 0, modDict_large, 0, null);
- ChemicalFormula large_pep_mod_cf = ChemicalFormula.ParseFormula("C135H223N38O57P2S1Se2");
- Assert.AreEqual(large_pep_mod.FullChemicalFormula, large_pep_mod_cf);
-
- ModificationMotif.TryGetMotif("C", out var motif_c);
- ModificationMotif.TryGetMotif("G", out var motif_g);
- Dictionary modDict =
- new()
- {
- { "Carbamidomethyl on C", new Modification(_originalId: "Carbamidomethyl", _modificationType: "Common Fixed",
- _target: motif_c, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3ON")) },
- { "BS on G" , new Modification(_originalId: "BS on G", _modificationType: "BS", _target: motif_g, _monoisotopicMass: 96.0875)}
- };
- PeptideWithSetModifications pwsmWithMissingCfMods = new PeptideWithSetModifications(
- "ENQGDETQG[Speculative:BS on G]C[Common Fixed:Carbamidomethyl on C]PPQR", modDict, p: new Protein("ENQGDETQGCPPQR", "FakeProtein"), digestionParams: new DigestionParams(),
- oneBasedStartResidueInProtein: 1, oneBasedEndResidueInProtein: 14);
- Assert.Null(pwsmWithMissingCfMods.FullChemicalFormula);
+ Assert.Fail($"Generated top-down truncation table does not match expected.\nExpected file: {expectedPath}\nActual file: {workPath}\nMissing: {missing.Count}, Extra: {extra.Count}");
+ }
}
-
[Test]
public static void CheckMostAbundantMonoisotopicMass()
{
diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
index c83c80aa1..87264702e 100644
--- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
+++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs
@@ -239,34 +239,34 @@ private static List ReverseSequenceVariations(IEnumerable 1 || sv.VariantSequence.Length > 1))
{
string original = new string(originalArray).Substring(0, originalArray.Length - 1);
string variant = new string(variationArray).Substring(0, variationArray.Length - 1);
- decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+ decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
}
// gained an initiating methionine
else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1)
{
- decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+ decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
}
// starting methionine, but no variations on it
else if (startsWithM)
{
- decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+ decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
}
// no starting methionine
else
{
- decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications));
+ decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications));
}
}
return decoyVariations;
@@ -335,7 +335,7 @@ private static List GenerateSlideDecoys(List proteins, int max
{
variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, true)];
}
- decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.Description));
+ decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.VariantCallFormatDataString));
}
else
{
@@ -352,7 +352,7 @@ private static List GenerateSlideDecoys(List proteins, int max
variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, initMet)];
}
- decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.Description));
+ decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString));
}
}
var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide,
diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
index cc7723c15..3786c969d 100644
--- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
+++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs
@@ -87,7 +87,7 @@ private static List GenerateReverseDecoys(List nucleicAcids, int maxThr
var reverseModKey = indexMapping[modKvp.Key];
reverseModificationsForVariation.Add(reverseModKey, modKvp.Value);
}
- reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation));
+ reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.VariantCallFormatDataString.Description, reverseModificationsForVariation));
}
// Reverse Applied Variants
@@ -101,7 +101,7 @@ private static List GenerateReverseDecoys(List nucleicAcids, int maxThr
var reverseModKey = indexMapping[modKvp.Key];
reverseModificationsForVariation.Add(reverseModKey, modKvp.Value);
}
- reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation));
+ reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.VariantCallFormatDataString.Description, reverseModificationsForVariation));
}
// Reverse Truncations
diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
index 24ff44dd4..5eaa9897a 100644
--- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
+++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs
@@ -206,7 +206,7 @@ public static Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary element, if present
private List<(int, string)> AnnotatedMods = new List<(int position, string originalModificationID)>();
private List<(int, string)> AnnotatedVariantMods = new List<(int position, string originalModificationID)>();
+ // Captured isoform/sequence identifier from
+ private string LocationSequenceId;
///
- /// Start parsing a protein XML element
+ /// Finalizes the parsing of a protein XML entry and constructs a object.
+ /// This method is called when the end of an <entry> element is reached during XML parsing.
+ /// It sanitizes the sequence, prunes out-of-range sequence variants, resolves and attaches modifications,
+ /// and aggregates all parsed data (such as gene names, proteolysis products, sequence variations, disulfide bonds, and splice sites)
+ /// into a new instance.
+ /// After construction, the internal state is cleared to prepare for the next entry.
///
+ /// The positioned at the end of the <entry> element.
+ /// Indicates whether the protein is a contaminant.
+ /// The file path or identifier of the protein database source.
+ /// A collection of modification types to exclude from the protein.
+ /// A dictionary to collect modifications that could not be resolved.
+ /// A string used to identify decoy proteins (default: "DECOY").
+ ///
+ /// A constructed object containing all parsed and resolved information,
+ /// or null if the entry is incomplete.
+ ///
public void ParseElement(string elementName, XmlReader xml)
{
int outValue;
@@ -137,7 +154,9 @@ public void ParseElement(string elementName, XmlReader xml)
PropertyTypes.Add(xml.GetAttribute("type"));
PropertyValues.Add(xml.GetAttribute("value"));
break;
-
+ case "location":
+ LocationSequenceId = xml.GetAttribute("sequence");
+ break;
case "position":
OneBasedFeaturePosition = int.Parse(xml.GetAttribute("position"));
break;
@@ -161,9 +180,12 @@ public void ParseElement(string elementName, XmlReader xml)
}
///
- /// Parses the attributes of the current element from the provided XmlReader.
- /// Extracts and stores the values for dataset, created, modified, version, and xmlns attributes.
+ /// Parses and stores key metadata attributes from the current <entry> element in the XML.
+ /// This includes dataset, creation date, modification date, version, and XML namespace information.
+ /// The extracted values are assigned to the corresponding properties of the instance.
+ /// This method is typically called when the parser encounters the start of a protein entry in a UniProt or similar XML file.
///
+ /// The positioned at the <entry> element whose attributes are to be read.
private void ParseEntryAttributes(XmlReader xml)
{
DatasetEntryTag = xml.GetAttribute("dataset");
@@ -268,10 +290,12 @@ private static UniProtSequenceAttributes.FragmentType ParseFragmentType(string f
return UniProtSequenceAttributes.FragmentType.unspecified;
}
- // Helper method to compute the monoisotopic mass of a sequence.
///
- /// Computes the monoisotopic mass of the given sequence.
- /// Returns 0 if the sequence is empty.
+ /// Computes the monoisotopic mass of a protein or nucleic acid sequence without modifications.
+ /// If the input sequence is null or empty, returns 0.
+ /// Internally, constructs a using the provided sequence and an empty modification dictionary,
+ /// then returns the rounded monoisotopic mass as an integer.
+ /// This method is used to populate sequence attributes such as mass during XML parsing.
///
private static int ComputeSequenceMass(string sequence)
{
@@ -280,8 +304,25 @@ private static int ComputeSequenceMass(string sequence)
return (int)Math.Round(new PeptideWithSetModifications(sequence, new Dictionary()).MonoisotopicMass);
}
///
- /// Finish parsing at the end of an element
+ /// Handles the end of an XML element during protein database parsing, updating the internal state or finalizing objects as needed.
+ /// Depending on the element name, this method processes and stores feature, subfeature, database reference, gene, and organism information,
+ /// or, if the end of an <entry> element is reached, constructs and returns a fully populated object.
+ /// For <feature> and <subfeature> elements, it attaches modifications or proteolytic products.
+ /// For <dbReference>, it records database cross-references.
+ /// For <gene> and <organism>, it updates parsing state flags.
+ /// For <entry>, it aggregates all parsed data, resolves modifications, and returns a new instance,
+ /// clearing the internal state for the next entry.
///
+ /// The positioned at the end of the current XML element.
+ /// A collection of modification types to exclude from the protein.
+ /// A dictionary to collect modifications that could not be resolved.
+ /// Indicates whether the protein is a contaminant.
+ /// The file path or identifier of the protein database source.
+ /// A string used to identify decoy proteins (default: "DECOY").
+ ///
+ /// A constructed object if the end of an <entry> element is reached and all required data is present;
+ /// otherwise, null.
+ ///
public Protein ParseEndElement(XmlReader xml, IEnumerable modTypesToExclude, Dictionary unknownModifications,
bool isContaminant, string proteinDbLocation, string decoyIdentifier = "DECOY")
{
@@ -312,7 +353,26 @@ public Protein ParseEndElement(XmlReader xml, IEnumerable modTypesToExcl
}
return protein;
}
-
+ ///
+ /// Handles the end of an XML element during RNA database parsing, updating the internal state or finalizing objects as needed.
+ /// Depending on the element name, this method processes and stores feature, subfeature, and database reference information,
+ /// or, if the end of an <entry> element is reached, constructs and returns a fully populated object.
+ /// For <feature> and <subfeature> elements, it attaches modifications or truncation products.
+ /// For <dbReference>, it records database cross-references.
+ /// For <gene> and <organism>, it updates parsing state flags.
+ /// For <entry>, it aggregates all parsed data, resolves modifications, and returns a new instance,
+ /// clearing the internal state for the next entry.
+ ///
+ /// The positioned at the end of the current XML element.
+ /// A collection of modification types to exclude from the RNA.
+ /// A dictionary to collect modifications that could not be resolved.
+ /// Indicates whether the RNA is a contaminant.
+ /// The file path or identifier of the RNA database source.
+ /// A string used to identify decoy RNAs (default: "DECOY").
+ ///
+ /// A constructed object if the end of an <entry> element is reached and all required data is present;
+ /// otherwise, null.
+ ///
internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable modTypesToExclude,
Dictionary unknownModifications,
bool isContaminant, string rnaDbLocation, string decoyIdentifier = "DECOY")
@@ -346,8 +406,30 @@ internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable modTypesToExc
}
///
- /// Finish parsing an entry
+ /// Finalizes the parsing of a protein XML entry and constructs a object from the accumulated data.
+ /// This method is called when the end of an <entry> element is reached during XML parsing.
+ /// It performs several key tasks:
+ ///
+ /// - Sanitizes the parsed sequence (e.g., replacing invalid amino acids with 'X').
+ /// - Prunes any sequence variants whose coordinates exceed the sequence length.
+ /// - Resolves and attaches all annotated modifications, excluding those of specified types or unknowns.
+ /// - Determines if the protein is a decoy based on the accession and decoy identifier.
+ /// - Aggregates all parsed data (gene names, proteolysis products, sequence variations, disulfide bonds, splice sites, database references, and sequence attributes) into a new instance.
+ /// - Clears the internal state of the to prepare for parsing the next entry.
+ ///
+ /// If either the accession or sequence is missing, returns null.
///
+ /// The positioned at the end of the <entry> element.
+ /// Indicates whether the protein is a contaminant.
+ /// The file path or identifier of the protein database source.
+ /// A collection of modification types to exclude from the protein.
+ /// A dictionary to collect modifications that could not be resolved.
+ /// A string used to identify decoy proteins (default: "DECOY").
+ ///
+ /// A constructed object containing all parsed and resolved information,
+ /// or null if the entry is incomplete.
+ ///
+
public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string proteinDbLocation, IEnumerable modTypesToExclude, Dictionary unknownModifications, string decoyIdentifier = "DECOY")
{
Protein result = null;
@@ -359,6 +441,8 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr
Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X');
ParseAnnotatedMods(OneBasedModifications, modTypesToExclude, unknownModifications, AnnotatedMods);
+ //prune any sequence variants whose coordinates exceed the known sequence length
+ PruneOutOfRangeSequenceVariants();
if (Accession.StartsWith(decoyIdentifier))
{
isDecoy = true;
@@ -370,7 +454,30 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr
Clear();
return result;
}
-
+ ///
+ /// Finalizes the parsing of an RNA XML entry and constructs an object from the accumulated data.
+ /// This method is called when the end of an <entry> element is reached during XML parsing for RNA records.
+ /// It performs several key tasks:
+ ///
+ /// - Sanitizes the parsed sequence (e.g., replacing invalid characters with 'X').
+ /// - Prunes any sequence variants whose coordinates exceed the sequence length.
+ /// - Resolves and attaches all annotated modifications, excluding those of specified types or unknowns.
+ /// - Determines if the RNA is a decoy based on the accession and decoy identifier.
+ /// - Aggregates all parsed data (gene names, proteolysis products, sequence variations, and other metadata) into a new instance.
+ /// - Clears the internal state of the to prepare for parsing the next entry.
+ ///
+ /// If either the accession or sequence is missing, returns null.
+ ///
+ /// The positioned at the end of the <entry> element.
+ /// Indicates whether the RNA is a contaminant.
+ /// The file path or identifier of the RNA database source.
+ /// A collection of modification types to exclude from the RNA.
+ /// A dictionary to collect modifications that could not be resolved.
+ /// A string used to identify decoy RNAs (default: "DECOY").
+ ///
+ /// A constructed object containing all parsed and resolved information,
+ /// or null if the entry is incomplete.
+ ///
internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string rnaDbLocation,
IEnumerable modTypesToExclude, Dictionary unknownModifications, string decoyIdentifier = "DECOY")
{
@@ -381,6 +488,8 @@ internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string r
// sanitize the sequence to replace unexpected characters with X (unknown amino acid)
// sometimes strange characters get added by RNA sequencing software, etc.
Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X');
+ //prune any sequence variants whose coordinates exceed the known sequence length
+ PruneOutOfRangeSequenceVariants();
if (Accession.StartsWith(decoyIdentifier))
{
isDecoy = true;
@@ -407,8 +516,19 @@ public void ParseSubFeatureEndElement(XmlReader xml, IEnumerable modType
}
///
- /// Finish parsing a feature element
+ /// Processes the end of a <feature> element during XML parsing and updates the internal state with the parsed feature information.
+ /// Depending on the feature type, this method:
+ ///
+ /// - Adds modification annotations for "modified residue" and "lipid moiety-binding region" features.
+ /// - Creates and adds objects for proteolytic features such as "peptide", "propeptide", "chain", and "signal peptide".
+ /// - Handles "sequence variant" features by creating objects, including variant-specific modifications, and ensures they apply to the correct sequence or isoform.
+ /// - Creates and adds or objects for their respective feature types, using available position information.
+ ///
+ /// After processing, resets feature-related state variables to prepare for the next feature.
///
+ /// The positioned at the end of the <feature> element.
+ /// A collection of modification types to exclude from the protein.
+ /// A dictionary to collect modifications that could not be resolved.
public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesToExclude, Dictionary unknownModifications)
{
if (FeatureType == "modified residue")
@@ -445,24 +565,56 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesTo
}
else
{
- type += ("null-null");
+ type += "null-null";
}
}
ProteolysisProducts.Add(new TruncationProduct(OneBasedBeginPosition, OneBasedEndPosition, type));
}
- else if (FeatureType == "sequence variant" && VariationValue != null && VariationValue != "") // Only keep if there is variant sequence information and position information
+ else if (FeatureType == "sequence variant" && VariationValue != null && VariationValue != "")
{
- ParseAnnotatedMods(OneBasedVariantModifications, modTypesToExclude, unknownModifications, AnnotatedVariantMods);
- if (OneBasedBeginPosition != null && OneBasedEndPosition != null)
+ bool appliesToThisSequence = true;
+ if (!string.IsNullOrEmpty(LocationSequenceId))
{
- SequenceVariations.Add(new SequenceVariation((int)OneBasedBeginPosition, (int)OneBasedEndPosition, OriginalValue, VariationValue, FeatureDescription, OneBasedVariantModifications));
+ string acc = Accession ?? "";
+ appliesToThisSequence =
+ LocationSequenceId.Equals(acc, StringComparison.OrdinalIgnoreCase)
+ || (!string.IsNullOrEmpty(acc) && LocationSequenceId.Equals($"{acc}-1", StringComparison.OrdinalIgnoreCase));
}
- else if (OneBasedFeaturePosition >= 1)
+
+ if (appliesToThisSequence)
{
- SequenceVariations.Add(new SequenceVariation(OneBasedFeaturePosition, OriginalValue, VariationValue, FeatureDescription, OneBasedVariantModifications));
+ ParseAnnotatedMods(OneBasedVariantModifications, modTypesToExclude, unknownModifications, AnnotatedVariantMods);
+
+ // NOTE: We can NOT validate coordinate vs sequence length here because sequence is usually parsed later.
+ // Validation is deferred to PruneOutOfRangeSequenceVariants() during ParseEntryEndElement.
+
+ if (OneBasedBeginPosition != null && OneBasedEndPosition != null)
+ {
+ SequenceVariations.Add(
+ new SequenceVariation(
+ (int)OneBasedBeginPosition,
+ (int)OneBasedEndPosition,
+ OriginalValue,
+ VariationValue,
+ FeatureDescription,
+ //variantCallFormatDataString: null,
+ oneBasedModifications: OneBasedVariantModifications));
+ }
+ else if (OneBasedFeaturePosition >= 1)
+ {
+ SequenceVariations.Add(
+ new SequenceVariation(
+ OneBasedFeaturePosition,
+ OriginalValue,
+ VariationValue,
+ FeatureDescription,
+ //variantCallFormatDataString: null,
+ oneBasedModifications: OneBasedVariantModifications));
+ }
+
+ AnnotatedVariantMods = new List<(int, string)>();
+ OneBasedVariantModifications = new Dictionary>();
}
- AnnotatedVariantMods = new List<(int, string)>();
- OneBasedVariantModifications = new Dictionary>();
}
else if (FeatureType == "disulfide bond")
{
@@ -491,64 +643,7 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesTo
OneBasedFeaturePosition = -1;
OriginalValue = "";
VariationValue = "";
- }
-
- private static void ParseAnnotatedMods(Dictionary> destination, IEnumerable modTypesToExclude,
- Dictionary unknownModifications, List<(int, string)> annotatedMods)
- {
- foreach (var annotatedMod in annotatedMods)
- {
- string annotatedId = annotatedMod.Item2;
- int annotatedModLocation = annotatedMod.Item1;
-
- if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod)
- || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod))
- {
- // if the list of known mods contains this IdWithMotif
- if (!modTypesToExclude.Contains(foundMod.ModificationType))
- {
- if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
- {
- listOfModsAtThisLocation.Add(foundMod);
- }
- else
- {
- destination.Add(annotatedModLocation, new List { foundMod });
- }
- }
- // else - the mod ID was found but the motif didn't fit the annotated location
- }
-
- // no known mod - try looking it up in the dictionary of mods without motif appended
- else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods)
- || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods))
- {
- foreach (Modification mod in mods)
- {
- if (!modTypesToExclude.Contains(mod.ModificationType))
- {
- if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
- {
- listOfModsAtThisLocation.Add(mod);
- }
- else
- {
- destination.Add(annotatedModLocation, new List { mod });
- }
- break;
- }
- }
- }
- else
- {
- // could not find the annotated mod's ID in our list of known mods - it's an unknown mod
- // I don't think this really does anything...
- if (!unknownModifications.ContainsKey(annotatedId))
- {
- unknownModifications.Add(annotatedId, new Modification(annotatedId));
- }
- }
- }
+ LocationSequenceId = null;
}
///
@@ -604,6 +699,86 @@ private void Clear()
GeneNames = new List>();
ReadingGene = false;
ReadingOrganism = false;
+ LocationSequenceId = null;
+ AnnotatedVariantMods = new List<(int, string)>();
+ OneBasedVariantModifications = new Dictionary>();
+ }
+ ///
+ /// Resolves and attaches annotated modifications to the specified destination dictionary based on parsed feature or variant annotations.
+ /// For each annotated modification, attempts to look up the modification by its identifier (with motif) in both protein and RNA modification dictionaries.
+ /// If found and not excluded by , the modification is added to the destination at the specified position.
+ /// If not found by identifier, attempts to resolve the modification by possible matches (without motif) and adds the first non-excluded match.
+ /// If no match is found, records the modification as unknown in to avoid repeated warnings.
+ /// This method is used to populate the protein or variant modification dictionaries during XML parsing.
+ ///
+ /// Dictionary mapping one-based positions to lists of modifications to be populated.
+ /// A collection of modification types to exclude from assignment.
+ /// A dictionary to collect modifications that could not be resolved by identifier or type.
+ /// List of (position, modification identifier) tuples parsed from XML features or subfeatures.
+ private static void ParseAnnotatedMods(
+ Dictionary> destination,
+ IEnumerable modTypesToExclude,
+ Dictionary unknownModifications,
+ List<(int position, string originalModificationID)> annotatedMods)
+ {
+ foreach (var (annotatedModLocation, annotatedId) in annotatedMods)
+ {
+ if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod)
+ || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod))
+ {
+ if (!modTypesToExclude.Contains(foundMod.ModificationType))
+ {
+ if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
+ {
+ listOfModsAtThisLocation.Add(foundMod);
+ }
+ else
+ {
+ destination.Add(annotatedModLocation, new List { foundMod });
+ }
+ }
+ }
+ else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods)
+ || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods))
+ {
+ foreach (Modification mod in mods)
+ {
+ if (!modTypesToExclude.Contains(mod.ModificationType))
+ {
+ if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation))
+ {
+ listOfModsAtThisLocation.Add(mod);
+ }
+ else
+ {
+ destination.Add(annotatedModLocation, new List { mod });
+ }
+ break;
+ }
+ }
+ }
+ else
+ {
+ if (!unknownModifications.ContainsKey(annotatedId))
+ {
+ unknownModifications.Add(annotatedId, new Modification(annotatedId));
+ }
+ }
+ }
+ }
+ private void PruneOutOfRangeSequenceVariants()
+ {
+ if (string.IsNullOrEmpty(Sequence) || SequenceVariations.Count == 0)
+ return;
+
+ int len = Sequence.Length;
+ int removed = SequenceVariations.RemoveAll(v =>
+ v.OneBasedBeginPosition > len || v.OneBasedEndPosition > len);
+
+ if (removed > 0)
+ {
+ Trace.TraceWarning($"Pruned {removed} out-of-range sequence variant(s) for accession {Accession} (protein length {len}).");
+ }
}
}
}
\ No newline at end of file