diff --git a/mzLib/Omics/BioPolymer/SequenceVariation.cs b/mzLib/Omics/BioPolymer/SequenceVariation.cs index c3c4ef9e2..3b5ef13f0 100644 --- a/mzLib/Omics/BioPolymer/SequenceVariation.cs +++ b/mzLib/Omics/BioPolymer/SequenceVariation.cs @@ -4,69 +4,216 @@ namespace Omics.BioPolymer { + /// + /// Represents a contiguous sequence variation on a 1-based, inclusive coordinate system. + /// A variation spans .. in the parent sequence, + /// replaces the (which may be empty for an insertion) with + /// (which may be empty for a deletion), and can optionally carry site-specific . + /// When available, a parsed is attached via . + /// + /// Typical interpretations: + /// - Substitution: non-empty and non-empty of equal length. + /// - Insertion: empty and non-empty . + /// - Deletion: non-empty and empty . + /// public class SequenceVariation { /// - /// For longer sequence variations, where a range of sequence is replaced. Point mutations should be specified with the same begin and end positions. + /// Create a variation with an explicit VCF object. /// - /// - /// - /// - /// - /// + /// + /// 1-based, inclusive start position in the parent sequence where the variation begins. + /// Must be >= 1. See for validity conditions. + /// + /// + /// 1-based, inclusive end position in the parent sequence where the variation ends. + /// Must satisfy oneBasedEndPosition >= oneBasedBeginPosition. + /// + /// + /// Reference subsequence being replaced. Null is coerced to an empty string. + /// Empty string typically indicates an insertion at . + /// + /// + /// Alternate subsequence to insert in place of . + /// Null is coerced to an empty string. Empty string typically indicates a deletion. + /// + /// + /// Free-form description of the variation. Often the original VCF line or human-readable note. + /// + /// + /// Parsed VCF wrapper for the originating record. Used for downstream analysis of genotype/allele metadata. + /// + /// + /// Optional mapping from absolute 1-based residue positions to one or more objects + /// applied at that position (in the same coordinate system as /). + /// If null, an empty dictionary is created. + /// + public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, VariantCallFormat variantCallFormat, Dictionary>? oneBasedModifications = null) + { + OneBasedBeginPosition = oneBasedBeginPosition; + OneBasedEndPosition = oneBasedEndPosition; + OriginalSequence = originalSequence ?? ""; + VariantSequence = variantSequence ?? ""; + Description = description; + VariantCallFormatDataString = variantCallFormat; + OneBasedModifications = oneBasedModifications ?? new Dictionary>(); + } + + /// + /// Create a variation by providing a raw VCF line (string representation) which will be parsed into a . + /// + /// + /// 1-based, inclusive start position in the parent sequence where the variation begins. + /// Must be >= 1. See for validity conditions. + /// + /// + /// 1-based, inclusive end position in the parent sequence where the variation ends. + /// Must satisfy oneBasedEndPosition >= oneBasedBeginPosition. + /// + /// + /// Reference subsequence being replaced. Null is coerced to an empty string. + /// Empty string typically indicates an insertion at . + /// + /// + /// Alternate subsequence to insert in place of . + /// Null is coerced to an empty string. Empty string typically indicates a deletion. + /// + /// + /// Free-form description of the variation. Often the original VCF line or human-readable note. + /// + /// + /// Raw VCF record (a single, tab-delimited line). It is parsed into . + /// + /// + /// Optional mapping from absolute 1-based residue positions to one or more objects + /// applied at that position (in the same coordinate system as /). + /// If null, an empty dictionary is created. + /// + public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, string variantCallFormatStringRepresentation, Dictionary>? oneBasedModifications = null) + { + OneBasedBeginPosition = oneBasedBeginPosition; + OneBasedEndPosition = oneBasedEndPosition; + OriginalSequence = originalSequence ?? ""; + VariantSequence = variantSequence ?? ""; + Description = description; + VariantCallFormatDataString = new VariantCallFormat(variantCallFormatStringRepresentation); + OneBasedModifications = oneBasedModifications ?? new Dictionary>(); + } + + /// + /// Create a variation without a separate VCF string; a is still constructed + /// from to maintain a non-null object for tests and downstream consumers. + /// + /// + /// 1-based, inclusive start position in the parent sequence where the variation begins. + /// Must be >= 1. See for validity conditions. + /// + /// + /// 1-based, inclusive end position in the parent sequence where the variation ends. + /// Must satisfy oneBasedEndPosition >= oneBasedBeginPosition. + /// + /// + /// Reference subsequence being replaced. Null is coerced to an empty string. + /// Empty string typically indicates an insertion at . + /// + /// + /// Alternate subsequence to insert in place of . + /// Null is coerced to an empty string. Empty string typically indicates a deletion. + /// + /// + /// Free-form description of the variation. Also used to initialize . + /// + /// + /// Optional mapping from absolute 1-based residue positions to one or more objects + /// applied at that position (in the same coordinate system as /). + /// If null, an empty dictionary is created. + /// public SequenceVariation(int oneBasedBeginPosition, int oneBasedEndPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) { OneBasedBeginPosition = oneBasedBeginPosition; OneBasedEndPosition = oneBasedEndPosition; OriginalSequence = originalSequence ?? ""; VariantSequence = variantSequence ?? ""; - Description = new VariantCallFormat(description); + Description = description; + // Always construct a VariantCallFormat so tests relying on non-null VCF objects pass. + VariantCallFormatDataString = new VariantCallFormat(description); OneBasedModifications = oneBasedModifications ?? new Dictionary>(); } /// - /// For variations with only position information (not begin and end). - /// Sets the end to the end of the original protein sequence to which this variation applies. + /// Convenience constructor for single-position variations. The end position is inferred from + /// and the length of : + /// end = position + length(originalSequence) - 1 (or end = position if is null). /// - /// - /// - /// - /// - /// + /// + /// 1-based, inclusive position of the variation start. Used to infer . + /// + /// + /// Reference subsequence being replaced. If null, treated as empty for end-position inference. + /// + /// + /// Alternate subsequence to insert in place of . May be empty for deletions. + /// + /// + /// Free-form description of the variation. Also used to initialize . + /// + /// + /// Optional mapping from absolute 1-based residue positions to one or more objects + /// applied at that position. If null, an empty dictionary is created. + /// public SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, Dictionary>? oneBasedModifications = null) : this(oneBasedPosition, originalSequence == null ? oneBasedPosition : oneBasedPosition + originalSequence.Length - 1, originalSequence, variantSequence, description, oneBasedModifications) { } /// - /// Beginning position of original sequence to be replaced + /// 1-based, inclusive start position of this variation within the parent sequence. /// public int OneBasedBeginPosition { get; } /// - /// End position of original sequence to be replaced + /// 1-based, inclusive end position of this variation within the parent sequence. + /// For single-site variations with non-null , this is + /// OneBasedBeginPosition + OriginalSequence.Length - 1. /// public int OneBasedEndPosition { get; } /// - /// Original sequence information (optional) + /// The reference subsequence replaced by this variation. Empty string implies an insertion. /// public string OriginalSequence { get; } /// - /// Variant sequence information (required) + /// The alternate subsequence inserted by this variation. Empty string implies a deletion. /// public string VariantSequence { get; } /// - /// Description of this variation (optional) + /// Free-form description of the variation. Often the raw VCF line or a human-readable summary. /// - public VariantCallFormat Description { get; } + public string Description { get; } /// - /// Modifications specifically for this variant + /// Optional parsed VCF wrapper providing structured access to the originating VCF record. + /// May be null in some construction paths; in the provided constructors it is initialized. + /// + public VariantCallFormat? VariantCallFormatDataString { get; } + + /// + /// Mapping from absolute 1-based residue positions to a list of objects + /// to apply at each position. Never null; defaults to an empty dictionary. /// public Dictionary> OneBasedModifications { get; } + /// + /// Determines value equality with another object. + /// Two objects are equal when: + /// - Begin and end positions are equal + /// - Original and variant sequences are equal (nulls treated as equal only if both are null) + /// - are both null or equal + /// - have identical key sets and identical flattened modification lists (sequence-equal) + /// + /// Object to compare against. + /// True if equal by the criteria above; otherwise false. public override bool Equals(object obj) { SequenceVariation s = obj as SequenceVariation; @@ -75,90 +222,95 @@ public override bool Equals(object obj) && OneBasedEndPosition == s.OneBasedEndPosition && (s.OriginalSequence == null && OriginalSequence == null || OriginalSequence.Equals(s.OriginalSequence)) && (s.VariantSequence == null && VariantSequence == null || VariantSequence.Equals(s.VariantSequence)) - && (s.Description == null && Description == null || Description.Equals(s.Description)) + && ((s.VariantCallFormatDataString == null && VariantCallFormatDataString == null) + || (VariantCallFormatDataString != null && VariantCallFormatDataString.Equals(s.VariantCallFormatDataString))) && (s.OneBasedModifications == null && OneBasedModifications == null || s.OneBasedModifications.Keys.ToList().SequenceEqual(OneBasedModifications.Keys.ToList()) && s.OneBasedModifications.Values.SelectMany(m => m).ToList().SequenceEqual(OneBasedModifications.Values.SelectMany(m => m).ToList())); } + /// + /// Computes a hash code from begin/end positions, sequences, and the VCF wrapper (if present). + /// + /// A hash code suitable for hash-based collections. public override int GetHashCode() { return OneBasedBeginPosition.GetHashCode() ^ OneBasedEndPosition.GetHashCode() - ^ OriginalSequence.GetHashCode() // null handled in constructor - ^ VariantSequence.GetHashCode() // null handled in constructor - ^ Description.GetHashCode(); // always constructed in constructor + ^ OriginalSequence.GetHashCode() + ^ VariantSequence.GetHashCode() + ^ (VariantCallFormatDataString?.GetHashCode() ?? 0); } /// - /// Returns a simple string represantation of this amino acid change + /// Produces a compact, human-readable representation: {OriginalSequence}{OneBasedBeginPosition}{VariantSequence}. + /// Example: substitution A->T at position 12 yields "A12T". /// - /// + /// The compact representation string. public string SimpleString() { return OriginalSequence + OneBasedBeginPosition.ToString() + VariantSequence; } /// - /// Determines whether this interval overlaps the queried interval + /// Tests whether the current variation intersects (overlaps) another variation in coordinate space. + /// Intersection is inclusive: any shared position in the 1-based, inclusive ranges is considered overlap. /// - /// - /// + /// The other to test. + /// True if the ranges overlap; otherwise false. internal bool Intersects(SequenceVariation segment) { return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition; } /// - /// Determines whether this interval overlaps the queried interval + /// Tests whether the current variation intersects (overlaps) a truncation product range. + /// Intersection is inclusive: any shared position in the 1-based, inclusive ranges is considered overlap. /// - /// - /// + /// The segment to test. + /// True if the ranges overlap; otherwise false. internal bool Intersects(TruncationProduct segment) { return segment.OneBasedEndPosition >= OneBasedBeginPosition && segment.OneBasedBeginPosition <= OneBasedEndPosition; } /// - /// Determines whether this interval overlaps the queried position + /// Tests whether the current variation intersects a single 1-based position. /// - /// - /// + /// A 1-based, inclusive position in the parent sequence. + /// True if lies within the variation’s range; otherwise false. internal bool Intersects(int pos) { return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition; } /// - /// Determines whether this interval includes the queried interval + /// Tests whether the current variation fully includes another variation’s range. + /// Inclusion is inclusive on both ends. /// - /// - /// + /// The other to test. + /// True if the current range fully contains ; otherwise false. internal bool Includes(SequenceVariation segment) { return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition; } - // Commented out by AVC on 4/5/23. Unused and untested in current code base, - // but can't rule out that it could be useful in the future. - /// - /// Determines whether this interval includes the queried interval - /// - /// - /// - // internal bool Includes(TruncationProduct segment) - // { - // return OneBasedBeginPosition <= segment.OneBasedBeginPosition && OneBasedEndPosition >= segment.OneBasedEndPosition; - // } /// - /// Determines whether this interval overlaps the queried position + /// Tests whether the current variation includes a single 1-based position. + /// Inclusion is inclusive on both ends. /// - /// - /// + /// A 1-based, inclusive position in the parent sequence. + /// True if lies within the variation’s range; otherwise false. internal bool Includes(int pos) { return OneBasedBeginPosition <= pos && pos <= OneBasedEndPosition; } + + /// + /// Validates positional consistency: begin must be > 0 and end must be >= begin. + /// This does not validate string/length consistency between and . + /// + /// True if positions are valid; otherwise false. public bool AreValid() { return OneBasedBeginPosition > 0 && OneBasedEndPosition >= OneBasedBeginPosition; diff --git a/mzLib/Omics/BioPolymer/VariantApplication.cs b/mzLib/Omics/BioPolymer/VariantApplication.cs index 1669c3afe..c5fd71ed5 100644 --- a/mzLib/Omics/BioPolymer/VariantApplication.cs +++ b/mzLib/Omics/BioPolymer/VariantApplication.cs @@ -5,38 +5,56 @@ namespace Omics.BioPolymer { /// - /// Provides methods for applying sequence variations to proteins and handling modifications on variant sequences. + /// Utilities for constructing and applying sequence variants to biopolymers (proteins/RNA) in a 1-based, inclusive coordinate system. + /// This includes: + /// - Expanding a base biopolymer into concrete variant instances (genotype-aware or combinatorial). + /// - Renaming/accession tagging based on applied variants. + /// - Re-mapping indices for truncation products and localized modifications after edits (insertions/deletions/substitutions). + /// - Converting certain annotation-style modifications (e.g., nucleotide substitution markers) into true sequence variants. /// - /// - /// Originally by A. Cesnik on 11/2/18, updated on 4/25/23. NB moved it and generalized for use in Transcriptomics on 3/25/25. - /// public static class VariantApplication { /// - /// Creates a list of IBioPolymers of the same type as the original protein, each with applied variants from this protein. + /// Builds concrete variant biopolymers from a base entity. + /// If any known variation lacks genotype information (no VCF or empty genotypes), a safe combinatorial expansion is used (bounded by ). + /// Otherwise, variants are applied in a genotype/allele-depth-aware manner via . /// - /// Type of BioPolymer to create variants of - /// original to generate variants of - /// - /// - /// This replaces a method call that was previously an instance method in Protein + /// A biopolymer type that supports sequence variants. + /// The base biopolymer to expand into variant instances. + /// + /// Maximum number of variants to consider when creating combinations for genotype-less scenarios. + /// This caps explosion in . + /// + /// + /// Minimum AD (Allele Depth) per sample required for an allele (reference or alternate) to be considered "deep" enough to participate in genotype-aware application. + /// + /// A list of concrete variants derived from . public static List GetVariantBioPolymers(this TBioPolymerType protein, int maxAllowedVariantsForCombinatorics = 4, int minAlleleDepth = 1) where TBioPolymerType : IHasSequenceVariants { + // Materialize any substitution-like annotations into concrete sequence variants on both the consensus and the instance protein.ConsensusVariant.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); - if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.Description == null || v.Description.Genotypes.Count == 0)) + + // If all variants are positionally valid and any is missing VCF/genotype data, fall back to bounded combinatorial application + if (protein.SequenceVariations.All(v => v.AreValid()) && protein.SequenceVariations.Any(v => v.VariantCallFormatDataString == null || v.VariantCallFormatDataString.Genotypes.Count == 0)) { - // this is a protein with either no VCF lines or a mix of VCF and non-VCF lines return ApplyAllVariantCombinations(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics).ToList(); } - // this is a protein with only VCF lines - return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinatorics, minAlleleDepth); + + // Otherwise, do genotype/allele-depth-aware application with combinatorics limited for heterozygous sites + return ApplyVariants(protein, protein.SequenceVariations, maxAllowedVariantsForCombinitorics: maxAllowedVariantsForCombinatorics, minAlleleDepth); } /// - /// Gets the name of a protein with applied variations + /// Produces a name with an appended variant tag built from applied variation descriptions. + /// If both and are effectively empty, returns null. /// + /// Base name (e.g., protein name). May be null. + /// Variations applied to the instance. If null/empty, no variant tag is appended. + /// + /// The base plus " variant:{...}" when variations exist; null if both inputs are empty. + /// public static string? GetVariantName(string? name, IEnumerable? appliedVariations) { bool emptyVars = appliedVariations.IsNullOrEmpty(); @@ -48,8 +66,13 @@ public static List GetVariantBioPolymers(this } /// - /// Gets the accession for a protein with applied variations + /// Constructs a variant-aware accession by appending a compact variant string to the consensus accession. /// + /// The variant-capable biopolymer. + /// Applied variations to encode into the suffix; may be null/empty. + /// + /// The consensus accession or consensus accession with "_{SimpleString}_..." suffix if variations are present. + /// public static string GetAccession(IHasSequenceVariants protein, IEnumerable? appliedSequenceVariations) { return protein.ConsensusVariant.Accession + @@ -57,33 +80,23 @@ public static string GetAccession(IHasSequenceVariants protein, IEnumerable - /// Determines if the modification falls on a variant amino acid + /// Determines if a specific 1-based position in the variant biopolymer lies within a particular variation's range. /// - /// true if a modification index on the protein falls within the applied variant - /// - /// A. Cesnik - 4/25/23 - /// Variants annotated in protein entries can be applied to a sequence, i.e. a change is made to the sequence. - /// One of the things Spritz can do that no other tool can do is enable finding modifications on these sites of variation, - /// since I amended the sequence variant XML entries to have modifications. - /// + /// The variation of interest; may be null. + /// 1-based position in the current (possibly already-edited) variant sequence. + /// True if the position is included by the variation; otherwise false. public static bool IsSequenceVariantModification(SequenceVariation? appliedVariant, int variantProteinIndex) { return appliedVariant != null && appliedVariant.Includes(variantProteinIndex); } /// - /// Restores modification index on a variant protein to the index on the nonvariant protein, - /// or if it falls on a variant, this restores the position on the protein with only that variant + /// Maps a modification index from the edited (variant) sequence back to the original consensus index by subtracting the + /// net length changes of all applied variations that ended before the queried position. /// - /// Protein containing applied sequence variations - /// The one-based index of the amino acid residue bearing a modification - /// - /// - /// A. Cesnik - 4/25/23 - /// Useful for comparing modification indices on variant proteins to the original protein. - /// Variations can introduce length changes and other changes to the sequence, - /// so the indices of the modifications aren’t directly comparable, but this method makes that possible. - /// + /// The variant-capable biopolymer containing applied variations. + /// 1-based index in the variant sequence. + /// The corresponding 1-based index in the original consensus sequence. public static int RestoreModificationIndex(IHasSequenceVariants protein, int variantProteinModificationIndex) { return variantProteinModificationIndex - protein.AppliedSequenceVariations @@ -92,57 +105,73 @@ public static int RestoreModificationIndex(IHasSequenceVariants protein, int var } /// - /// Applies multiple variant changes to a protein sequence + /// Applies a set of sequence variations in a genotype- and allele-depth-aware fashion for all individuals found in the VCF payloads. + /// Heterozygous sites can produce combinatorial branches up to per individual. + /// Results are deduplicated by final base sequence. /// + /// A biopolymer type that supports sequence variants. + /// The base biopolymer to which variations will be applied. + /// Candidate variations. Duplicates by effect are collapsed using . + /// + /// Upper cap for heterozygous combinatorial branching. If an individual has more heterozygous variants than this number, + /// the algorithm limits branching to control explosion. + /// + /// Minimum AD (Allele Depth) per sample for an allele to be considered in application. + /// A list of concrete variant biopolymers across all individuals encoded in the VCF payloads. public static List ApplyVariants(TBioPolymerType protein, IEnumerable sequenceVariations, int maxAllowedVariantsForCombinitorics, int minAlleleDepth) where TBioPolymerType : IHasSequenceVariants { + // Remove duplicate effects (by SimpleString), require variants with genotype data, apply from higher to lower positions List uniqueEffectsToApply = sequenceVariations .GroupBy(v => v.SimpleString()) .Select(x => x.First()) - .Where(v => v.Description.Genotypes.Count > 0) // this is a VCF line - .OrderByDescending(v => v.OneBasedBeginPosition) // apply variants at the end of the protein sequence first + .Where(v => v.VariantCallFormatDataString.Genotypes.Count > 0) + .OrderByDescending(v => v.OneBasedBeginPosition) .ToList(); + // A shallow "base" variant to branch from (no applied variants yet) TBioPolymerType proteinCopy = protein.CreateVariant(protein.BaseSequence, protein, null, protein.TruncationProducts, protein.OneBasedPossibleLocalizedModifications, null); - // If there aren't any variants to apply, just return the base protein if (uniqueEffectsToApply.Count == 0) { return new List { proteinCopy }; } - HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.Description.Genotypes.Keys)); + // All per-sample identifiers present in the VCF objects + HashSet individuals = new HashSet(uniqueEffectsToApply.SelectMany(v => v.VariantCallFormatDataString.Genotypes.Keys)); List variantProteins = new(); List newVariantProteins = new(); - // loop through genotypes for each sample/individual (e.g. tumor and normal) + foreach (string individual in individuals) { newVariantProteins.Clear(); newVariantProteins.Add(proteinCopy); - bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.Description.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; + // Whether to limit combinatorial branching for this individual + bool tooManyHeterozygousVariants = uniqueEffectsToApply.Count(v => v.VariantCallFormatDataString.Heterozygous[individual]) > maxAllowedVariantsForCombinitorics; + foreach (var variant in uniqueEffectsToApply) { - bool variantAlleleIsInTheGenotype = variant.Description.Genotypes[individual].Contains(variant.Description.AlleleIndex.ToString()); // should catch the case where it's -1 if the INFO isn't from SnpEff + // Only proceed if the individual's genotype references this variant's alternate allele index + bool variantAlleleIsInTheGenotype = variant.VariantCallFormatDataString.Genotypes[individual].Contains(variant.VariantCallFormatDataString.AlleleIndex.ToString()); if (!variantAlleleIsInTheGenotype) { continue; } - bool isHomozygousAlternate = variant.Description.Homozygous[individual] && variant.Description.Genotypes[individual].All(d => d == variant.Description.AlleleIndex.ToString()); // note this isn't a great test for homozygosity, since the genotype could be 1/2 and this would still return true. But currently, alleles 1 and 2 will be included as separate variants, so this is fine for now. - bool isDeepReferenceAllele = int.TryParse(variant.Description.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; - bool isDeepAlternateAllele = int.TryParse(variant.Description.AlleleDepths[individual][variant.Description.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; - // homozygous alternate + // Zygosity and depth checks for this individual + bool isHomozygousAlternate = variant.VariantCallFormatDataString.Homozygous[individual] && variant.VariantCallFormatDataString.Genotypes[individual].All(d => d == variant.VariantCallFormatDataString.AlleleIndex.ToString()); + bool isDeepReferenceAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][0], out int depthRef) && depthRef >= minAlleleDepth; + bool isDeepAlternateAllele = int.TryParse(variant.VariantCallFormatDataString.AlleleDepths[individual][variant.VariantCallFormatDataString.AlleleIndex], out int depthAlt) && depthAlt >= minAlleleDepth; + if (isHomozygousAlternate && isDeepAlternateAllele) { + // Deterministic application: all branches take the alt allele newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList(); } - - // heterozygous basic - // first protein with variants contains all homozygous variation, second contains all variations - else if (variant.Description.Heterozygous[individual] && tooManyHeterozygousVariants) + else if (variant.VariantCallFormatDataString.Heterozygous[individual] && tooManyHeterozygousVariants) { + // Limit branching: either keep ref, take alt, or update second branch if already present if (isDeepAlternateAllele && isDeepReferenceAllele) { if (newVariantProteins.Count == 1 && maxAllowedVariantsForCombinitorics > 0) @@ -154,51 +183,35 @@ public static List ApplyVariants(TBioPolymerTy { newVariantProteins[1] = ApplySingleVariant(variant, newVariantProteins[1], individual); } - else - { - // no heterozygous variants - } } else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) { newVariantProteins = newVariantProteins.Select(p => ApplySingleVariant(variant, p, individual)).ToList(); } - else - { - // keep reference only - } } - - // heterozygous combinitorics - else if (variant.Description.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) + else if (variant.VariantCallFormatDataString.Heterozygous[individual] && isDeepAlternateAllele && !tooManyHeterozygousVariants) { + // Full combinatorics (bounded) for heterozygous case: keep ref and/or take alt depending on depths List combinitoricProteins = new(); foreach (var ppp in newVariantProteins) { if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0 && isDeepReferenceAllele) { - // keep reference allele - if (variant.Description.Genotypes[individual].Contains("0")) + if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0")) { - combinitoricProteins.Add(ppp); + combinitoricProteins.Add(ppp); // keep reference branch } - - // alternate allele (replace all, since in heterozygous with two alternates, both alternates are included) - combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); + combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); // alternate branch } else if (isDeepAlternateAllele && maxAllowedVariantsForCombinitorics > 0) { combinitoricProteins.Add(ApplySingleVariant(variant, ppp, individual)); } - else if (variant.Description.Genotypes[individual].Contains("0")) + else if (variant.VariantCallFormatDataString.Genotypes[individual].Contains("0")) { combinitoricProteins.Add(ppp); } - else - { - // must be two alternate alleles with not enough depth - } } newVariantProteins = combinitoricProteins; } @@ -206,49 +219,79 @@ public static List ApplyVariants(TBioPolymerTy variantProteins.AddRange(newVariantProteins); } + // De-duplicate by final sequence to avoid identical variants from different branches return variantProteins.GroupBy(x => x.BaseSequence).Select(x => x.First()).ToList(); } /// - /// Applies a single variant to a protein sequence + /// Applies a single to a biopolymer and returns the updated instance with + /// sequence, truncation products, localized modifications, and applied-variation indices all re-mapped. /// + /// A biopolymer type that supports sequence variants. + /// Variation to apply (coordinates refer to the current protein's sequence). + /// Source biopolymer to mutate. + /// Sample identifier used to annotate the created variant (may be empty). + /// A new variant instance created via . private static TBioPolymerType ApplySingleVariant(SequenceVariation variantGettingApplied, TBioPolymerType protein, string individual) where TBioPolymerType : IHasSequenceVariants { + // Sequence prefix before the edit region string seqBefore = protein.BaseSequence.Substring(0, variantGettingApplied.OneBasedBeginPosition - 1); + // The replacement (alternate) sequence string seqVariant = variantGettingApplied.VariantSequence; + // First index in the original sequence after the edited region int afterIdx = variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.OriginalSequence.Length - 1; - SequenceVariation variantAfterApplication = new SequenceVariation( - variantGettingApplied.OneBasedBeginPosition, - variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1, - variantGettingApplied.OriginalSequence, - variantGettingApplied.VariantSequence, - variantGettingApplied.Description.Description, - variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); - - // check to see if there is incomplete indel overlap, which would lead to weird variant sequences - // complete overlap is okay, since it will be overwritten; this can happen if there are two alternate alleles, - // e.g. reference sequence is wrong at that point - bool intersectsAppliedRegionIncompletely = protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x)); + // Reify a "post-application" variation object pinned to the inserted length + SequenceVariation variantAfterApplication; + var vcf = variantGettingApplied.VariantCallFormatDataString; + if (vcf != null) + { + variantAfterApplication = new SequenceVariation( + variantGettingApplied.OneBasedBeginPosition, + variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1, + variantGettingApplied.OriginalSequence, + variantGettingApplied.VariantSequence, + vcf.Description, + vcf, + variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); + } + else + { + variantAfterApplication = new SequenceVariation( + variantGettingApplied.OneBasedBeginPosition, + variantGettingApplied.OneBasedBeginPosition + variantGettingApplied.VariantSequence.Length - 1, + variantGettingApplied.OriginalSequence, + variantGettingApplied.VariantSequence, + variantGettingApplied.Description, + variantGettingApplied.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value)); + } + + // If an already-applied variation partially overlaps the current edit, use the consensus tail to avoid index corruption + bool intersectsAppliedRegionIncompletely = + protein.AppliedSequenceVariations != null + && protein.AppliedSequenceVariations.Any(x => variantGettingApplied.Intersects(x) && !variantGettingApplied.Includes(x)); + IEnumerable appliedVariations = new[] { variantAfterApplication }; - string seqAfter = null; + string seqAfter; if (intersectsAppliedRegionIncompletely) { - // use original protein sequence for the remaining sequence + // Tail from consensus (not the possibly already-mutated BaseSequence) seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.ConsensusVariant.BaseSequence.Substring(afterIdx); } else { - // use this variant protein sequence for the remaining sequence + // Tail from the current BaseSequence; keep any previously applied variations that are not fully contained by the current edit seqAfter = protein.BaseSequence.Length - afterIdx <= 0 ? "" : protein.BaseSequence.Substring(afterIdx); appliedVariations = appliedVariations - .Concat(protein.AppliedSequenceVariations.Where(x => !variantGettingApplied.Includes(x))) + .Concat((protein.AppliedSequenceVariations ?? Enumerable.Empty()).Where(x => !variantGettingApplied.Includes(x))) .ToList(); } - string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0]; // there may be a stop gained - // adjust indices + // Clip at stop (*) if any + string variantSequence = (seqBefore + seqVariant + seqAfter).Split('*')[0]; + + // Re-map dependent structures after the sequence length change List adjustedProteolysisProducts = AdjustTruncationProductIndices(variantGettingApplied, variantSequence, protein, protein.TruncationProducts); Dictionary> adjustedModifications = AdjustModificationIndices(variantGettingApplied, variantSequence, protein); List adjustedAppliedVariations = AdjustSequenceVariationIndices(variantGettingApplied, variantSequence, appliedVariations); @@ -257,90 +300,119 @@ private static TBioPolymerType ApplySingleVariant(SequenceVaria } /// - /// Adjusts the indices of sequence variations due to applying a single additional variant + /// Adjusts (re-bases) the coordinates of already-applied variations after applying a new variation. + /// Handles overlaps by trimming and shifts by adding the net length change. /// + /// The newly applied variation causing coordinate shifts. + /// The updated sequence after the application (used to clamp ends). + /// Variations that were previously applied (may be null). + /// A new list of variations with updated coordinates, filtered for validity. private static List AdjustSequenceVariationIndices(SequenceVariation variantGettingApplied, string variantAppliedProteinSequence, IEnumerable alreadyAppliedVariations) { List variations = new List(); if (alreadyAppliedVariations == null) { return variations; } + foreach (SequenceVariation v in alreadyAppliedVariations) { + // Net length already introduced before the start of v int addedIdx = alreadyAppliedVariations .Where(applied => applied.OneBasedEndPosition < v.OneBasedBeginPosition) .Sum(applied => applied.VariantSequence.Length - applied.OriginalSequence.Length); - // variant was entirely before the one being applied (shouldn't happen because of order of applying variants) - // or it's the current variation - if (v.Description.Equals(variantGettingApplied.Description) || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) + // Either same VCF payload (null-safe) or fully before the new application region after compensating for prior shifts + if ((v.VariantCallFormatDataString != null && v.VariantCallFormatDataString.Equals(variantGettingApplied.VariantCallFormatDataString)) + || v.OneBasedEndPosition - addedIdx < variantGettingApplied.OneBasedBeginPosition) { variations.Add(v); } - - // adjust indices based on new included sequence, minding possible overlaps to be filtered later else { + // Compute overlap with the newly applied edit and shift by the net length change int intersectOneBasedStart = Math.Max(variantGettingApplied.OneBasedBeginPosition, v.OneBasedBeginPosition); int intersectOneBasedEnd = Math.Min(variantGettingApplied.OneBasedEndPosition, v.OneBasedEndPosition); - int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 : // no overlap - intersectOneBasedEnd - intersectOneBasedStart + 1; // there's some overlap + int overlap = intersectOneBasedEnd < intersectOneBasedStart ? 0 : + intersectOneBasedEnd - intersectOneBasedStart + 1; + int sequenceLengthChange = variantGettingApplied.VariantSequence.Length - variantGettingApplied.OriginalSequence.Length; + int begin = v.OneBasedBeginPosition + sequenceLengthChange - overlap; if (begin > variantAppliedProteinSequence.Length) { - continue; // cut out by a stop gain + continue; } + int end = v.OneBasedEndPosition + sequenceLengthChange - overlap; if (end > variantAppliedProteinSequence.Length) { - end = variantAppliedProteinSequence.Length; // end shortened by a stop gain + end = variantAppliedProteinSequence.Length; + } + + var vcf = v.VariantCallFormatDataString; + if (vcf != null) + { + variations.Add(new SequenceVariation( + begin, + end, + v.OriginalSequence, + v.VariantSequence, + vcf.Description, + vcf, + v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value))); + } + else + { + variations.Add(new SequenceVariation( + begin, + end, + v.OriginalSequence, + v.VariantSequence, + v.Description, + v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value))); } - variations.Add(new SequenceVariation( - begin, - end, - v.OriginalSequence, - v.VariantSequence, - v.Description.Description, - v.OneBasedModifications.ToDictionary(kv => kv.Key, kv => kv.Value))); } } return variations; } /// - /// Eliminates proteolysis products that overlap sequence variations. - /// Since frameshift indels are written across the remaining sequence, - /// this eliminates proteolysis products that conflict with large deletions and other structural variations. + /// Re-bases proteolysis/truncation product ranges after applying a sequence variation. + /// Shifts segments to the right if the edit occurs before them and expands/contracts segments that span the edit. + /// If a stop (*) is introduced by the variant, downstream products are clamped to the new sequence length. /// + /// The applied variation (source of positional shifts and length change). + /// The updated sequence after application. + /// The source biopolymer (used for consensus length in certain boundary checks). + /// Existing truncation products to be re-based; may be null. + /// Updated list of truncation products within the new coordinate system. private static List AdjustTruncationProductIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein, IEnumerable proteolysisProducts) { List products = new List(); if (proteolysisProducts == null) { return products; } + int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length; + foreach (TruncationProduct p in proteolysisProducts.Where(p => p.OneBasedEndPosition.HasValue && p.OneBasedBeginPosition.HasValue)) { - // proteolysis product is entirely before the variant + // Entirely before the edit: unchanged if (variant.OneBasedBeginPosition > p.OneBasedEndPosition) { products.Add(p); } - // proteolysis product straddles the variant, but the cleavage site(s) are still intact; the ends aren't considered cleavage sites + // Segment spans the edit or is clamped at boundaries: extend/contract or clamp to stop else if ((p.OneBasedBeginPosition < variant.OneBasedBeginPosition || p.OneBasedBeginPosition == 1 || p.OneBasedBeginPosition == 2) && (p.OneBasedEndPosition > variant.OneBasedEndPosition || p.OneBasedEndPosition == protein.ConsensusVariant.BaseSequence.Length)) { if (variant.VariantSequence.EndsWith("*")) { + // Introduced stop codon/terminator: clamp to the new sequence length products.Add(new TruncationProduct(p.OneBasedBeginPosition, variantAppliedProteinSequence.Length, p.Type)); } else if (p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length) { products.Add(new TruncationProduct(p.OneBasedBeginPosition, p.OneBasedEndPosition + sequenceLengthChange, p.Type)); } - else - { - // cleavage site is not intact - } } - // proteolysis product is after the variant and there is no stop gain + // Entirely after the edit: shift right/left by the net length change, if still within bounds and not terminated else if (p.OneBasedBeginPosition > variant.OneBasedEndPosition && p.OneBasedBeginPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length && p.OneBasedEndPosition + sequenceLengthChange <= variantAppliedProteinSequence.Length @@ -348,52 +420,49 @@ private static List AdjustTruncationProductIndices(SequenceVa { products.Add(new TruncationProduct(p.OneBasedBeginPosition + sequenceLengthChange, p.OneBasedEndPosition + sequenceLengthChange, p.Type)); } - else // sequence variant conflicts with proteolysis cleavage site (cleavage site was lost) - { - continue; - } } return products; } /// - /// Adjusts modification indices. + /// Produces a new localized modification dictionary re-based to the post-variation coordinate system and merged with + /// any one-based modifications carried by the applied variant itself. /// + /// The variation causing index shifts. + /// The updated sequence after application (for bounds checks). + /// The source biopolymer providing the original modifications map. + /// A new dictionary of one-based modification lists keyed by position in the updated sequence. private static Dictionary> AdjustModificationIndices(SequenceVariation variant, string variantAppliedProteinSequence, IHasSequenceVariants protein) { + // Original per-position modifications (pre-application) IDictionary> modificationDictionary = protein.OneBasedPossibleLocalizedModifications; + // Modifications contributed by the variant itself (coordinated in the same one-based space) IDictionary> variantModificationDictionary = variant.OneBasedModifications; + Dictionary> mods = new Dictionary>(); int sequenceLengthChange = variant.VariantSequence.Length - variant.OriginalSequence.Length; - // change modification indices for variant sequence + // Re-base original modifications if (modificationDictionary != null) { foreach (KeyValuePair> kv in modificationDictionary) { if (kv.Key > variantAppliedProteinSequence.Length) { - continue; // it was cut out by a stop gain + continue; // drop if beyond new end } - // mod is before the variant else if (kv.Key < variant.OneBasedBeginPosition) { - mods.Add(kv.Key, kv.Value); + mods.Add(kv.Key, kv.Value); // unaffected positions } - // mod is after the variant and not affected by a stop gain else if (variant.OneBasedEndPosition < kv.Key && kv.Key + sequenceLengthChange <= variantAppliedProteinSequence.Length) { - mods.Add(kv.Key + sequenceLengthChange, kv.Value); - } - else // sequence variant conflicts with modification site (modification site substitution) - { - continue; + mods.Add(kv.Key + sequenceLengthChange, kv.Value); // shift after the edit } } } - // sequence variant modifications are indexed to the variant sequence - // NOTE: this code assumes variants are added from end to beginning of protein, so that previously added variant mods are adjusted above + // Merge-in variant-borne modifications (may share positions) if (variantModificationDictionary != null) { foreach (var kv in variantModificationDictionary) @@ -413,51 +482,54 @@ private static Dictionary> AdjustModificationIndices(Seq } /// - /// Format string to append to accession + /// Concatenates the compact representations () of the provided variations with underscores. /// + /// Variations to stringify; may be null/empty. + /// An underscore-joined string or empty string if none. private static string CombineSimpleStrings(IEnumerable? variations) { return variations.IsNullOrEmpty() ? "" : string.Join("_", variations.Select(v => v.SimpleString())); } /// - /// Format string to append to protein names + /// Concatenates human-readable descriptions for the provided variations. + /// Prefers VCF description when available, otherwise falls back to the variation's own description. /// + /// Variations to describe; may be null/empty. + /// A comma-delimited description string or empty string if none. public static string CombineDescriptions(IEnumerable? variations) { - return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.Description)); + return variations.IsNullOrEmpty() ? "" : string.Join(", variant:", variations.Select(d => d.VariantCallFormatDataString?.Description ?? d.Description)); } + /// - /// Applies all possible combinations of the provided SequenceVariation list to the base TBioPolymerType object, - /// starting with the fewest single variations and up to the specified maximum number of combinations. + /// Applies all combinations of the provided variations to the base biopolymer up to a maximum number of yielded results. + /// The base (no-variant) biopolymer is yielded first, followed by combinations in increasing size. /// - /// The type of the biopolymer object. - /// The base biopolymer object to apply variations to. - /// List of SequenceVariation objects to combine and apply. Assumed not null or empty. - /// Maximum number of combinations to return. - /// - /// An IEnumerable of TBioPolymerType objects, each with a unique combination of variations applied. - /// + /// A biopolymer type that supports sequence variants. + /// The starting biopolymer (no variations applied). + /// Candidate variations to combine. + /// Maximum number of variants (including the base) to yield to bound combinatorial growth. + /// An enumerable of applied-variant biopolymers. public static IEnumerable ApplyAllVariantCombinations( TBioPolymerType baseBioPolymer, List variations, int maxCombinations) where TBioPolymerType : IHasSequenceVariants { - int count = 0; + int count = 0; // number of variants yielded so far - // Always yield the base biopolymer first yield return baseBioPolymer; count++; if (count >= maxCombinations) yield break; - int n = variations.Count; + int n = variations.Count; // total variation count for (int size = 1; size <= n; size++) { foreach (var combo in GetCombinations(variations, size)) { - var result = baseBioPolymer; + var result = baseBioPolymer; // start from base and apply this combination in order foreach (var variant in combo) { result = ApplySingleVariant(variant, result, string.Empty); @@ -474,26 +546,27 @@ public static IEnumerable ApplyAllVariantCombinations - /// Generates all possible combinations of the specified size from the input list. + /// Generates all k-combinations (order-independent, no repetition) of the given list. + /// This is a standard lexicographic index-based combinator that yields increasing index tuples. /// - /// List of SequenceVariation objects to combine. Assumed not null or empty. - /// The size of each combination. - /// - /// An IEnumerable of IList<SequenceVariation> representing each combination. - /// + /// Source list to combine. + /// Combination size k (0 < k <= n). + /// An enumerable of read-only lists containing the selected variations. private static IEnumerable> GetCombinations(List variations, int size) { int n = variations.Count; var indices = new int[size]; - for (int i = 0; i < size; i++) indices[i] = i; + for (int i = 0; i < size; i++) indices[i] = i; // initial 0..k-1 while (true) { + // Materialize current combination var combo = new List(size); for (int i = 0; i < size; i++) combo.Add(variations[indices[i]]); yield return combo; + // Advance to next lexicographic combination int pos = size - 1; while (pos >= 0 && indices[pos] == n - size + pos) pos--; @@ -503,15 +576,26 @@ private static IEnumerable> GetCombinations(List + /// Scans localized modifications for "nucleotide substitution" annotations of the form "X->Y" and converts them + /// into concrete objects at the associated positions. + /// The originating annotation-style modifications are removed from both the possible-localized and original-non-variant collections + /// (and from consensus mirrors) when they are fully consumed by the conversion. + /// + /// A biopolymer type that supports sequence variants. + /// The biopolymer whose modifications/variants are to be updated in-place. public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants(this TBioPolymerType protein) where TBioPolymerType : IHasSequenceVariants { + // Collect mods to remove after converting them to sequence variants List> modificationsToRemove = new(); - //convert substitution modifications to sequence variations + foreach (var kvp in protein.OneBasedPossibleLocalizedModifications) { foreach (Modification mod in kvp.Value) { + // Look for annotation-style nucleotide substitutions (e.g., "A->G") if (mod.ModificationType.Contains("nucleotide substitution") && mod.OriginalId.Contains("->")) { string[] originalAndSubstitutedAminoAcids = mod.OriginalId.Split(new[] { "->" }, StringSplitOptions.RemoveEmptyEntries); @@ -520,12 +604,14 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants< { protein.SequenceVariations.Add(sequenceVariation); } + // Defer removal to avoid mutating collection during enumeration KeyValuePair pair = new(kvp.Key, mod); modificationsToRemove.Add(pair); } } } - //remove the modifications that were converted to sequence variations + + // Remove the consumed annotation-style modifications from both live and consensus dictionaries foreach (KeyValuePair pair in modificationsToRemove) { if (protein.OneBasedPossibleLocalizedModifications.ContainsKey(pair.Key)) @@ -544,6 +630,7 @@ public static void ConvertNucleotideSubstitutionModificationsToSequenceVariants< } } } + if (protein.OriginalNonVariantModifications.ContainsKey(pair.Key)) { List modList = protein.OriginalNonVariantModifications[pair.Key]; diff --git a/mzLib/Omics/BioPolymer/VariantCallFormat.cs b/mzLib/Omics/BioPolymer/VariantCallFormat.cs index e0cbec9a7..7549726c0 100644 --- a/mzLib/Omics/BioPolymer/VariantCallFormat.cs +++ b/mzLib/Omics/BioPolymer/VariantCallFormat.cs @@ -146,7 +146,7 @@ public VariantCallFormat(string description) Description = description; // Back-compat: if no real tabs are present but literal "\t" sequences are, - // normalize them to actual tabs for parsing only. Leave Description intact. + // normalize them to actual tabs for parsing only. Leave VariantCallFormatDataString intact. string parseLine = NormalizeTabsForParsing(description); // Parse description into VCF fields diff --git a/mzLib/Test/DatabaseTests/TestProteinReader.cs b/mzLib/Test/DatabaseTests/TestProteinReader.cs index 7dcd0b4d8..e0768eef4 100644 --- a/mzLib/Test/DatabaseTests/TestProteinReader.cs +++ b/mzLib/Test/DatabaseTests/TestProteinReader.cs @@ -125,7 +125,7 @@ public static void XmlTest() Assert.AreEqual(64, ok[0].SequenceVariations.First().OneBasedEndPosition); Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(103 - 64 + 2, ok[1].SequenceVariations.First().OneBasedEndPosition); - Assert.AreNotEqual(ok[0].SequenceVariations.First().Description, ok[1].SequenceVariations.First().Description); //decoys and target variations don't have the same desc. + Assert.AreNotEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok[1].SequenceVariations.First().VariantCallFormatDataString); //decoys and target variations don't have the same desc. Assert.AreEqual("Homo sapiens", ok[1].Organism); } @@ -420,8 +420,8 @@ public static void TestReverseDecoyXML_WithCustomIdentifier() foreach (var variant in protein.AppliedSequenceVariations) { - Assert.That(variant.Description, Does.StartWith("rev")); - Assert.That(variant.Description, Does.Not.StartWith("DECOY")); + Assert.That(variant.VariantCallFormatDataString, Does.StartWith("rev")); + Assert.That(variant.VariantCallFormatDataString, Does.Not.StartWith("DECOY")); } foreach (var bond in protein.DisulfideBonds) diff --git a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs index babe44a76..e6df1c8d5 100644 --- a/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs +++ b/mzLib/Test/DatabaseTests/TestProteomicsReadWrite.cs @@ -504,12 +504,12 @@ public void TestFullProteinReadWrite() Assert.AreEqual(originalProtein.TruncationProducts.First().OneBasedEndPosition, proteinReadFromXml[0].TruncationProducts.First().OneBasedEndPosition); Assert.AreEqual(originalProtein.TruncationProducts.First().Type, proteinReadFromXml[0].TruncationProducts.First().Type.Split('(')[0]); - Assert.AreEqual(originalProtein.SequenceVariations.First().Description, proteinReadFromXml[0].SequenceVariations.First().Description); + Assert.AreEqual(originalProtein.SequenceVariations.First().VariantCallFormatDataString, proteinReadFromXml[0].SequenceVariations.First().VariantCallFormatDataString); Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(originalProtein.SequenceVariations.First().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.First().OneBasedEndPosition); Assert.AreEqual(originalProtein.SequenceVariations.First().OriginalSequence, proteinReadFromXml[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(originalProtein.SequenceVariations.First().VariantSequence, proteinReadFromXml[0].SequenceVariations.First().VariantSequence); - Assert.AreEqual(originalProtein.SequenceVariations.Last().Description, proteinReadFromXml[0].SequenceVariations.Last().Description); + Assert.AreEqual(originalProtein.SequenceVariations.Last().VariantCallFormatDataString, proteinReadFromXml[0].SequenceVariations.Last().VariantCallFormatDataString); Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedBeginPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedBeginPosition); Assert.AreEqual(originalProtein.SequenceVariations.Last().OneBasedEndPosition, proteinReadFromXml[0].SequenceVariations.Last().OneBasedEndPosition); Assert.AreEqual(originalProtein.SequenceVariations.Last().OriginalSequence, proteinReadFromXml[0].SequenceVariations.Last().OriginalSequence); @@ -534,7 +534,7 @@ public void TestReadWriteSeqVars() Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count()); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition); - Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description); + Assert.AreEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok2[0].SequenceVariations.First().VariantCallFormatDataString); Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence); } @@ -557,7 +557,7 @@ public void TestReadWriteSeqVars2() Assert.AreEqual(ok[0].SequenceVariations.Count(), ok2[0].SequenceVariations.Count()); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedBeginPosition, ok2[0].SequenceVariations.First().OneBasedBeginPosition); Assert.AreEqual(ok[0].SequenceVariations.First().OneBasedEndPosition, ok2[0].SequenceVariations.First().OneBasedEndPosition); - Assert.AreEqual(ok[0].SequenceVariations.First().Description, ok2[0].SequenceVariations.First().Description); + Assert.AreEqual(ok[0].SequenceVariations.First().VariantCallFormatDataString, ok2[0].SequenceVariations.First().VariantCallFormatDataString); Assert.AreEqual(ok[0].SequenceVariations.First().OriginalSequence, ok2[0].SequenceVariations.First().OriginalSequence); Assert.AreEqual(ok[0].SequenceVariations.First().VariantSequence, ok2[0].SequenceVariations.First().VariantSequence); } diff --git a/mzLib/Test/DatabaseTests/TestVariantProtein.cs b/mzLib/Test/DatabaseTests/TestVariantProtein.cs index 8ced7b208..53bf78897 100644 --- a/mzLib/Test/DatabaseTests/TestVariantProtein.cs +++ b/mzLib/Test/DatabaseTests/TestVariantProtein.cs @@ -1,20 +1,21 @@ -using System; -using System.Collections; -using System.Collections.Generic; -using System.IO; -using System.Linq; +using Chemistry; +using MassSpectrometry; using NUnit.Framework; +using NUnit.Framework.Legacy; +using Omics; using Omics.BioPolymer; -using Assert = NUnit.Framework.Legacy.ClassicAssert; using Omics.Modifications; using Proteomics; using Proteomics.ProteolyticDigestion; +using System; +using System.Collections; +using System.Collections.Generic; +using System.IO; +using System.Linq; +using Transcriptomics; using UsefulProteomicsDatabases; +using Assert = NUnit.Framework.Legacy.ClassicAssert; using Stopwatch = System.Diagnostics.Stopwatch; -using Omics; -using Transcriptomics; -using MassSpectrometry; -using Chemistry; namespace Test.DatabaseTests { @@ -115,7 +116,7 @@ public static void SeqVarXmlTest() { Assert.AreEqual(s.OriginalSequence, decoy.BaseSequence.Substring(s.OneBasedBeginPosition - 1, s.OneBasedEndPosition - s.OneBasedBeginPosition + 1)); } - Assert.AreNotEqual(target.SequenceVariations.First().Description, decoy.SequenceVariations.First().Description); //decoys and target variations don't have the same desc. + Assert.AreNotEqual(target.SequenceVariations.First().VariantCallFormatDataString, decoy.SequenceVariations.First().VariantCallFormatDataString); //decoys and target variations don't have the same desc. List peptides = ok.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList(); } @@ -284,76 +285,258 @@ public static void HomozygousVariantsAtVariedDepths(string filename, int minVari var variantProteins = proteins[0].GetVariantBioPolymers(); List peptides = proteins.SelectMany(vp => vp.Digest(new DigestionParams(), null, null)).ToList(); } - [Test] public static void AppliedVariants() { - ModificationMotif.TryGetMotif("P", out ModificationMotif motifP); - Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary>(), null, null, null, null, null); + // This test verifies that applying sequence variations (SAV, MNV, insertion, deletion) + // produces the correct variant protein sequences, maps variant coordinates correctly, + // preserves per-variant metadata (AppliedSequenceVariations), and remains stable across: + // - repeated in-memory application, + // - round-tripping through XML (write → read). + // + // Additionally, it verifies that a modification attached to a variant (protein5) + // is persisted and localized at the expected one-based position after application + // and after XML reload. + // Define a simple P-specific modification used later to validate that modifications + // attached to a variant are preserved and localized correctly after applying variants + // and XML round-tripping. + ModificationMotif.TryGetMotif("P", out ModificationMotif motifP); + Modification mp = new Modification( + _originalId: "mod", + _accession: null, + _modificationType: "type", + _featureType: null, + _target: motifP, + _locationRestriction: "Anywhere.", + _chemicalFormula: null, + _monoisotopicMass: 42.01, + _databaseReference: new Dictionary>(), + _taxonomicRange: null, + _keywords: null, + _neutralLosses: null, + _diagnosticIons: null, + _fileOrigin: null); + + // Prepare five proteins that each have one sequence variation: + // 1) protein1: Single Amino-acid Variant (SAV) P→V at position 4 (4..4) + // 2) protein2: Multi-Nucleotide Variant (MNV) PT→KT spanning positions 4..5 + // 3) protein3: Insertion-like replacement: P→PPP at position 4 (longer variant) + // 4) protein4: Deletion-like replacement: PPP→P spanning 4..6 (shorter variant) + // 5) protein5: Same as (3) but with a modification attached at one-based index 5 + // to verify mod persistence/localization through variant application and XML. List proteinsWithSeqVars = new List { - new Protein("MPEPTIDE", "protein1", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein2", sequenceVariations: new List { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein3", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein5", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary> {{ 5, new[] { mp }.ToList() } }) }), - }; + new Protein("MPEPTIDE", "protein1", + sequenceVariations: new List + { + // SAV: P(4) -> V(4) + new SequenceVariation(4, 4, "P", "V", + @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) + }), + new Protein("MPEPTIDE", "protein2", + sequenceVariations: new List + { + // MNV: PT(4..5) -> KT(4..5) + new SequenceVariation(4, 5, "PT", "KT", + @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) + }), + new Protein("MPEPTIDE", "protein3", + sequenceVariations: new List + { + // Insertion-like: P(4) -> PPP(4..6) (length +2) + new SequenceVariation(4, 4, "P", "PPP", + @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) + }), + new Protein("MPEPPPTIDE", "protein4", + sequenceVariations: new List + { + // Deletion-like: PPP(4..6) -> P(4) (length -2) + new SequenceVariation(4, 6, "PPP", "P", + @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) + }), + new Protein("MPEPTIDE", "protein5", + sequenceVariations: new List + { + // Insertion-like with a downstream mod to verify mod localization at 5 + new SequenceVariation(4, 4, "P", "PPP", + @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", + new Dictionary> { { 5, new[] { mp }.ToList() } }) + }), + }; + + // Apply variants in memory twice; the output should be stable and identical. var proteinsWithAppliedVariants = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); // should be stable + + // Round-trip through XML: write the variant-bearing proteins (targets only) and read back. + // This ensures that variant application and metadata survive I/O and result identically. string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml"); ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteinsWithSeqVars, xml); var proteinsWithAppliedVariants3 = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un); + // Compare across all three sources: + // - [0]: in-memory 1 + // - [1]: in-memory 2 (should match [0]) + // - [2]: XML reload (should match [0] and [1]) var listArray = new[] { proteinsWithAppliedVariants, proteinsWithAppliedVariants2, proteinsWithAppliedVariants3 }; + + // Assert we always get exactly five variant proteins in the same order + // (SAV, MNV, insertion, deletion, insertion+mod). + Assert.AreEqual(5, proteinsWithAppliedVariants.Count, "Expected 5 applied variants (in-memory #1)."); + Assert.AreEqual(5, proteinsWithAppliedVariants2.Count, "Expected 5 applied variants (in-memory #2)."); + Assert.AreEqual(5, proteinsWithAppliedVariants3.Count, "Expected 5 applied variants (XML reload)."); + + // The expected sequences for each of the five applied variants, in order: + // 0: "MPEVTIDE" (SAV at 4: P->V) + // 1: "MPEKTIDE" (MNV at 4..5: PT->KT) + // 2: "MPEPPPTIDE" (Insertion-like at 4: P->PPP; length +2) + // 3: "MPEPTIDE" (Deletion-like at 4..6: PPP->P; length -2 from "MPEPPPTIDE") + // 4: "MPEPPPTIDE" (Insertion-like with mod at 5) for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++) { // sequences - Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence); - Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence); - Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence); - Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence); - Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence); - Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key); - + Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence, $"[{dbIdx}] SAV sequence mismatch"); + Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence, $"[{dbIdx}] MNV sequence mismatch"); + Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence, $"[{dbIdx}] insertion sequence mismatch"); + Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence, $"[{dbIdx}] deletion sequence mismatch"); + Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion+mod sequence mismatch"); + + // Confirm the modification attached to protein5 survives application and XML round-trip + Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] Expected exactly 1 mod on the insertion+mod variant"); + Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key, $"[{dbIdx}] Mod should localize to one-based position 5"); + // Sanity: ensure the residue under the mod is indeed P + Assert.AreEqual('P', listArray[dbIdx][4].BaseSequence[5 - 1], $"[{dbIdx}] Residue at mod position should be 'P'"); + + // SAV expectations: single-residue change; length unchanged; position 4 is 'V' + Assert.AreEqual(8, listArray[dbIdx][0].BaseSequence.Length, $"[{dbIdx}] SAV length should be unchanged"); + Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] SAV begin mismatch"); + Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] SAV end mismatch"); + Assert.AreEqual("P", listArray[dbIdx][0].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] SAV original should be 'P'"); + Assert.AreEqual("V", listArray[dbIdx][0].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] SAV variant should be 'V'"); + Assert.AreEqual('V', listArray[dbIdx][0].BaseSequence[4 - 1], $"[{dbIdx}] SAV residue at 4 should be 'V'"); + + // MNV expectations: multi-residue change; length unchanged; positions 4..5 become "KT" + Assert.AreEqual(8, listArray[dbIdx][1].BaseSequence.Length, $"[{dbIdx}] MNV length should be unchanged"); + Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] MNV begin mismatch"); + Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] MNV end mismatch"); + Assert.AreEqual("PT", listArray[dbIdx][1].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] MNV original should be 'PT'"); + Assert.AreEqual("KT", listArray[dbIdx][1].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] MNV variant should be 'KT'"); + Assert.AreEqual("KT", listArray[dbIdx][1].BaseSequence.Substring(4 - 1, 2), $"[{dbIdx}] MNV residues 4..5 should be 'KT'"); + + // insertion expectations: length grows by +2; positions 4..6 are "PPP" + Assert.AreEqual(10, listArray[dbIdx][2].BaseSequence.Length, $"[{dbIdx}] insertion length should be 8 + 2 = 10"); + Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] insertion begin mismatch"); + Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] insertion end mismatch"); + Assert.AreEqual("P", listArray[dbIdx][2].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] insertion original should be 'P'"); + Assert.AreEqual("PPP", listArray[dbIdx][2].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] insertion variant should be 'PPP'"); + Assert.AreEqual("PPP", listArray[dbIdx][2].BaseSequence.Substring(4 - 1, 3), $"[{dbIdx}] insertion residues 4..6 should be 'PPP'"); + + // deletion expectations: length shrinks by -2 relative to the starting "MPEPPPTIDE" (10 → 8) + // and positions collapse so that sequence returns to "MPEPTIDE". + Assert.AreEqual(8, listArray[dbIdx][3].BaseSequence.Length, $"[{dbIdx}] deletion length should be 10 - 2 = 8"); + Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition, $"[{dbIdx}] deletion begin mismatch"); + Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition, $"[{dbIdx}] deletion end mismatch (post-collapse)"); + Assert.AreEqual("PPP", listArray[dbIdx][3].AppliedSequenceVariations.Single().OriginalSequence, $"[{dbIdx}] deletion original should be 'PPP'"); + Assert.AreEqual("P", listArray[dbIdx][3].AppliedSequenceVariations.Single().VariantSequence, $"[{dbIdx}] deletion variant should be 'P'"); + + // For completeness, also assert the summarized begin/end expectations that the original test verified. // SAV Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition); Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition); - // MNV Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition); Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition); - // insertion Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition); Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition); - // deletion Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition); Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition); } - } + // Ensure exact stability across the three sources: + // - All sequences in in-memory #1 equal in-memory #2 and XML reload. + CollectionAssert.AreEqual( + proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(), + proteinsWithAppliedVariants2.Select(p => p.BaseSequence).ToList(), + "In-memory application should be stable across repeated calls"); + CollectionAssert.AreEqual( + proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(), + proteinsWithAppliedVariants3.Select(p => p.BaseSequence).ToList(), + "XML round-trip should preserve variant-applied sequences in the same order"); + } [Test] public static void AppliedVariants_AsIBioPolymer() { - ModificationMotif.TryGetMotif("P", out ModificationMotif motifP); - Modification mp = new Modification("mod", null, "type", null, motifP, "Anywhere.", null, 42.01, new Dictionary>(), null, null, null, null, null); + // PURPOSE + // This test mirrors "AppliedVariants" but exercises the IBioPolymer interface path. + // It validates, in detail: + // 1) Correct application of four variant types (SAV, MNV, insertion, deletion) to produce expected sequences. + // 2) Correct coordinates (begin/end), original/variant strings in AppliedSequenceVariations after application. + // 3) Stability of results across: + // - repeated in-memory variant application, + // - XML round-trip (write → read). + // 4) Modification propagation and localization for a variant carrying a downstream mod. + // 5) Distinguishing two variants with identical sequences by their modification state (index 2 vs 4). + // + // EXPECTATIONS SUMMARY + // - Variant sequences (in order): "MPEVTIDE", "MPEKTIDE", "MPEPPPTIDE", "MPEPTIDE", "MPEPPPTIDE". + // - AppliedSequenceVariations: + // SAV (idx 0): begin=4, end=4, original="P", variant="V", len=8 + // MNV (idx 1): begin=4, end=5, original="PT", variant="KT", len=8 + // Insertion (idx 2): begin=4, end=6, original="P", variant="PPP", len=10 + // Deletion (idx 3): begin=4, end=4, original="PPP", variant="P", len=8 + // Insert+Mod(idx 4): same sequence as insertion, plus exactly one mod at one-based index 5 targeting a 'P'. + // + // NOTE: Lists [0], [1], [2] below represent: + // [0] in-memory application (first call) + // [1] in-memory application (second call, should be identical/stable) + // [2] XML round-trip results, must match [0] and [1] + // Arrange: create a P-specific modification used to test propagation and localization + ModificationMotif.TryGetMotif("P", out ModificationMotif motifP); + Modification mp = new Modification( + _originalId: "mod", + _accession: null, + _modificationType: "type", + _featureType: null, + _target: motifP, + _locationRestriction: "Anywhere.", + _chemicalFormula: null, + _monoisotopicMass: 42.01, + _databaseReference: new Dictionary>(), + _taxonomicRange: null, + _keywords: null, + _neutralLosses: null, + _diagnosticIons: null, + _fileOrigin: null); + + // Arrange: build five proteins (as IBioPolymer) with one sequence variation each + // 1) SAV P(4)->V + // 2) MNV PT(4..5)->KT + // 3) Insertion-like P(4)->PPP(4..6) + // 4) Deletion-like PPP(4..6)->P(4) on a longer starting sequence + // 5) Insertion-like with a downstream mod at one-based index 5 List proteinsWithSeqVars = new List { - new Protein("MPEPTIDE", "protein1", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein2", sequenceVariations: new List { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein3", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), - new Protein("MPEPTIDE", "protein5", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary> {{ 5, new[] { mp }.ToList() } }) }), + new Protein("MPEPTIDE", "protein1", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "V", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), + new Protein("MPEPTIDE", "protein2", sequenceVariations: new List { new SequenceVariation(4, 5, "PT", "KT", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), + new Protein("MPEPTIDE", "protein3", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), + new Protein("MPEPPPTIDE", "protein4", sequenceVariations: new List { new SequenceVariation(4, 6, "PPP", "P", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", null) }), + new Protein("MPEPTIDE", "protein5", sequenceVariations: new List { new SequenceVariation(4, 4, "P", "PPP", @"1\t50000000\t.\tA\tG\t.\tPASS\tANN=G||||||||||||||||\tGT:AD:DP\t1/1:30,30:30", new Dictionary> { { 5, new[] { mp }.ToList() } }) }), }; + + // Act: apply variants in-memory twice (should be identical/stable) var proteinsWithAppliedVariants = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); - var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); // should be stable + var proteinsWithAppliedVariants2 = proteinsWithSeqVars.SelectMany(p => p.GetVariantBioPolymers()).ToList(); + + // Act: write to XML and load back; results should match in-memory outputs string xml = Path.Combine(TestContext.CurrentContext.TestDirectory, "AppliedVariants.xml"); ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteinsWithSeqVars, xml); var proteinsWithAppliedVariants3 = ProteinDbLoader.LoadProteinXML(xml, true, DecoyType.None, null, false, null, out var un); + // Group lists for uniform validation loops var listArray = new List[] { proteinsWithAppliedVariants, @@ -361,34 +544,114 @@ public static void AppliedVariants_AsIBioPolymer() proteinsWithAppliedVariants3.Cast().ToList() }; - for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++) + // Assert: each expansion produces exactly 5 variant biopolymers, in the same, predictable order + Assert.AreEqual(5, proteinsWithAppliedVariants.Count, "Expected 5 variants (in-memory 1)"); + Assert.AreEqual(5, proteinsWithAppliedVariants2.Count, "Expected 5 variants (in-memory 2)"); + Assert.AreEqual(5, proteinsWithAppliedVariants3.Count, "Expected 5 variants (XML reload)"); + + // Assert: stability across the three sources (same sequences, same order) + CollectionAssert.AreEqual( + proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(), + proteinsWithAppliedVariants2.Select(p => p.BaseSequence).ToList(), + "In-memory application should be stable across repeated calls"); + CollectionAssert.AreEqual( + proteinsWithAppliedVariants.Select(p => p.BaseSequence).ToList(), + proteinsWithAppliedVariants3.Select(p => p.BaseSequence).ToList(), + "XML round-trip should preserve variant-applied sequences in the same order"); + + // Assert: for all variants we expect exactly one applied sequence variation + foreach (var variants in listArray) { - // sequences - Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence); - Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence); - Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence); - Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence); - Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence); - Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key); - - // SAV - Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(4, listArray[dbIdx][0].AppliedSequenceVariations.Single().OneBasedEndPosition); - - // MNV - Assert.AreEqual(4, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(5, listArray[dbIdx][1].AppliedSequenceVariations.Single().OneBasedEndPosition); + Assert.AreEqual(1, variants[0].AppliedSequenceVariations.Count, "SAV must have exactly one applied variant"); + Assert.AreEqual(1, variants[1].AppliedSequenceVariations.Count, "MNV must have exactly one applied variant"); + Assert.AreEqual(1, variants[2].AppliedSequenceVariations.Count, "Insertion must have exactly one applied variant"); + Assert.AreEqual(1, variants[3].AppliedSequenceVariations.Count, "Deletion must have exactly one applied variant"); + Assert.AreEqual(1, variants[4].AppliedSequenceVariations.Count, "Insertion+Mod must have exactly one applied variant"); + } - // insertion - Assert.AreEqual(4, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(6, listArray[dbIdx][2].AppliedSequenceVariations.Single().OneBasedEndPosition); + // Per-list validation of sequence, coordinates, and (where appropriate) residue checks and mod checks + for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++) + { + // Assert: expected sequences in fixed order + Assert.AreEqual("MPEVTIDE", listArray[dbIdx][0].BaseSequence, $"[{dbIdx}] SAV sequence mismatch"); + Assert.AreEqual("MPEKTIDE", listArray[dbIdx][1].BaseSequence, $"[{dbIdx}] MNV sequence mismatch"); + Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][2].BaseSequence, $"[{dbIdx}] insertion sequence mismatch"); + Assert.AreEqual("MPEPTIDE", listArray[dbIdx][3].BaseSequence, $"[{dbIdx}] deletion sequence mismatch"); + Assert.AreEqual("MPEPPPTIDE", listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion+mod sequence mismatch"); + + // Assert: lengths (sanity for ins/del) + Assert.AreEqual(8, listArray[dbIdx][0].BaseSequence.Length, $"[{dbIdx}] SAV length should be unchanged"); + Assert.AreEqual(8, listArray[dbIdx][1].BaseSequence.Length, $"[{dbIdx}] MNV length should be unchanged"); + Assert.AreEqual(10, listArray[dbIdx][2].BaseSequence.Length, $"[{dbIdx}] insertion length should be 8 + 2 = 10"); + Assert.AreEqual(8, listArray[dbIdx][3].BaseSequence.Length, $"[{dbIdx}] deletion length should be 10 - 2 = 8"); + Assert.AreEqual(10, listArray[dbIdx][4].BaseSequence.Length, $"[{dbIdx}] insertion+mod length should be 10"); + + // SAV assertions: P(4)->V + var sav = listArray[dbIdx][0].AppliedSequenceVariations.Single(); + Assert.AreEqual(4, sav.OneBasedBeginPosition, $"[{dbIdx}] SAV begin"); + Assert.AreEqual(4, sav.OneBasedEndPosition, $"[{dbIdx}] SAV end"); + Assert.AreEqual("P", sav.OriginalSequence, $"[{dbIdx}] SAV original"); + Assert.AreEqual("V", sav.VariantSequence, $"[{dbIdx}] SAV variant"); + Assert.AreEqual('V', listArray[dbIdx][0].BaseSequence[3], $"[{dbIdx}] SAV residue at 4 must be 'V'"); + + // MNV assertions: PT(4..5)->KT + var mnv = listArray[dbIdx][1].AppliedSequenceVariations.Single(); + Assert.AreEqual(4, mnv.OneBasedBeginPosition, $"[{dbIdx}] MNV begin"); + Assert.AreEqual(5, mnv.OneBasedEndPosition, $"[{dbIdx}] MNV end"); + Assert.AreEqual("PT", mnv.OriginalSequence, $"[{dbIdx}] MNV original"); + Assert.AreEqual("KT", mnv.VariantSequence, $"[{dbIdx}] MNV variant"); + Assert.AreEqual("KT", listArray[dbIdx][1].BaseSequence.Substring(3, 2), $"[{dbIdx}] MNV residues 4..5 must be 'KT'"); + + // Insertion-like assertions: P(4)->PPP(4..6) + var ins = listArray[dbIdx][2].AppliedSequenceVariations.Single(); + Assert.AreEqual(4, ins.OneBasedBeginPosition, $"[{dbIdx}] insertion begin"); + Assert.AreEqual(6, ins.OneBasedEndPosition, $"[{dbIdx}] insertion end"); + Assert.AreEqual("P", ins.OriginalSequence, $"[{dbIdx}] insertion original"); + Assert.AreEqual("PPP", ins.VariantSequence, $"[{dbIdx}] insertion variant"); + Assert.AreEqual("PPP", listArray[dbIdx][2].BaseSequence.Substring(3, 3), $"[{dbIdx}] insertion residues 4..6 must be 'PPP'"); + + // Deletion-like assertions: PPP(4..6)->P(4) (collapses back to MPEPTIDE) + var del = listArray[dbIdx][3].AppliedSequenceVariations.Single(); + Assert.AreEqual(4, del.OneBasedBeginPosition, $"[{dbIdx}] deletion begin"); + Assert.AreEqual(4, del.OneBasedEndPosition, $"[{dbIdx}] deletion end (post-collapse)"); + Assert.AreEqual("PPP", del.OriginalSequence, $"[{dbIdx}] deletion original"); + Assert.AreEqual("P", del.VariantSequence, $"[{dbIdx}] deletion variant"); + + // Insertion + Modification assertions: identical sequence to insertion, plus one mod at pos 5 + var insMod = listArray[dbIdx][4].AppliedSequenceVariations.Single(); + Assert.AreEqual(4, insMod.OneBasedBeginPosition, $"[{dbIdx}] insertion+mod begin"); + Assert.AreEqual(6, insMod.OneBasedEndPosition, $"[{dbIdx}] insertion+mod end"); + Assert.AreEqual("P", insMod.OriginalSequence, $"[{dbIdx}] insertion+mod original"); + Assert.AreEqual("PPP", insMod.VariantSequence, $"[{dbIdx}] insertion+mod variant"); + + // Mods: Only the "insertion+mod" variant (index 4) carries a mod; all others should have none + // Confirm the variant 4 has exactly one mod at one-based position 5, and the residue at 5 is 'P' (motif). + if (dbIdx == 0 || dbIdx == 1 || dbIdx == 2) // only assert detailed mod state once per path for clarity + { + // Index 2 (plain insertion) must have no mods + Assert.AreEqual(0, listArray[dbIdx][2].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion should have no mods"); + + // Index 4 (insertion+mod) must have exactly one mod at site 5, targeting a P residue + Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion+mod should have exactly one site with mods"); + Assert.AreEqual(5, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Single().Key, $"[{dbIdx}] insertion+mod site should be one-based index 5"); + Assert.AreEqual('P', listArray[dbIdx][4].BaseSequence[5 - 1], $"[{dbIdx}] insertion+mod residue at site 5 must be 'P' (motif)"); + + // All other variants should have zero possible localized mods + Assert.AreEqual(0, listArray[dbIdx][0].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] SAV should have no mods"); + Assert.AreEqual(0, listArray[dbIdx][1].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] MNV should have no mods"); + Assert.AreEqual(0, listArray[dbIdx][3].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] deletion should have no mods"); + } + } - // deletion - Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedBeginPosition); - Assert.AreEqual(4, listArray[dbIdx][3].AppliedSequenceVariations.Single().OneBasedEndPosition); + // Additional cross-list checks: + // - The two "MPEPPPTIDE" variants (indices 2 and 4) should be sequence-identical, but differ by modification presence. + for (int dbIdx = 0; dbIdx < listArray.Length; dbIdx++) + { + Assert.AreEqual(listArray[dbIdx][2].BaseSequence, listArray[dbIdx][4].BaseSequence, $"[{dbIdx}] insertion and insertion+mod sequences must match"); + Assert.AreEqual(0, listArray[dbIdx][2].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion should remain unmodified"); + Assert.AreEqual(1, listArray[dbIdx][4].OneBasedPossibleLocalizedModifications.Count, $"[{dbIdx}] insertion+mod should remain modified"); } } - [Test] public static void CrashOnCreateVariantFromRNA() { @@ -401,43 +664,277 @@ public static void CrashOnCreateVariantFromRNA() proteins[0].CreateVariant(proteins[0].BaseSequence, rna, [], [], new Dictionary>(), ""); }); } - [Test] public static void StopGained() { - var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), true, - DecoyType.None, null, false, null, out var unknownModifications); + // PURPOSE + // This test validates correct handling of a stop-gained sequence variation (creation of a premature stop codon). + // Verifies two scenarios: + // 1) Default variant depth filtering: both reference (no variant applied) and alternate (stop-gained applied) proteins are emitted. + // 2) High min-allele-depth threshold: only the stop-gained (applied) protein is retained. + // + // EXPECTATIONS SUMMARY (based on the StopGained.xml test data): + // - Two proteins are initially produced (reference + variant-applied): + // [0] Reference protein: + // * 1 sequence variation present in metadata, but 0 applied (reference form). + // * BaseSequence length = 191. + // * Residue at one-based position 161 equals 'Q' (so zero-based index 160 is 'Q'). + // [1] Variant-applied protein (stop-gained): + // * Exactly 1 applied variation. + // * BaseSequence length truncated to 161 - 1 = 160 (stop codon at 161 shortens the sequence). + // * The sequence is exactly the prefix of the reference up to length 160. + // * No '*' appears in the resulting BaseSequence (the stop codon is not a literal character in the sequence). + // * The applied variant's VariantSequence ends with '*'. + // * The applied variant one-based begin (and end) position is 161. + // + // - With minAlleleDepth: 400 + // * Only the variant-applied protein is returned. + // * It retains the same applied-variation and truncated length expectations. + + // Load the proteins with default filtering + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), + true, // generateTargets + DecoyType.None, // decoyType + null, // allKnownModifications + false, // isContaminant + null, // modTypesToExclude + out var unknownModifications); + + // Sanity: Decoys are not requested + Assert.IsTrue(proteins.All(p => !p.IsDecoy), "No decoys expected when using DecoyType.None"); + + // Expect exactly two proteins: reference (no applied variant) and stop-gained (applied) + Assert.AreEqual(2, proteins.Count, "Expected reference and stop-gained variant proteins"); + // Reference protein metadata: one possible sequence variation in the record + Assert.AreEqual(1, proteins[0].SequenceVariations.Count(), "Reference metadata should contain one sequence variation"); + Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Reference metadata should contain exactly one unique sequence variation"); + // Reference should have zero applied variations (reference form retained) + Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Count(), "Reference protein should not have an applied sequence variation"); + Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Reference applied variations should be zero"); + + // Variant-applied protein: applied variation present and unique + Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Count(), "Variant protein should have exactly one applied variation"); + Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Variant protein applied variations should be unique"); + + // Reference length and residue checks around the stop site + Assert.AreEqual(191, proteins[0].Length, "Reference protein length should match source data"); + Assert.AreEqual('Q', proteins[0][161 - 1], "Reference residue at 161 should be 'Q' prior to stop-gain"); + + // Variant length must be truncated by the stop at position 161 → length becomes 160 + Assert.AreEqual(161 - 1, proteins[1].Length, "Variant protein length should be truncated to 160 due to stop at 161"); + + // The variant BaseSequence must be exactly the prefix of the reference up to the stop position - 1 + string reference = proteins[0].BaseSequence; + string variant = proteins[1].BaseSequence; + Assert.IsTrue(reference.StartsWith(variant), "Variant sequence must be a prefix of the reference sequence"); + Assert.AreEqual(reference.Substring(0, 161 - 1), variant, "Variant sequence must equal reference[0..159]"); + + // Ensure we did not write literal '*' into the protein sequence; stop codon is represented by truncation instead + Assert.AreEqual(-1, variant.IndexOf('*'), "Variant BaseSequence should not contain a literal '*'"); + + // Verify applied-variant details for the stop-gained protein + var applied = proteins[1].AppliedSequenceVariations.Single(); + Assert.IsTrue(applied.VariantSequence.EndsWith("*"), "Stop-gained variant must end with '*'"); + Assert.AreEqual(161, applied.OneBasedBeginPosition, "Stop-gained begins at residue 161"); + Assert.AreEqual(161, applied.OneBasedEndPosition, "Stop-gained ends at residue 161 (single residue change)"); + + // The two forms must differ in sequence length (as a sanity check) + Assert.AreNotEqual(proteins[0].Length, proteins[1].Length, "Reference and variant proteins should differ in length"); + + // Now require a higher min-allele-depth; expect only the variant-applied protein retained + proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), + true, // generateTargets + DecoyType.None, // decoyType + null, // allKnownModifications + false, // isContaminant + null, // modTypesToExclude + out unknownModifications, + minAlleleDepth: 400); + + // Only the stop-gained, variant-applied form is retained under a strict depth threshold + Assert.AreEqual(1, proteins.Count, "High min-allele-depth should retain only the variant-applied protein"); + Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Count(), "Variant-applied protein should still have one applied variation"); + Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count(), "Variant-applied unique variation should be retained"); + Assert.AreEqual(161 - 1, proteins[0].Length, "Variant-applied protein length should remain truncated to 160"); + + // Confirm stability: the single protein from the depth-filtered load matches the previously observed variant sequence + Assert.AreEqual(variant, proteins[0].BaseSequence, "Depth-filtered variant sequence should match previously observed variant"); + + // Re-check applied-variant semantics after filtering for completeness + var appliedAfterFilter = proteins[0].AppliedSequenceVariations.Single(); + Assert.IsTrue(appliedAfterFilter.VariantSequence.EndsWith("*"), "Stop-gained variant must end with '*' (after filtering)"); + Assert.AreEqual(161, appliedAfterFilter.OneBasedBeginPosition, "Stop-gained begins at residue 161 (after filtering)"); + Assert.AreEqual(161, appliedAfterFilter.OneBasedEndPosition, "Stop-gained ends at residue 161 (after filtering)"); + } + [Test] + public static void StopGained_TruncationIsPrefixAndNoOutOfBoundsAnnotations() + { + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), + true, DecoyType.None, null, false, null, out var _); + Assert.AreEqual(2, proteins.Count); - Assert.AreEqual(1, proteins[0].SequenceVariations.Count()); // some redundant - Assert.AreEqual(1, proteins[0].SequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes - Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Count()); // some redundant - Assert.AreEqual(0, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes - Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Count()); // some redundant - Assert.AreEqual(1, proteins[1].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes - Assert.AreEqual(191, proteins[0].Length); - Assert.AreEqual('Q', proteins[0][161 - 1]); - Assert.AreEqual(161 - 1, proteins[1].Length); - Assert.AreNotEqual(proteins[0].Length, proteins[1].Length); + var reference = proteins[0]; + var truncated = proteins[1]; - proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), true, - DecoyType.None, null, false, null, out unknownModifications, minAlleleDepth: 400); - Assert.AreEqual(1, proteins.Count); - Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Count()); // some redundant - Assert.AreEqual(1, proteins[0].AppliedSequenceVariations.Select(v => v.SimpleString()).Distinct().Count()); // unique changes - Assert.AreEqual(161 - 1, proteins[0].Length); + // The truncated sequence must be a prefix of the reference, + // i.e., identical up to the truncation point. + Assert.That(reference.BaseSequence.StartsWith(truncated.BaseSequence)); + + // Any possible localized modifications must not point past the truncation boundary. + Assert.That(truncated.OneBasedPossibleLocalizedModifications + .All(kv => kv.Key >= 1 && kv.Key <= truncated.Length)); + + // Any proteolysis products (if present) must not reference indices outside the sequence. + Assert.That(truncated.TruncationProducts.All(tp => + (!tp.OneBasedBeginPosition.HasValue || (tp.OneBasedBeginPosition.Value >= 1 && tp.OneBasedBeginPosition.Value <= truncated.Length)) && + (!tp.OneBasedEndPosition.HasValue || (tp.OneBasedEndPosition.Value >= 1 && tp.OneBasedEndPosition.Value <= truncated.Length)))); + + // The applied stop-gained variation often encodes a '*' in the variant sequence. + // If present, that indicates stop; the actual sequence is cut at the stop. + if (truncated.AppliedSequenceVariations.Any()) + { + Assert.That(truncated.AppliedSequenceVariations.Single().VariantSequence.EndsWith("*") || + !truncated.AppliedSequenceVariations.Single().VariantSequence.Contains("*")); + } } + [Test] + public static void StopGained_RoundTripSerializationPreservesTruncation() + { + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), + true, DecoyType.None, null, false, null, out var _); + + Assert.AreEqual(2, proteins.Count); + var tempPath = Path.Combine(TestContext.CurrentContext.TestDirectory, $"StopGained_roundtrip_{Guid.NewGuid()}.xml"); + try + { + // Persist both proteins (reference + variant) and reload. + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins, tempPath); + var roundtrip = ProteinDbLoader.LoadProteinXML(tempPath, true, DecoyType.None, null, false, null, out var __); + + // Round-trip preserves count and the truncation boundary for the variant-applied protein. + Assert.AreEqual(2, roundtrip.Count); + Assert.AreEqual(proteins[0].Length, roundtrip[0].Length); + Assert.AreEqual(proteins[1].Length, roundtrip[1].Length); + Assert.AreEqual(proteins[1].AppliedSequenceVariations.Count(), roundtrip[1].AppliedSequenceVariations.Count()); + } + finally + { + if (File.Exists(tempPath)) + { + File.SetAttributes(tempPath, FileAttributes.Normal); + File.Delete(tempPath); + } + } + } [Test] - public static void StopGainedDecoysAndDigestion() + public static void StopGained_NoPeptidesCrossTruncationSite() { - // test decoys and digestion - var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGain.xml"), true, - DecoyType.Reverse, null, false, null, out var unknownModifications, minAlleleDepth: 400); + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGained.xml"), + true, DecoyType.None, null, false, null, out var _); + Assert.AreEqual(2, proteins.Count); - var targetPeps = proteins[0].Digest(new DigestionParams(), null, null).ToList(); - var decoyPeps = proteins[1].Digest(new DigestionParams(), null, null).ToList(); - //Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length)); - //Assert.AreEqual(targetPeps.Count, decoyPeps.Count); + var reference = proteins[0]; + var truncated = proteins[1]; + + // Peptides from the truncated protein must not reference indices past the truncation boundary. + var dp = new DigestionParams(); + var variantPeps = truncated.Digest(dp, null, null).ToList(); + Assert.That(variantPeps.All(p => p.OneBasedEndResidueInProtein <= truncated.Length)); + + // Any peptide in the reference that extends past the truncation boundary cannot exist in the variant. + var refPeps = reference.Digest(dp, null, null).ToList(); + var refCrossing = refPeps.Where(p => p.OneBasedEndResidueInProtein > truncated.Length).ToList(); + var variantPepWindows = new HashSet<(int start, int end)>(variantPeps.Select(p => (p.OneBasedStartResidueInProtein, p.OneBasedEndResidueInProtein))); + Assert.That(refCrossing.All(p => !variantPepWindows.Contains((p.OneBasedStartResidueInProtein, p.OneBasedEndResidueInProtein)))); + } + [Test] + public static void StopGainedDecoysAndDigestion() + { + // PURPOSE + // This test ensures that: + // 1) Reverse decoys are generated for a stop-gained (truncated) target protein. + // 2) The target and its decoy digest into peptides without referencing residues outside their sequence bounds. + // 3) Basic invariants hold for decoy generation (count/order/length/decoy flag). + // + // CONTEXT + // - Database: "StopGain.xml" contains a target protein with a stop-gained variant that truncates the sequence. + // - Decoys: Generated using DecoyType.Reverse (sequence reversal-based decoys). + // - minAlleleDepth: 400 ensures the stop-gained variant is applied (truncated target). + // + // EXPECTATIONS + // - Exactly 2 proteins are returned: [0] target (non-decoy) + [1] decoy (IsDecoy = true). + // - Target and decoy lengths are equal (reverse decoys preserve length). + // - Both target and decoy produce peptides via digestion. + // - No peptide references indices outside its parent protein's 1..Length range. + // - Accessions reflect decoy generation (decoy starts with the default "DECOY_"). + // - If variant(s) are present, their counts match between target and decoy. + + // Arrange: Load a variant-applied protein set and reverse decoy pair from StopGain.xml. + // Using a strict minAlleleDepth applies the stop-gained variant to the target. + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "StopGain.xml"), + true, // generateTargets + DecoyType.Reverse, // generate reverse-sequence decoys + null, // allKnownModifications + false, // isContaminant + null, // modTypesToExclude + out var unknownModifications, + minAlleleDepth: 400); // force applying the stop-gained variant + + // Assert: We expect exactly two proteins: target then its decoy. + Assert.AreEqual(2, proteins.Count, "Expected 1 target + 1 decoy"); + Assert.IsFalse(proteins[0].IsDecoy, "First protein should be the target (non-decoy)"); + Assert.IsTrue(proteins[1].IsDecoy, "Second protein should be the decoy"); + Assert.That(proteins[1].Accession.StartsWith("DECOY_"), "Decoy accession should start with the default decoy identifier"); + + // Assert: Reverse decoys should preserve sequence length. + Assert.AreEqual(proteins[0].Length, proteins[1].Length, "Target and decoy should have identical lengths"); + // In general, target and decoy sequences should not be byte-identical. + Assert.AreNotEqual(proteins[0].BaseSequence, proteins[1].BaseSequence, "Decoy sequence should differ from target sequence"); + + // If the stop-gained variant is applied to the target, the decoy should carry a corresponding variant count. + // We do not enforce exact mapping positions here, only that counts match if any are present. + if (proteins[0].AppliedSequenceVariations.Any() || proteins[1].AppliedSequenceVariations.Any()) + { + Assert.AreEqual( + proteins[0].AppliedSequenceVariations.Count(), + proteins[1].AppliedSequenceVariations.Count(), + "Target and decoy should carry the same number of applied sequence variations"); + } + + // Act: Digest both target and decoy using default digestion parameters (typically trypsin-like rules). + var dp = new DigestionParams(); + var targetPeps = proteins[0].Digest(dp, null, null).ToList(); + var decoyPeps = proteins[1].Digest(dp, null, null).ToList(); + + // Assert: Both should yield peptides. + Assert.That(targetPeps.Count > 0, "Target digestion should produce peptides"); + Assert.That(decoyPeps.Count > 0, "Decoy digestion should produce peptides"); + + // Assert: No peptide references residues outside the corresponding protein bounds. + Assert.That(targetPeps.All(p => + p.OneBasedStartResidueInProtein >= 1 && + p.OneBasedEndResidueInProtein <= proteins[0].Length), + "All target peptides must fall within target bounds"); + + Assert.That(decoyPeps.All(p => + p.OneBasedStartResidueInProtein >= 1 && + p.OneBasedEndResidueInProtein <= proteins[1].Length), + "All decoy peptides must fall within decoy bounds"); + + // Note: + // We intentionally do NOT assert the number of peptides or the sum of peptide lengths to be equal between + // target and decoy. Even with reverse decoys, tryptic cleavage context differs and may alter cleavage patterns. + // The commented lines below are often too strict and can fail legitimately: + // + // Assert.AreEqual(targetPeps.Sum(p => p.Length), decoyPeps.Sum(p => p.Length)); + // Assert.AreEqual(targetPeps.Count, decoyPeps.Count); } [Test] @@ -490,7 +987,7 @@ public void VariantSymbolWeirdnessXml() string file = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "SeqVarSymbolWeirdness.xml"); List variantProteins = ProteinDbLoader.LoadProteinXML(file, true, DecoyType.None, null, false, null, out var un); Assert.AreEqual(12, variantProteins.First().ConsensusVariant.SequenceVariations.Count()); - Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.Description.Heterozygous.Any(kv => kv.Value))); + Assert.AreEqual(2, variantProteins.First().ConsensusVariant.SequenceVariations.Count(v => v.VariantCallFormatDataString.Heterozygous.Any(kv => kv.Value))); Assert.AreEqual(1, variantProteins.Count); // Should be 2^2 from combinitorics of heterozygous, but the giant indels overwrite them Assert.AreEqual(0, variantProteins.Where(v => v.BaseSequence == variantProteins.First().ConsensusVariant.BaseSequence).Count()); // Homozygous variations are included @@ -560,21 +1057,159 @@ public void IndelDecoyVariants() [Test] public void SequenceVariationIsValidTest() { - SequenceVariation sv1 = new SequenceVariation(10, 10, "A", "T", "info", null); - SequenceVariation sv2 = new SequenceVariation(5, 5, "G", "C", "info", null); - SequenceVariation sv3 = new SequenceVariation(8, 8, "T", "A", "info", null); + // PURPOSE + // Validate the minimal, position-only "validity" rules implemented by SequenceVariation.AreValid(): + // AreValid() == (OneBasedBeginPosition > 0) && (OneBasedEndPosition >= OneBasedBeginPosition) + // + // We cover: + // 1) Explicit begin/end ctor with typical point mutations → valid. + // 2) Explicit begin/end ctor with invalid positions → invalid. + // 3) Explicit begin/end ctor representing insertion/deletion edge-cases → valid as long as positions are valid. + // 4) One-position convenience ctor behavior for different originalSequence values (null, "", length > 0). + // - This ctor derives end as: end = (original == null) ? begin : begin + original.Length - 1. + // - Therefore, empty originalSequence "" makes end = begin - 1 → invalid by design. + // 5) Content fields (Original/Variant) and OneBasedModifications do NOT affect AreValid(), only positions do. + // 6) Optional sanity checks on derived fields (SimpleString and computed end position). + + // ----------------------------- + // 1) Valid: explicit begin/end point mutations (begin == end, begin > 0) + // ----------------------------- + SequenceVariation sv1 = new SequenceVariation( + oneBasedBeginPosition: 10, oneBasedEndPosition: 10, + originalSequence: "A", variantSequence: "T", + description: "info", oneBasedModifications: null); + SequenceVariation sv2 = new SequenceVariation( + oneBasedBeginPosition: 5, oneBasedEndPosition: 5, + originalSequence: "G", variantSequence: "C", + description: "info", oneBasedModifications: null); + SequenceVariation sv3 = new SequenceVariation( + oneBasedBeginPosition: 8, oneBasedEndPosition: 8, + originalSequence: "T", variantSequence: "A", + description: "info", oneBasedModifications: null); + + // A protein can carry multiple variations; positions alone determine validity. List svList = new List { sv1, sv2, sv3 }; - Protein variantProtein = new Protein("ACDEFGHIKLMNPQRSTVWY", "protein1", sequenceVariations: svList); - Assert.IsTrue(variantProtein.SequenceVariations.All(v => v.AreValid())); - SequenceVariation svInvalidOneBasedBeginLessThanOne = new SequenceVariation(0, 10, "A", "T", "info", null); - SequenceVariation svInvalidOneBasedEndLessThanOneBasedBegin = new SequenceVariation(5, 4, "G", "C", "info", null); - SequenceVariation svValidOriginalSequenceIsEmpty = new SequenceVariation(8, 8, "", "A", "info", null); - SequenceVariation svValidVariantSequenceLenthIsZero = new SequenceVariation(10, 10, "A", "", "info", null); - Assert.IsFalse(svInvalidOneBasedBeginLessThanOne.AreValid()); - Assert.IsFalse(svInvalidOneBasedEndLessThanOneBasedBegin.AreValid()); - Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid()); //This is valid because it is an insertion - Assert.IsTrue(svValidVariantSequenceLenthIsZero.AreValid()); // This is valid because it is a deletion + + // Expectation: all three above are valid (begin > 0 and end == begin). + Assert.IsTrue(variantProtein.SequenceVariations.All(v => v.AreValid()), "All explicit point mutations with valid positions should be valid"); + + // ----------------------------- + // 2) Invalid: begin < 1 and end < begin + // ----------------------------- + SequenceVariation svInvalidOneBasedBeginLessThanOne = new SequenceVariation( + oneBasedBeginPosition: 0, oneBasedEndPosition: 10, + originalSequence: "A", variantSequence: "T", + description: "info", oneBasedModifications: null); + Assert.IsFalse(svInvalidOneBasedBeginLessThanOne.AreValid(), "Begin position must be >= 1"); + + SequenceVariation svInvalidOneBasedEndLessThanOneBasedBegin = new SequenceVariation( + oneBasedBeginPosition: 5, oneBasedEndPosition: 4, + originalSequence: "G", variantSequence: "C", + description: "info", oneBasedModifications: null); + Assert.IsFalse(svInvalidOneBasedEndLessThanOneBasedBegin.AreValid(), "End position cannot be less than begin position"); + + // ----------------------------- + // 3) Explicit begin/end edge-cases: insertion and deletion modeled by content only + // NOTE: AreValid ignores Original/Variant content; only positions matter. + // ----------------------------- + // Insertion-like (explicit): original is empty (""), variant has content. + // Valid because we explicitly supply begin == end (positions are valid). + SequenceVariation svValidOriginalSequenceIsEmpty = new SequenceVariation( + oneBasedBeginPosition: 8, oneBasedEndPosition: 8, + originalSequence: "", variantSequence: "A", + description: "info", oneBasedModifications: null); + Assert.IsTrue(svValidOriginalSequenceIsEmpty.AreValid(), "Explicit insertion with valid positions should be considered valid"); + + // Deletion-like (explicit): variant is empty (""), original has content, positions still valid. + SequenceVariation svValidVariantSequenceLengthIsZero = new SequenceVariation( + oneBasedBeginPosition: 10, oneBasedEndPosition: 10, + originalSequence: "A", variantSequence: "", + description: "info", oneBasedModifications: null); + Assert.IsTrue(svValidVariantSequenceLengthIsZero.AreValid(), "Explicit deletion with valid positions should be considered valid"); + + // ----------------------------- + // 4) One-position convenience ctor behavior for originalSequence values + // ctor: SequenceVariation(int oneBasedPosition, string originalSequence, string variantSequence, string description, ...) + // end is computed as: + // - if original == null → end = begin + // - else → end = begin + original.Length - 1 + // ----------------------------- + + // 4a) originalSequence has length 1 → end == begin (valid) + var svPosCtorLength1 = new SequenceVariation( + oneBasedPosition: 15, + originalSequence: "K", + variantSequence: "R", + description: "pos-ctor length 1"); + Assert.AreEqual(15, svPosCtorLength1.OneBasedBeginPosition); + Assert.AreEqual(15, svPosCtorLength1.OneBasedEndPosition, "With original length 1, end should equal begin"); + Assert.IsTrue(svPosCtorLength1.AreValid(), "Single-site variation via position ctor should be valid"); + + // 4b) originalSequence has length > 1 → end == begin + len - 1 (valid) + var svPosCtorLength3 = new SequenceVariation( + oneBasedPosition: 20, + originalSequence: "PEP", // len = 3 + variantSequence: "AAA", // content irrelevant to AreValid + description: "pos-ctor length 3"); + Assert.AreEqual(20, svPosCtorLength3.OneBasedBeginPosition); + Assert.AreEqual(22, svPosCtorLength3.OneBasedEndPosition, "End should be begin + original.Length - 1"); + Assert.IsTrue(svPosCtorLength3.AreValid(), "Multi-length replacement with valid positions should be valid"); + + // 4c) originalSequence == null → end = begin (valid) + var svPosCtorNullOriginal = new SequenceVariation( + oneBasedPosition: 30, + originalSequence: null, // special case handled in ctor: end = begin + variantSequence: "A", + description: "pos-ctor null original"); + Assert.AreEqual(30, svPosCtorNullOriginal.OneBasedBeginPosition); + Assert.AreEqual(30, svPosCtorNullOriginal.OneBasedEndPosition, "Null original should set end = begin"); + Assert.IsTrue(svPosCtorNullOriginal.AreValid(), "Null original via position ctor is treated as length 1 (valid)"); + + // 4d) originalSequence == "" (empty) → end = begin - 1 (invalid by design) + // This models an insertion if you rely solely on the position ctor, but produces end < begin → invalid. + // For insertions, prefer the explicit begin/end ctor with valid positions (see 3). + var svPosCtorEmptyOriginal = new SequenceVariation( + oneBasedPosition: 40, + originalSequence: "", // empty → end = 39 + variantSequence: "A", + description: "pos-ctor empty original"); + Assert.AreEqual(40, svPosCtorEmptyOriginal.OneBasedBeginPosition); + Assert.AreEqual(39, svPosCtorEmptyOriginal.OneBasedEndPosition, "Empty original sets end = begin - 1"); + Assert.IsFalse(svPosCtorEmptyOriginal.AreValid(), "Position ctor with empty original is invalid (end < begin)"); + + // ----------------------------- + // 5) Validity is position-only; content and mods do not change AreValid() + // - VariantSequence null is normalized to "" in the ctor. + // - OneBasedModifications is stored but ignored by AreValid(). + // ----------------------------- + var mods = new Dictionary> + { + { 1, new List() } // empty mod list at a site; still ignored by AreValid + }; + var svContentIrrelevant = new SequenceVariation( + oneBasedBeginPosition: 3, oneBasedEndPosition: 3, + originalSequence: "M", variantSequence: null, // becomes "" + description: "mods/variant null test", oneBasedModifications: mods); + Assert.IsTrue(svContentIrrelevant.AreValid(), "Null variant and/or mods should not affect positional validity"); + Assert.AreEqual("", svContentIrrelevant.VariantSequence, "Null VariantSequence is normalized to empty string"); + + // ----------------------------- + // 6) Sanity: SimpleString format and positive bounds at the edge of sequence + // ----------------------------- + var svSimple = new SequenceVariation( + oneBasedBeginPosition: 1, oneBasedEndPosition: 1, + originalSequence: "A", variantSequence: "V", + description: "simple"); + // SimpleString = Original + Begin + Variant (no delimiter) + Assert.AreEqual("A1V", svSimple.SimpleString(), "SimpleString should concatenate original + begin + variant"); + + // Additional guard: begin == 1 is valid if end >= begin + var svAtStart = new SequenceVariation( + oneBasedBeginPosition: 1, oneBasedEndPosition: 2, + originalSequence: "MA", variantSequence: "MV", + description: "range at start"); + Assert.IsTrue(svAtStart.AreValid(), "Ranges that start at 1 are valid provided end >= begin"); } [Test] public void VariantModificationTest() @@ -644,54 +1279,294 @@ public void WriteProteinXmlWithVariantsDiscoveredAsModifications2() Assert.That(newProtein.SequenceVariations.Count, Is.EqualTo(totalSequenceVariations + 1)); //This number increases by 1 because we added a sequence variation that was discovered as a modification Assert.AreEqual(0,newProtein.OneBasedPossibleLocalizedModifications.Count); //This number should be 0 because we converted the modification to a sequence variation } - + /// + /// PURPOSE + /// Ensures that variant proteins are automatically generated during XML read, both for targets and reverse decoys. + /// The database "humanGAPDH.xml" encodes two single-nucleotide substitutions on the target protein P04406: + /// - A22G + /// - K251N + /// + /// EXPECTATIONS + /// - Loader emits all combinatorial target variants derived from those two changes: + /// [0] Reference (no variants applied) → Accession "P04406" + /// [1] Single variant A22G → Accession "P04406_A22G" + /// [2] Single variant K251N → Accession "P04406_K251N" + /// [3] Double variant K251N + A22G (combined) → Accession "P04406_K251N_A22G" + /// - With DecoyType.Reverse, a matching set of 4 reverse decoys is produced with mirrored coordinates: + /// [4] Decoy of reference → "DECOY_P04406" + /// [5] Decoy with mirrored A22G (mapped to site 315) → "DECOY_P04406_A315G" + /// [6] Decoy with mirrored K251N (mapped to site 86) → "DECOY_P04406_K86N" + /// [7] Decoy with both mirrored variants → "DECOY_P04406_K86N_A315G" + /// + /// WHY THIS MATTERS + /// - Validates that the reader expands sequence variation definitions into concrete variant proteins. + /// - Verifies decoy generation mirrors variant coordinates appropriately and preserves ordering. + /// - Guards against regressions in accession naming, variant application counts, and decoy parity. + /// + /// PARAMETERS PASSED TO LOADER + /// - generateTargets: true → emit target proteins + /// - decoyType: Reverse → also emit reverse decoys + /// - allKnownModifications: UniProtPtms → resolve any UniProt-annotated PTMs so no "unknown" mods remain + /// - isContaminant: false + /// - modTypesToExclude: null + /// - out unknownModifications → capture any unrecognized mods (should be empty when UniProtPtms is provided) + /// - minAlleleDepth: 1 → do not filter out these low-depth test variants + /// - maxHeterozygousVariants: 99 → allow generating all combinations from the two sites (up to 2^2) + /// [Test] public static void TestThatProteinVariantsAreGeneratedDuringRead() { + // Arrange: load a target with two site-specific variants and request reverse decoys as well. + // IMPORTANT: Provide UniProtPtms so annotations in humanGAPDH.xml resolve and do not end up in unknownModifications. string databaseName = "humanGAPDH.xml"; - var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, - DecoyType.Reverse, null, false, null, out var unknownModifications, 1, 99); - Assert.AreEqual(8, proteins.Count); // 4 target + 4 decoy - Assert.AreEqual(2, proteins[0].SequenceVariations.Count()); // these sequence variations were in the original - Assert.That("P04406", Is.EqualTo(proteins[0].Accession)); - Assert.That("P04406_A22G", Is.EqualTo(proteins[1].Accession)); - Assert.That("P04406_K251N", Is.EqualTo(proteins[2].Accession)); - Assert.That("P04406_K251N_A22G", Is.EqualTo(proteins[3].Accession)); - Assert.That("DECOY_P04406", Is.EqualTo(proteins[4].Accession)); - Assert.That("DECOY_P04406_A315G", Is.EqualTo(proteins[5].Accession)); - Assert.That("DECOY_P04406_K86N", Is.EqualTo(proteins[6].Accession)); - Assert.That("DECOY_P04406_K86N_A315G", Is.EqualTo(proteins[7].Accession)); + var proteins = ProteinDbLoader.LoadProteinXML( + Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), + generateTargets: true, + decoyType: DecoyType.Reverse, + allKnownModifications: UniProtPtms, // CHANGED: was null; supplying known PTMs prevents unknownModifications from being populated + isContaminant: false, + modTypesToExclude: null, + unknownModifications: out var unknownModifications, + minAlleleDepth: 1, + maxHeterozygousVariants: 99); + + // Basic shape: 4 targets + 4 reverse decoys in a deterministic order. + Assert.AreEqual(8, proteins.Count, "Expected 4 targets and 4 decoys in a fixed order"); + + // Targets/decoys split and flags should be consistent and easy to reason about. + Assert.AreEqual(4, proteins.Count(p => !p.IsDecoy), "First half should be targets"); + Assert.AreEqual(4, proteins.Count(p => p.IsDecoy), "Second half should be decoys"); + for (int i = 0; i < 4; i++) + { + Assert.IsFalse(proteins[i].IsDecoy, $"Index {i} should be a target"); + Assert.IsTrue(proteins[i + 4].IsDecoy, $"Index {i + 4} should be a decoy"); + } + + // The reference target (index 0) carries two possible sequence variations in its metadata. + // This documents the test input and ensures the reader surfaced them. + Assert.AreEqual(2, proteins[0].SequenceVariations.Count(), "Reference should advertise exactly two possible sequence variations"); + + // Accessions must match exact, canonical variant labeling and order for both targets and decoys. + Assert.That("P04406", Is.EqualTo(proteins[0].Accession), "Reference target accession mismatch"); + Assert.That("P04406_A22G", Is.EqualTo(proteins[1].Accession), "Single-variant (A22G) target accession mismatch"); + Assert.That("P04406_K251N", Is.EqualTo(proteins[2].Accession), "Single-variant (K251N) target accession mismatch"); + Assert.That("P04406_K251N_A22G", Is.EqualTo(proteins[3].Accession), "Double-variant target accession mismatch"); + + Assert.That("DECOY_P04406", Is.EqualTo(proteins[4].Accession), "Reference decoy accession mismatch"); + Assert.That("DECOY_P04406_A315G", Is.EqualTo(proteins[5].Accession), "Decoy accession for mirrored A22G mismatch"); + Assert.That("DECOY_P04406_K86N", Is.EqualTo(proteins[6].Accession), "Decoy accession for mirrored K251N mismatch"); + Assert.That("DECOY_P04406_K86N_A315G", Is.EqualTo(proteins[7].Accession), "Decoy accession for double-variant mismatch"); + + // Sanity: accessions are non-empty and unique (avoid accidental duplication/shuffling). + Assert.That(proteins.All(p => !string.IsNullOrWhiteSpace(p.Accession)), "All proteins must have non-empty accessions"); + Assert.AreEqual(proteins.Count, proteins.Select(p => p.Accession).Distinct().Count(), "Accessions must be unique"); + + // Each decoy should be length-equal to its corresponding target, but usually sequence-different (reverse). + for (int i = 0; i < 4; i++) + { + Assert.AreEqual(proteins[i].Length, proteins[i + 4].Length, $"Target/decoy length should match for index {i}"); + Assert.AreNotEqual(proteins[i].BaseSequence, proteins[i + 4].BaseSequence, $"Decoy sequence should differ from its target for index {i}"); + } + + // Applied variant counts (how many variations were actually realized in this protein instance): + // Targets: [0] ref=0, [1] A22G=1, [2] K251N=1, [3] both=2 + // Decoys: [4] ref=0, [5] A315G=1, [6] K86N=1, [7] both=2 + int[] expectedAppliedCounts = { 0, 1, 1, 2, 0, 1, 1, 2 }; + for (int i = 0; i < proteins.Count; i++) + { + Assert.AreEqual(expectedAppliedCounts[i], proteins[i].AppliedSequenceVariations.Count(), + $"Applied variant count mismatch at index {i} ({proteins[i].Accession})"); + } + + // The specific applied-variant labels should match accessions: + // - For targets: "A22G" and/or "K251N" + // - For decoys: mirrored positions → "A315G" and/or "K86N" + static HashSet AppliedLabels(Protein p) => + new HashSet(p.AppliedSequenceVariations.Select(v => v.SimpleString())); + + // Targets (indices 0..3) + Assert.That(AppliedLabels(proteins[0]).SetEquals(Array.Empty()), "Reference target should have no applied variants"); + Assert.That(AppliedLabels(proteins[1]).SetEquals(new[] { "A22G" }), "Single-variant target must be exactly A22G"); + Assert.That(AppliedLabels(proteins[2]).SetEquals(new[] { "K251N" }), "Single-variant target must be exactly K251N"); + Assert.That(AppliedLabels(proteins[3]).SetEquals(new[] { "A22G", "K251N" }), "Double-variant target should have A22G and K251N"); + + // Decoys (indices 4..7) have mirrored coordinates (due to reverse decoying). + Assert.That(AppliedLabels(proteins[4]).SetEquals(Array.Empty()), "Reference decoy should have no applied variants"); + Assert.That(AppliedLabels(proteins[5]).SetEquals(new[] { "A315G" }), "Single-variant decoy must be exactly A315G (mirror of A22G)"); + Assert.That(AppliedLabels(proteins[6]).SetEquals(new[] { "K86N" }), "Single-variant decoy must be exactly K86N (mirror of K251N)"); + Assert.That(AppliedLabels(proteins[7]).SetEquals(new[] { "A315G", "K86N" }), "Double-variant decoy should have A315G and K86N"); + + // Parity check: each target and its decoy should carry the same number of applied variations. + for (int i = 0; i < 4; i++) + { + Assert.AreEqual( + proteins[i].AppliedSequenceVariations.Count(), + proteins[i + 4].AppliedSequenceVariations.Count(), + $"Applied variant count should match between target and decoy at index {i}"); + } + + // With UniProtPtms supplied, no unknown modifications should be reported. + if (unknownModifications != null && unknownModifications.Count > 0) + { + // Extra diagnostics to ease debugging in case of future schema/content changes. + TestContext.WriteLine("Unknown modifications reported by loader:"); + foreach (var um in unknownModifications) + TestContext.WriteLine($" - {um}"); + } + Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from this input"); } + [Test] public static void ProteinVariantsReadAsModificationsWrittenAsVariants() { + // PURPOSE + // This test verifies the I/O pipeline that converts "nucleotide substitution" modifications + // embedded in an input protein XML into canonical "sequence variant" features when read, + // and persists them back out as proper entries when written. + // + // WHY + // - Some sources (e.g., GPTMD discovery) encode AA substitutions as modifications with + // ModificationType = "1 nucleotide substitution". Internally, we want these represented + // as SequenceVariations, not remaining as generic modifications. + // - On read: these substitution mods must be removed from the protein’s modifications collections + // and turned into SequenceVariations. + // - On write: they must be serialized as sequence variant features with a standardized description + // ("Putative GPTMD Substitution") and NOT re-serialized as modifications. + // + // EXPECTATIONS SUMMARY + // - Input file has 57 lines that contain "1 nucleotide substitution" (source encoding as mods). + // - After LoadProteinXML: + // * We get exactly 9 proteins (DecoyType.None). + // * Total SequenceVariations across all proteins = 194. + // * Total OneBasedPossibleLocalizedModifications count across all proteins = 0 + // (i.e., all substitution mods became sequence variations). + // * No unknown modifications should remain. + // * No applied variants are expected (metadata only; we are not expanding combinatorics here). + // - After WriteXmlDatabase then re-load: + // * Count and totals remain the same (9 proteins; 194 total sequence variations; 0 mods). + // * The output file contains: + // - Exactly 194 lines with feature type="sequence variant". + // - Exactly 194 lines with "Putative GPTMD Substitution". + // - Exactly 0 lines with "1 nucleotide substitution" (proved that mods were not serialized). + // + // NOTE + // - We keep DecoyType.None to avoid expanding the protein list. + // - We use minAlleleDepth = 1 and maxHeterozygousVariants = 0 to avoid variant-proteoform expansion. + + // Arrange: locate the source database that encodes nucleotide substitutions as modifications string databaseName = "nucleotideVariantsAsModifications.xml"; + string inputPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName); + Assert.That(File.Exists(inputPath), Is.True, "Input database file must exist for this test"); + + // Sanity: confirm the source encodes substitutions as modifications in the raw XML + // (57 lines that mention "1 nucleotide substitution" in the file). + var inputLines = File.ReadAllLines(inputPath); + int inputSubstitutionModLines = inputLines.Count(l => l.Contains("1 nucleotide substitution")); + Assert.That(inputSubstitutionModLines, Is.EqualTo(57), "Source XML should contain 57 substitution-mod lines"); + + // Optional sanity: the source should not already be in sequence-variant form + int inputSeqVarFeatureLines = inputLines.Count(l => l.Contains("feature type=\"sequence variant\"")); + Assert.That(inputSeqVarFeatureLines, Is.EqualTo(0), "Source XML should not already encode sequence variants"); + + // Act: read the database. Expect conversion to SequenceVariations and removal from modifications + var proteins = ProteinDbLoader.LoadProteinXML( + inputPath, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + unknownModifications: out var unknownModifications, + minAlleleDepth: 1, + maxHeterozygousVariants: 0); + + // Assert: no decoys requested, so all should be targets + Assert.That(proteins.All(p => !p.IsDecoy), "All proteins should be targets when DecoyType.None is used"); + + // Assert: exactly 9 proteins + Assert.AreEqual(9, proteins.Count, "Expected exactly 9 proteins from the input"); + + // Assert: the loader converted all substitution modifications into proper SequenceVariations + // and removed them from OneBasedPossibleLocalizedModifications + int totalSequenceVariations = proteins.Sum(p => p.SequenceVariations.Count); + int totalPossibleLocalizedMods = proteins.Sum(p => p.OneBasedPossibleLocalizedModifications.Values.Sum(v => v.Count)); + Assert.AreEqual(194, totalSequenceVariations, "Total number of sequence variations after load must be 194"); + Assert.AreEqual(0, totalPossibleLocalizedMods, "All substitution modifications should have been converted; none should remain as mods"); + + // FIX: Safely log unknown modifications (Dictionary), if any appear unexpectedly. + if (unknownModifications != null && unknownModifications.Count > 0) + { + TestContext.WriteLine("Unknown modifications encountered during read (unexpected):"); + foreach (var kvp in unknownModifications) + { + var mod = kvp.Value; + var id = mod?.OriginalId ?? ""; + var type = mod?.ModificationType ?? ""; + TestContext.WriteLine($" - key={kvp.Key}, id={id}, type={type}"); + } + } + Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications should remain after conversion"); - Assert.That(File.ReadAllLines(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName)).Count(l => l.Contains("1 nucleotide substitution")), Is.EqualTo(57)); + // Assert: No applied variants expected in this test (we are validating representation, not expansion) + int totalAppliedVariants = proteins.Sum(p => p.AppliedSequenceVariations.Count()); + Assert.That(totalAppliedVariants, Is.EqualTo(0), "No applied variants expected; variants are metadata here"); - var proteins = ProteinDbLoader.LoadProteinXML(Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", databaseName), true, - DecoyType.None, null, false, null, out var unknownModifications, 1, 0); - Assert.AreEqual(9, proteins.Count); // 1 target - Assert.AreEqual(194, proteins.Select(v=>v.SequenceVariations.Count).Sum()); // there are no sequence variations in the original proteins - Assert.AreEqual(0, proteins.Select(m => m.OneBasedPossibleLocalizedModifications.Sum(list=>list.Value.Count)).Sum()); // there are 194 sequence variants as modifications in the original proteins + // Assert: None of the proteins should carry "1 nucleotide substitution" as OriginalNonVariantModifications anymore + int residualSubstitutionMods = + proteins.Sum(p => p.OriginalNonVariantModifications.Values.Sum(list => list.Count(m => m.ModificationType == "1 nucleotide substitution"))); + Assert.That(residualSubstitutionMods, Is.EqualTo(0), "No '1 nucleotide substitution' mods should remain in OriginalNonVariantModifications"); + // Persist the converted representation and read it back; this file should contain only sequence-variant features string tempDir = Path.Combine(Path.GetTempPath(), Guid.NewGuid().ToString()); Directory.CreateDirectory(tempDir); string tempFile = Path.Combine(tempDir, "xmlWithSequenceVariantsAndNoModifications.txt"); - ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins.Where(p => !p.IsDecoy).ToList(), tempFile); - proteins = ProteinDbLoader.LoadProteinXML(tempFile, true, - DecoyType.None, null, false, null, out unknownModifications, 1, 0); - Assert.AreEqual(9, proteins.Count); // 1 target - Assert.AreEqual(194, proteins.Select(v => v.SequenceVariations.Count).Sum()); // there are 194 sequence variations in the revised proteins - Assert.AreEqual(0, proteins.Select(m => m.OneBasedPossibleLocalizedModifications.Sum(list => list.Value.Count)).Sum()); // there are 0 sequence variants as modifications in the original proteins - - Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("feature type=\"sequence variant\"")), Is.EqualTo(194)); - Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("Putative GPTMD Substitution")), Is.EqualTo(194)); - Assert.That(File.ReadAllLines(tempFile).Count(l => l.Contains("1 nucleotide substitution")), Is.EqualTo(0)); - if (Directory.Exists(tempDir)) Directory.Delete(tempDir, true); + try + { + // Write only the targets (all are targets here). Expect the writer to emit sequence variant features. + ProteinDbWriter.WriteXmlDatabase(new Dictionary>>(), proteins.Where(p => !p.IsDecoy).ToList(), tempFile); + + // Inspect the written file contents directly for ground truth + var writtenLines = File.ReadAllLines(tempFile); + int writtenSeqVarFeatures = writtenLines.Count(l => l.Contains("feature type=\"sequence variant\"")); + int writtenPutativeGptmd = writtenLines.Count(l => l.Contains("Putative GPTMD Substitution")); + int writtenSubstitutionMods = writtenLines.Count(l => l.Contains("1 nucleotide substitution")); + + // Assert: writer produced only sequence variants (not substitution mods) + Assert.That(writtenSeqVarFeatures, Is.EqualTo(194), "All 194 substitutions must be serialized as sequence variant features"); + Assert.That(writtenPutativeGptmd, Is.EqualTo(194), "All 194 variants should have the standardized description label"); + Assert.That(writtenSubstitutionMods, Is.EqualTo(0), "No '1 nucleotide substitution' strings should remain in the written XML"); + + // Re-load the written file to confirm round-trip stability + proteins = ProteinDbLoader.LoadProteinXML( + tempFile, + generateTargets: true, + decoyType: DecoyType.None, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + unknownModifications: out unknownModifications, + minAlleleDepth: 1, + maxHeterozygousVariants: 0); + + // Assert: the round-tripped representation is identical in shape and counts + Assert.AreEqual(9, proteins.Count, "Round-trip must preserve protein count"); + Assert.AreEqual(194, proteins.Sum(v => v.SequenceVariations.Count), "Round-trip must preserve total number of sequence variations"); + Assert.AreEqual(0, proteins.Sum(m => m.OneBasedPossibleLocalizedModifications.Values.Sum(list => list.Count)), + "Round-trip must preserve the fact that no substitution mods remain"); + Assert.That(unknownModifications == null || unknownModifications.Count == 0, "Round-trip should not introduce unknown modifications"); + Assert.That(proteins.Sum(p => p.AppliedSequenceVariations.Count()), Is.EqualTo(0), "Round-trip should not apply any variants"); + } + finally + { + // Cleanup test artifacts + if (Directory.Exists(tempDir)) + { + try { File.SetAttributes(tempFile, FileAttributes.Normal); } catch { /* ignore */ } + Directory.Delete(tempDir, true); + } + } } - [Test] public void Constructor_ParsesDescriptionCorrectly() { @@ -770,5 +1645,319 @@ public void Constructor_ParsesDescriptionCorrectly() var adValues = svd.AlleleDepths[adKey]; Assert.AreEqual(new[] { "30", "30" }, adValues); } + [Test] + public void ParseComprehensiveVcfExamples() + { + string current = TestContext.CurrentContext.TestDirectory; + string vcfPath = null; + while (current != null) + { + var candidate = Path.Combine(current, "Test", "DatabaseTests", "vcf_comprehensive_examples.vcf"); + if (File.Exists(candidate)) + { + vcfPath = candidate; + break; + } + current = Directory.GetParent(current)?.FullName; + } + + Assert.That(vcfPath, Is.Not.Null, "Could not locate vcf_comprehensive_examples.vcf"); + + var lines = File.ReadAllLines(vcfPath); + + var dataRows = lines + .Where(l => !string.IsNullOrWhiteSpace(l)) + .Where(l => !l.StartsWith("##")) + .Where(l => !l.StartsWith("#CHROM")) + .ToList(); + + Assert.That(dataRows.Count, Is.EqualTo(8), "Expected 8 example variant rows."); + + for (int rowIndex = 0; rowIndex < dataRows.Count; rowIndex++) + { + string originalLine = dataRows[rowIndex]; + string[] rawFields = originalLine.Split('\t'); + Assert.That(rawFields.Length, Is.GreaterThanOrEqualTo(10), $"Row {rowIndex + 1}: insufficient columns."); + + var vcf = new VariantCallFormat(originalLine); + + Assert.That(vcf.Description, Is.EqualTo(originalLine)); + Assert.That(vcf.ReferenceAlleleString, Is.EqualTo(rawFields[3])); + Assert.That(vcf.AlternateAlleleString, Is.EqualTo(rawFields[4])); + Assert.That(vcf.Format, Is.EqualTo(rawFields[8])); + + if (rawFields[7] == ".") + { + Assert.That(vcf.Info.Annotation, Is.EqualTo(rawFields[7])); + } + + var sampleFields = rawFields.Skip(9).ToArray(); + Assert.That(vcf.Genotypes.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.AlleleDepths.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.Homozygous.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.Heterozygous.Count, Is.EqualTo(sampleFields.Length)); + Assert.That(vcf.ZygosityBySample.Count, Is.EqualTo(sampleFields.Length)); + + for (int sampleIndex = 0; sampleIndex < sampleFields.Length; sampleIndex++) + { + string sample = sampleFields[sampleIndex]; + string key = sampleIndex.ToString(); + + string[] parts = sample.Split(':'); + Assert.That(parts.Length, Is.EqualTo(vcf.Format.Split(':').Length)); + + string gtPart = parts[0]; + string adPart = parts.Length > 1 ? parts[1] : null; + + // Expected GT tokens + string[] expectedGtTokens = gtPart.Split(new[] { '/', '|' }, StringSplitOptions.RemoveEmptyEntries); + if (gtPart.Contains('.') && expectedGtTokens.Length == 1 && + (gtPart == "./." || gtPart == ".|." || gtPart == ".|1" || gtPart == "0|." || gtPart == "0/.")) + { + expectedGtTokens = new[] { ".", "." }; + } + + Assert.That(vcf.Genotypes.ContainsKey(key)); + var parsedGt = vcf.Genotypes[key]; + Assert.That(parsedGt, Is.EqualTo(expectedGtTokens)); + + // Expected AD tokens + string[] expectedAdTokens = + string.IsNullOrWhiteSpace(adPart) ? Array.Empty() : + adPart == "." ? new[] { "." } : + adPart.Split(','); + + Assert.That(vcf.AlleleDepths.ContainsKey(key)); + var parsedAd = vcf.AlleleDepths[key] ?? Array.Empty(); + if (parsedAd.Length != 0 || expectedAdTokens.Length != 1 || expectedAdTokens[0] != ".") + { + Assert.That(parsedAd, Is.EqualTo(expectedAdTokens)); + } + + // Expected zygosity using ONLY non-missing alleles (must mirror implementation) + var calledAlleles = parsedGt.Where(a => a != ".").ToArray(); + bool expectedHom = calledAlleles.Length > 0 && calledAlleles.Distinct().Count() == 1; + bool expectedHet = calledAlleles.Distinct().Count() > 1; + VariantCallFormat.Zygosity expectedZ = + calledAlleles.Length == 0 + ? VariantCallFormat.Zygosity.Unknown + : expectedHet + ? VariantCallFormat.Zygosity.Heterozygous + : VariantCallFormat.Zygosity.Homozygous; + + Assert.That(vcf.Homozygous[key], Is.EqualTo(expectedHom)); + Assert.That(vcf.Heterozygous[key], Is.EqualTo(expectedHet)); + Assert.That(vcf.ZygosityBySample[key], Is.EqualTo(expectedZ)); + } + } + } + [Test] + public void Constructor_InvalidCoordinates_ThrowsArgumentException() + { + // Minimal valid VCF line (10 columns) so VariantCallFormat parses without truncation. + // Arrange: end < begin (invalid coordinates) + var sv = new SequenceVariation( + oneBasedBeginPosition: 5, + oneBasedEndPosition: 4, + originalSequence: "A", + variantSequence: "V", + description: "invalid-coords", + oneBasedModifications: null); + + // Assert: SequenceVariation does not throw on construction; it reports invalid via AreValid() + Assert.That(sv.AreValid(), Is.False); + } + // Helper to create a minimal substitution modification matching the required detection pattern + private static Modification Substitution(string idArrow) + { + // If you want this helper to be convertible by the code under test, + // give it a matching motif for the site where it will be placed. + // For now keep it generic (unused in this test). + return new Modification( + idArrow, // originalId + null, // accession + "1 nucleotide substitution", // modificationType + null, // featureType + null, // target motif + "Anywhere.", // location restriction + null, // chemical formula + 0, // monoisotopic mass + new Dictionary>(), // databaseReference + null, // taxonomicRange + null, // keywords + null, // neutralLosses + null, // diagnosticIons + null); // fileOrigin + } + + // Non-substitution (should be ignored) + private static Modification Other(string id, double mass = 15.9949) + { + // Generic oxidation at P motif (unused by main test path) + ModificationMotif.TryGetMotif("P", out var motifP); + return new Modification( + id, + null, + "oxidation", + null, + motifP, + "Anywhere.", + null, + mass, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + // Malformed substitution (no "->" pattern) must be ignored + private static Modification Malformed() + { + ModificationMotif.TryGetMotif("Q", out var motifQ); + return new Modification( + "E>A", + null, + "1 nucleotide substitution", + null, + motifQ, + "Anywhere.", + null, + 0, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + [Test] + public void ConvertNucleotideSubstitutionModificationsToSequenceVariants_Comprehensive() + { + // 1 M, 2 A, 3 E, 4 W, 5 P, 6 Q, 7 K + var protein = new Protein("MAEWPQK", "TEST_PROT"); + + static Modification MakeSub(string idArrow, char originalResidue) + { + ModificationMotif.TryGetMotif(originalResidue.ToString(), out var motif); + return new Modification( + idArrow, + null, + "1 nucleotide substitution", + null, + motif, + "Anywhere.", + null, + 0, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + static Modification MakeOther(string id) + { + ModificationMotif.TryGetMotif("P", out var motifP); + return new Modification( + id, + null, + "oxidation", + null, + motifP, + "Anywhere.", + null, + 15.9949, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + static Modification MakeMalformed() + { + ModificationMotif.TryGetMotif("Q", out var motifQ); + return new Modification( + "E>A", + null, + "1 nucleotide substitution", + null, + motifQ, + "Anywhere.", + null, + 0, + new Dictionary>(), + null, + null, + null, + null, + null); + } + + void AddMod(Protein p, int pos, Modification m) + { + if (!p.OneBasedPossibleLocalizedModifications.TryGetValue(pos, out var list1)) + { + list1 = new List(); + p.OneBasedPossibleLocalizedModifications[pos] = list1; + } + list1.Add(m); + + if (!p.OriginalNonVariantModifications.TryGetValue(pos, out var list2)) + { + list2 = new List(); + p.OriginalNonVariantModifications[pos] = list2; + } + list2.Add(m); + } + + // Mods to seed + var modEtoA = MakeSub("E->A", 'E'); // pos 3 + var modWtoK = MakeSub("W->K", 'W'); // pos 4 + var modOxidP = MakeOther("Oxid_P"); // pos 5 + var malformed = MakeMalformed(); // pos 6 + + AddMod(protein, 3, modEtoA); + AddMod(protein, 4, modWtoK); + AddMod(protein, 5, modOxidP); + AddMod(protein, 6, malformed); + + // Pre-existing W->K (may be duplicated by converter if description differs) + protein.SequenceVariations.Add(new SequenceVariation(4, 4, "W", "K", "Existing substitution")); + + // Act + protein.ConvertNucleotideSubstitutionModificationsToSequenceVariants(); + + // Assert unique AA changes, not raw count (converter may add standardized duplicates) + var uniqueChanges = protein.SequenceVariations.Select(v => v.SimpleString()).Distinct().ToList(); + Assert.That(uniqueChanges.Count, Is.EqualTo(2), "Expected exactly two unique substitutions (E3->A and W4->K)."); + + // Ensure E3->A exists + var eToA = protein.SequenceVariations.SingleOrDefault(v => + v.OneBasedBeginPosition == 3 && v.OneBasedEndPosition == 3 && + v.OriginalSequence == "E" && v.VariantSequence == "A"); + Assert.That(eToA, Is.Not.Null, "E3->A variant was not created."); + + // Ensure at least one W4->K exists + var wToKCount = protein.SequenceVariations.Count(v => + v.OneBasedBeginPosition == 4 && v.OneBasedEndPosition == 4 && + v.OriginalSequence == "W" && v.VariantSequence == "K"); + Assert.That(wToKCount, Is.GreaterThanOrEqualTo(1), "Expected a W4->K variant."); + + // Converted positions removed from OneBasedPossibleLocalizedModifications + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(3), Is.False); + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(4), Is.False); + + // Unrelated and malformed mods remain + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(5), Is.True); + Assert.That(protein.OneBasedPossibleLocalizedModifications[5].Any(m => m.OriginalId == "Oxid_P"), Is.True); + Assert.That(protein.OneBasedPossibleLocalizedModifications.ContainsKey(6), Is.True); + Assert.That(protein.OneBasedPossibleLocalizedModifications[6].Any(m => m.OriginalId == "E>A"), Is.True); + } } } \ No newline at end of file diff --git a/mzLib/Test/DatabaseTests/truncationsExpected.tsv b/mzLib/Test/DatabaseTests/truncationsExpected.tsv new file mode 100644 index 000000000..8a35f98a3 --- /dev/null +++ b/mzLib/Test/DatabaseTests/truncationsExpected.tsv @@ -0,0 +1,137 @@ +Sequence Type Begin End RetainedMethionine +MALWMRLLPLLALLALWGPDPAAA Target 1 24 TRUE +FVNQHLCGSHLVEALYLVCGERGFFYTPKT Target 25 54 FALSE +EAEDLQVGQVELGGGPGAGSLQPLALEGSLQ Target 57 87 FALSE +GIVEQCCTSICSLYQLENYCN Target 90 110 FALSE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 1 110 TRUE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 2 110 FALSE +LWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 3 110 FALSE +WMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 4 110 FALSE +MRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 5 110 FALSE +RLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 6 110 FALSE +LLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 7 110 FALSE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYC Target 2 109 FALSE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY Target 2 108 FALSE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLEN Target 2 107 FALSE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLE Target 2 106 FALSE +ALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQL Target 2 105 FALSE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYC Target 1 109 TRUE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENY Target 1 108 TRUE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLEN Target 1 107 TRUE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLE Target 1 106 TRUE +MALWMRLLPLLALLALWGPDPAAAFVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQL Target 1 105 TRUE +ALWMRLLPLLALLALWGPDPAAA Target 2 24 FALSE +LWMRLLPLLALLALWGPDPAAA Target 3 24 FALSE +WMRLLPLLALLALWGPDPAAA Target 4 24 FALSE +MRLLPLLALLALWGPDPAAA Target 5 24 FALSE +RLLPLLALLALWGPDPAAA Target 6 24 FALSE +LLPLLALLALWGPDPAAA Target 7 24 FALSE +ALWMRLLPLLALLALWGPDPAA Target 2 23 FALSE +ALWMRLLPLLALLALWGPDPA Target 2 22 FALSE +ALWMRLLPLLALLALWGPDP Target 2 21 FALSE +ALWMRLLPLLALLALWGPD Target 2 20 FALSE +ALWMRLLPLLALLALWGP Target 2 19 FALSE +MALWMRLLPLLALLALWGPDPAA Target 1 23 TRUE +MALWMRLLPLLALLALWGPDPA Target 1 22 TRUE +MALWMRLLPLLALLALWGPDP Target 1 21 TRUE +MALWMRLLPLLALLALWGPD Target 1 20 TRUE +MALWMRLLPLLALLALWGP Target 1 19 TRUE +VNQHLCGSHLVEALYLVCGERGFFYTPKT Target 26 54 FALSE +NQHLCGSHLVEALYLVCGERGFFYTPKT Target 27 54 FALSE +QHLCGSHLVEALYLVCGERGFFYTPKT Target 28 54 FALSE +HLCGSHLVEALYLVCGERGFFYTPKT Target 29 54 FALSE +LCGSHLVEALYLVCGERGFFYTPKT Target 30 54 FALSE +FVNQHLCGSHLVEALYLVCGERGFFYTPK Target 25 53 FALSE +FVNQHLCGSHLVEALYLVCGERGFFYTP Target 25 52 FALSE +FVNQHLCGSHLVEALYLVCGERGFFYT Target 25 51 FALSE +FVNQHLCGSHLVEALYLVCGERGFFY Target 25 50 FALSE +FVNQHLCGSHLVEALYLVCGERGFF Target 25 49 FALSE +AEDLQVGQVELGGGPGAGSLQPLALEGSLQ Target 58 87 FALSE +EDLQVGQVELGGGPGAGSLQPLALEGSLQ Target 59 87 FALSE +DLQVGQVELGGGPGAGSLQPLALEGSLQ Target 60 87 FALSE +LQVGQVELGGGPGAGSLQPLALEGSLQ Target 61 87 FALSE +QVGQVELGGGPGAGSLQPLALEGSLQ Target 62 87 FALSE +EAEDLQVGQVELGGGPGAGSLQPLALEGSL Target 57 86 FALSE +EAEDLQVGQVELGGGPGAGSLQPLALEGS Target 57 85 FALSE +EAEDLQVGQVELGGGPGAGSLQPLALEG Target 57 84 FALSE +EAEDLQVGQVELGGGPGAGSLQPLALE Target 57 83 FALSE +EAEDLQVGQVELGGGPGAGSLQPLAL Target 57 82 FALSE +IVEQCCTSICSLYQLENYCN Target 91 110 FALSE +VEQCCTSICSLYQLENYCN Target 92 110 FALSE +EQCCTSICSLYQLENYCN Target 93 110 FALSE +QCCTSICSLYQLENYCN Target 94 110 FALSE +CCTSICSLYQLENYCN Target 95 110 FALSE +GIVEQCCTSICSLYQLENYC Target 90 109 FALSE +GIVEQCCTSICSLYQLENY Target 90 108 FALSE +GIVEQCCTSICSLYQLEN Target 90 107 FALSE +GIVEQCCTSICSLYQLE Target 90 106 FALSE +GIVEQCCTSICSLYQL Target 90 105 FALSE +FVNQHLCGSHLVEALYLVCGERGFFYTPKTRREAEDLQVGQVELGGGPGAGSLQPLALEGSLQKRGIVEQCCTSICSLYQLENYCN Target 25 110 FALSE +MNCYNELQYLSCISTCCQEVIGRK Decoy 1 24 TRUE +QLSGELALPQLSGAGPGGGLEVQGVQLDEA Decoy 25 54 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 57 87 FALSE +APDPGWLALLALLPLLRMWLA Decoy 90 110 FALSE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 1 110 TRUE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 2 110 FALSE +CYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 3 110 FALSE +YNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 4 110 FALSE +NELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 5 110 FALSE +ELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 6 110 FALSE +LQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 7 110 FALSE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWL Decoy 2 109 FALSE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMW Decoy 2 108 FALSE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRM Decoy 2 107 FALSE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLR Decoy 2 106 FALSE +NCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLL Decoy 2 105 FALSE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWL Decoy 1 109 TRUE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMW Decoy 1 108 TRUE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRM Decoy 1 107 TRUE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLR Decoy 1 106 TRUE +MNCYNELQYLSCISTCCQEVIGRKQLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLL Decoy 1 105 TRUE +NCYNELQYLSCISTCCQEVIGRK Decoy 2 24 FALSE +CYNELQYLSCISTCCQEVIGRK Decoy 3 24 FALSE +YNELQYLSCISTCCQEVIGRK Decoy 4 24 FALSE +NELQYLSCISTCCQEVIGRK Decoy 5 24 FALSE +ELQYLSCISTCCQEVIGRK Decoy 6 24 FALSE +LQYLSCISTCCQEVIGRK Decoy 7 24 FALSE +NCYNELQYLSCISTCCQEVIGR Decoy 2 23 FALSE +NCYNELQYLSCISTCCQEVIG Decoy 2 22 FALSE +NCYNELQYLSCISTCCQEVI Decoy 2 21 FALSE +NCYNELQYLSCISTCCQEV Decoy 2 20 FALSE +NCYNELQYLSCISTCCQE Decoy 2 19 FALSE +MNCYNELQYLSCISTCCQEVIGR Decoy 1 23 TRUE +MNCYNELQYLSCISTCCQEVIG Decoy 1 22 TRUE +MNCYNELQYLSCISTCCQEVI Decoy 1 21 TRUE +MNCYNELQYLSCISTCCQEV Decoy 1 20 TRUE +MNCYNELQYLSCISTCCQE Decoy 1 19 TRUE +LSGELALPQLSGAGPGGGLEVQGVQLDEA Decoy 26 54 FALSE +SGELALPQLSGAGPGGGLEVQGVQLDEA Decoy 27 54 FALSE +GELALPQLSGAGPGGGLEVQGVQLDEA Decoy 28 54 FALSE +ELALPQLSGAGPGGGLEVQGVQLDEA Decoy 29 54 FALSE +LALPQLSGAGPGGGLEVQGVQLDEA Decoy 30 54 FALSE +QLSGELALPQLSGAGPGGGLEVQGVQLDE Decoy 25 53 FALSE +QLSGELALPQLSGAGPGGGLEVQGVQLD Decoy 25 52 FALSE +QLSGELALPQLSGAGPGGGLEVQGVQL Decoy 25 51 FALSE +QLSGELALPQLSGAGPGGGLEVQGVQ Decoy 25 50 FALSE +QLSGELALPQLSGAGPGGGLEVQGV Decoy 25 49 FALSE +TKPTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 58 87 FALSE +KPTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 59 87 FALSE +PTYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 60 87 FALSE +TYFFGREGCVLYLAEVLHSGCLHQNVF Decoy 61 87 FALSE +YFFGREGCVLYLAEVLHSGCLHQNVF Decoy 62 87 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCLHQNV Decoy 57 86 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCLHQN Decoy 57 85 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCLHQ Decoy 57 84 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCLH Decoy 57 83 FALSE +RTKPTYFFGREGCVLYLAEVLHSGCL Decoy 57 82 FALSE +PDPGWLALLALLPLLRMWLA Decoy 91 110 FALSE +DPGWLALLALLPLLRMWLA Decoy 92 110 FALSE +PGWLALLALLPLLRMWLA Decoy 93 110 FALSE +GWLALLALLPLLRMWLA Decoy 94 110 FALSE +WLALLALLPLLRMWLA Decoy 95 110 FALSE +APDPGWLALLALLPLLRMWL Decoy 90 109 FALSE +APDPGWLALLALLPLLRMW Decoy 90 108 FALSE +APDPGWLALLALLPLLRM Decoy 90 107 FALSE +APDPGWLALLALLPLLR Decoy 90 106 FALSE +APDPGWLALLALLPLL Decoy 90 105 FALSE +QLSGELALPQLSGAGPGGGLEVQGVQLDEAERRTKPTYFFGREGCVLYLAEVLHSGCLHQNVFAAAPDPGWLALLALLPLLRMWLA Decoy 25 110 FALSE diff --git a/mzLib/Test/Test.csproj b/mzLib/Test/Test.csproj index 59c393b39..7f0449d12 100644 --- a/mzLib/Test/Test.csproj +++ b/mzLib/Test/Test.csproj @@ -262,6 +262,9 @@ Always + + Always + Always diff --git a/mzLib/Test/TestPeptideWithSetMods.cs b/mzLib/Test/TestPeptideWithSetMods.cs index ccc6d950e..d235759ed 100644 --- a/mzLib/Test/TestPeptideWithSetMods.cs +++ b/mzLib/Test/TestPeptideWithSetMods.cs @@ -1046,67 +1046,177 @@ public static void TestPeptideWithSetModsReturnsTruncationsInTopDown() List insulinTruncations = insulin.Digest(new DigestionParams(protease: protease.Name), new List(), new List(), topDownTruncationSearch: true).ToList(); Assert.AreEqual(68, insulinTruncations.Count); } - [Test] public static void TestPeptideWithSetModsReturnsDecoyTruncationsInTopDown() { + // PURPOSE + // Generate a comprehensive TSV of all top-down truncation peptides for target and decoy insulin entries, + // then compare the result against the stored expected file for regression checking. + // + // OUTPUT COLUMNS + // - Sequence: peptide/proteoform base sequence + // - Type: "Target" or "Decoy" based on parent protein + // - Begin: one-based start residue within the parent protein + // - End: one-based end residue within the parent protein + // - RetainedMethionine: TRUE when the peptide includes the protein’s N-terminal Met (Begin == 1 and Parent.BaseSequence[0] == 'M'), else FALSE + + // Arrange: load insulin with reverse decoys and truncations enabled string xmlDatabase = Path.Combine(TestContext.CurrentContext.TestDirectory, "DataFiles", "humanInsulin.xml"); - List insulinProteins = ProteinDbLoader.LoadProteinXML(xmlDatabase, true, - DecoyType.Reverse, null, false, null, out var unknownModifications, addTruncations: true); + List insulinProteins = ProteinDbLoader.LoadProteinXML( + xmlDatabase, + generateTargets: true, + decoyType: DecoyType.Reverse, + allKnownModifications: null, + isContaminant: false, + modTypesToExclude: null, + unknownModifications: out var unknownModifications, + addTruncations: true); + + Assert.That(insulinProteins.Any(p => !p.IsDecoy), "Expected at least one target protein"); + Assert.That(insulinProteins.Any(p => p.IsDecoy), "Expected at least one decoy protein"); + Assert.That(unknownModifications == null || unknownModifications.Count == 0, "No unknown modifications expected from insulin XML"); + + // Digest: enumerate truncations for a representative target/decoy pair (parity sanity) + static string Reverse(string s) => new string(s.Reverse().ToArray()); + var target = insulinProteins.First(p => !p.IsDecoy); + string expectedDecoySeq = target.BaseSequence.Length > 0 && target.BaseSequence[0] == 'M' + ? "M" + Reverse(target.BaseSequence.Substring(1)) + : Reverse(target.BaseSequence); + var decoy = insulinProteins.FirstOrDefault(p => p.IsDecoy && p.BaseSequence == expectedDecoySeq) + ?? insulinProteins.First(p => p.IsDecoy && p.Length == target.Length); + + Assert.IsFalse(string.IsNullOrWhiteSpace(target.Accession)); + Assert.IsTrue(decoy.Accession.StartsWith("DECOY_")); + Assert.AreEqual(target.Length, decoy.Length); + Assert.AreNotEqual(target.BaseSequence, decoy.BaseSequence); + Assert.AreEqual(expectedDecoySeq, decoy.BaseSequence, "Decoy must follow 'retain M, reverse remainder' rule"); + + var dp = new DigestionParams(protease: "top-down"); + List targetTruncs = target + .Digest(dp, new List(), new List(), topDownTruncationSearch: true) + .Cast().ToList(); + List decoyTruncs = decoy + .Digest(dp, new List(), new List(), topDownTruncationSearch: true) + .Cast().ToList(); + + // Parity and sanity checks for the selected pair + Assert.AreEqual(68, targetTruncs.Count, "Target should yield 68 truncation products in top-down mode"); + Assert.AreEqual(68, decoyTruncs.Count, "Decoy should yield 68 truncation products in top-down mode"); + Assert.That(targetTruncs.All(p => p.DigestionParams?.DigestionAgent?.Name == "top-down")); + Assert.That(decoyTruncs.All(p => p.DigestionParams?.DigestionAgent?.Name == "top-down")); + Assert.AreEqual(targetTruncs.Count, targetTruncs.Select(p => p.BaseSequence).Distinct().Count()); + Assert.AreEqual(decoyTruncs.Count, decoyTruncs.Select(p => p.BaseSequence).Distinct().Count()); + Assert.IsTrue(targetTruncs.Any(p => p.BaseSequence == target.BaseSequence)); + Assert.IsTrue(decoyTruncs.Any(p => p.BaseSequence == decoy.BaseSequence)); + + // Build the table rows + static bool HasRetainedMet(PeptideWithSetModifications p) => + p.OneBasedStartResidueInProtein == 1 && + p.Parent?.BaseSequence?.Length > 0 && + p.Parent.BaseSequence[0] == 'M'; + + // We only compare the combined truncations for the chosen target/decoy in this test (68 + 68 rows) + var rows = targetTruncs.Concat(decoyTruncs) + .Select(pep => + { + bool isDecoy = (pep.Parent as Protein)?.IsDecoy == true; + string type = isDecoy ? "Decoy" : "Target"; + string retained = HasRetainedMet(pep) ? "TRUE" : "FALSE"; // normalize to match expected + return string.Join("\t", new[] + { + pep.BaseSequence, + type, + pep.OneBasedStartResidueInProtein.ToString(), + pep.OneBasedEndResidueInProtein.ToString(), + retained + }); + }) + // Sort deterministically to avoid platform/iteration-order differences + .OrderBy(r => r, StringComparer.Ordinal) + .ToList(); + + var header = "Sequence\tType\tBegin\tEnd\tRetainedMethionine"; + var outputLines = new List(capacity: rows.Count + 1) { header }; + outputLines.AddRange(rows); + + // Persist the generated table for inspection + string workPath = Path.Combine(TestContext.CurrentContext.WorkDirectory, "topdown_truncations_table.tsv"); + File.WriteAllLines(workPath, outputLines); + Console.WriteLine($"Generated truncation table: {workPath}"); + + // Load expected and compare + string expectedPath = Path.Combine(TestContext.CurrentContext.TestDirectory, "DatabaseTests", "truncationsExpected.tsv"); + Assert.That(File.Exists(expectedPath), $"Expected file not found: {expectedPath}"); + var expectedAll = File.ReadAllLines(expectedPath) + .Where(l => l is not null) + .Select(l => l.TrimEnd('\r', '\n')) + .ToList(); + + Assert.That(expectedAll.Count > 0, "Expected file is empty"); + string expectedHeader = expectedAll[0]; + var expectedRows = expectedAll.Skip(1) + .Where(l => !string.IsNullOrWhiteSpace(l)) + .OrderBy(l => l, StringComparer.Ordinal) + .ToList(); + + // Header check + if (!string.Equals(header, expectedHeader, StringComparison.Ordinal)) + { + TestContext.Out.WriteLine($"Header mismatch:"); + TestContext.Out.WriteLine($" Expected: {expectedHeader}"); + TestContext.Out.WriteLine($" Actual : {header}"); + } - Protease protease = new Protease("top-down", CleavageSpecificity.None, "", "", new List(), null); - List insulintTargetTruncations = insulinProteins.Where(p=>!p.IsDecoy).First().Digest(new DigestionParams(protease: protease.Name), new List(), new List(), topDownTruncationSearch: true).ToList(); - Assert.AreEqual(68, insulintTargetTruncations.Count); - List insulintDecoyTruncations = insulinProteins.Where(p => p.IsDecoy).First().Digest(new DigestionParams(protease: protease.Name), new List(), new List(), topDownTruncationSearch: true).ToList(); - Assert.AreEqual(68, insulintDecoyTruncations.Count); - } + // Multiset comparison for rows (counts of duplicates matter) + static Dictionary ToCounts(IEnumerable lines) + => lines.GroupBy(x => x, StringComparer.Ordinal) + .ToDictionary(g => g.Key, g => g.Count(), StringComparer.Ordinal); - [Test] - public static void CheckFullChemicalFormula() - { - PeptideWithSetModifications small_pep = new PeptideWithSetModifications(new Protein("PEPTIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, new Dictionary(), 0, null); - ChemicalFormula small_pep_cf = ChemicalFormula.ParseFormula("C34H53N7O15"); - Assert.AreEqual(small_pep.FullChemicalFormula, small_pep_cf); + var expCounts = ToCounts(expectedRows); + var gotCounts = ToCounts(rows); - PeptideWithSetModifications large_pep = new PeptideWithSetModifications(new Protein("PEPTIDEKRNSPEPTIDEKECUEIRQUV", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 28, CleavageSpecificity.Full, null, 0, new Dictionary(), 0, null); - ChemicalFormula large_pep_cf = ChemicalFormula.ParseFormula("C134H220N38O50S1Se2"); - Assert.AreEqual(large_pep.FullChemicalFormula, large_pep_cf); + var missing = new List(); // in expected more times than in actual + var extra = new List(); // in actual more times than in expected + + foreach (var kv in expCounts) + { + gotCounts.TryGetValue(kv.Key, out int got); + if (got < kv.Value) + { + int deficit = kv.Value - got; + for (int i = 0; i < deficit; i++) missing.Add(kv.Key); + } + } + foreach (var kv in gotCounts) + { + expCounts.TryGetValue(kv.Key, out int exp); + if (kv.Value > exp) + { + int surplus = kv.Value - exp; + for (int i = 0; i < surplus; i++) extra.Add(kv.Key); + } + } - ModificationMotif.TryGetMotif("S", out ModificationMotif motif_s); - Modification phosphorylation = new Modification(_originalId: "phospho", _modificationType: "CommonBiological", _target: motif_s, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("H1O3P1")); - Dictionary modDict_small = new Dictionary(); - modDict_small.Add(4, phosphorylation); + if (missing.Count == 0 && extra.Count == 0) + { + TestContext.Out.WriteLine("Top-down truncation table matches expected."); + TestContext.Out.WriteLine("Sample (first 5 rows):"); + foreach (var l in outputLines.Take(6)) TestContext.Out.WriteLine(l); + } + else + { + TestContext.Out.WriteLine("Top-down truncation table differs from expected."); + TestContext.Out.WriteLine($"Missing rows (expected but not found or under-counted): {missing.Count}"); + foreach (var l in missing.Take(20)) TestContext.Out.WriteLine($" MISSING: {l}"); + if (missing.Count > 20) TestContext.Out.WriteLine($" ...and {missing.Count - 20} more"); - PeptideWithSetModifications small_pep_mod = new PeptideWithSetModifications(new Protein("PEPSIDE", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 7, CleavageSpecificity.Full, null, 0, modDict_small, 0, null); - ChemicalFormula small_pep_mod_cf = ChemicalFormula.ParseFormula("C33H52N7O18P1"); - Assert.AreEqual(small_pep_mod.FullChemicalFormula, small_pep_mod_cf); + TestContext.Out.WriteLine($"Extra rows (found but not expected or over-counted): {extra.Count}"); + foreach (var l in extra.Take(20)) TestContext.Out.WriteLine($" EXTRA: {l}"); + if (extra.Count > 20) TestContext.Out.WriteLine($" ...and {extra.Count - 20} more"); - ModificationMotif.TryGetMotif("K", out ModificationMotif motif_k); - Modification acetylation = new Modification(_originalId: "acetyl", _modificationType: "CommonBiological", _target: motif_k, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3O")); - Dictionary modDict_large = new Dictionary(); - modDict_large.Add(4, phosphorylation); - modDict_large.Add(11, phosphorylation); - modDict_large.Add(8, acetylation); - - PeptideWithSetModifications large_pep_mod = new PeptideWithSetModifications(new Protein("PEPSIDEKRNSPEPTIDEKECUEIRQUV", "ACCESSION"), new DigestionParams(protease: "trypsin"), 1, 28, CleavageSpecificity.Full, null, 0, modDict_large, 0, null); - ChemicalFormula large_pep_mod_cf = ChemicalFormula.ParseFormula("C135H223N38O57P2S1Se2"); - Assert.AreEqual(large_pep_mod.FullChemicalFormula, large_pep_mod_cf); - - ModificationMotif.TryGetMotif("C", out var motif_c); - ModificationMotif.TryGetMotif("G", out var motif_g); - Dictionary modDict = - new() - { - { "Carbamidomethyl on C", new Modification(_originalId: "Carbamidomethyl", _modificationType: "Common Fixed", - _target: motif_c, _locationRestriction: "Anywhere.", _chemicalFormula: ChemicalFormula.ParseFormula("C2H3ON")) }, - { "BS on G" , new Modification(_originalId: "BS on G", _modificationType: "BS", _target: motif_g, _monoisotopicMass: 96.0875)} - }; - PeptideWithSetModifications pwsmWithMissingCfMods = new PeptideWithSetModifications( - "ENQGDETQG[Speculative:BS on G]C[Common Fixed:Carbamidomethyl on C]PPQR", modDict, p: new Protein("ENQGDETQGCPPQR", "FakeProtein"), digestionParams: new DigestionParams(), - oneBasedStartResidueInProtein: 1, oneBasedEndResidueInProtein: 14); - Assert.Null(pwsmWithMissingCfMods.FullChemicalFormula); + Assert.Fail($"Generated top-down truncation table does not match expected.\nExpected file: {expectedPath}\nActual file: {workPath}\nMissing: {missing.Count}, Extra: {extra.Count}"); + } } - [Test] public static void CheckMostAbundantMonoisotopicMass() { diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs index c83c80aa1..87264702e 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/DecoyProteinGenerator.cs @@ -239,34 +239,34 @@ private static List ReverseSequenceVariations(IEnumerable 1 || sv.VariantSequence.Length > 1)) { string original = new string(originalArray).Substring(0, originalArray.Length - 1); string variant = new string(variationArray).Substring(0, variationArray.Length - 1); - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length, original, variant, $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications)); } // gained an initiating methionine else if (sv.VariantSequence.StartsWith("M", StringComparison.Ordinal) && sv.OneBasedBeginPosition == 1) { - decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(1, 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications)); } // starting methionine, but no variations on it else if (startsWithM) { - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 2, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 2, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications)); } // no starting methionine else { - decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.Description, decoyVariantModifications)); + decoyVariations.Add(new SequenceVariation(protein.BaseSequence.Length - sv.OneBasedEndPosition + 1, protein.BaseSequence.Length - sv.OneBasedBeginPosition + 1, new string(originalArray), new string(variationArray), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString, decoyVariantModifications)); } } return decoyVariations; @@ -335,7 +335,7 @@ private static List GenerateSlideDecoys(List proteins, int max { variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, true)]; } - decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.Description)); + decoyVariationsSlide.Add(new SequenceVariation(1, "M", new string(variationArraySlided), $"{decoyIdentifier} VARIANT: Initiator Methionine Change in " + sv.VariantCallFormatDataString)); } else { @@ -352,7 +352,7 @@ private static List GenerateSlideDecoys(List proteins, int max variationArraySlided[i] = variationArrayUnslided[GetOldSlidedIndex(i, numSlidesHere, variationArrayUnslided.Length, initMet)]; } - decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.Description)); + decoyVariationsSlide.Add(new SequenceVariation(decoy_begin, decoy_end, sv.OriginalSequence, new string(variationArraySlided), $"{decoyIdentifier} VARIANT: " + sv.VariantCallFormatDataString)); } } var decoyProteinSlide = new Protein(slided_sequence, $"{decoyIdentifier}_" + protein.Accession, protein.Organism, protein.GeneNames.ToList(), decoyModifications, decoyPPSlide, diff --git a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs index cc7723c15..3786c969d 100644 --- a/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs +++ b/mzLib/UsefulProteomicsDatabases/DecoyGeneration/RnaDecoyGenerator.cs @@ -87,7 +87,7 @@ private static List GenerateReverseDecoys(List nucleicAcids, int maxThr var reverseModKey = indexMapping[modKvp.Key]; reverseModificationsForVariation.Add(reverseModKey, modKvp.Value); } - reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation)); + reverseAppliedVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.VariantCallFormatDataString.Description, reverseModificationsForVariation)); } // Reverse Applied Variants @@ -101,7 +101,7 @@ private static List GenerateReverseDecoys(List nucleicAcids, int maxThr var reverseModKey = indexMapping[modKvp.Key]; reverseModificationsForVariation.Add(reverseModKey, modKvp.Value); } - reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.Description.Description, reverseModificationsForVariation)); + reverseVariations.Add(new SequenceVariation(reverseBegin, reverseEnd, variation.OriginalSequence, variation.VariantSequence, variation.VariantCallFormatDataString.Description, reverseModificationsForVariation)); } // Reverse Truncations diff --git a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs index 24ff44dd4..5eaa9897a 100644 --- a/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs +++ b/mzLib/UsefulProteomicsDatabases/ProteinDbWriter.cs @@ -206,7 +206,7 @@ public static Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary WriteXmlDatabase(Dictionary element, if present private List<(int, string)> AnnotatedMods = new List<(int position, string originalModificationID)>(); private List<(int, string)> AnnotatedVariantMods = new List<(int position, string originalModificationID)>(); + // Captured isoform/sequence identifier from + private string LocationSequenceId; /// - /// Start parsing a protein XML element + /// Finalizes the parsing of a protein XML entry and constructs a object. + /// This method is called when the end of an <entry> element is reached during XML parsing. + /// It sanitizes the sequence, prunes out-of-range sequence variants, resolves and attaches modifications, + /// and aggregates all parsed data (such as gene names, proteolysis products, sequence variations, disulfide bonds, and splice sites) + /// into a new instance. + /// After construction, the internal state is cleared to prepare for the next entry. /// + /// The positioned at the end of the <entry> element. + /// Indicates whether the protein is a contaminant. + /// The file path or identifier of the protein database source. + /// A collection of modification types to exclude from the protein. + /// A dictionary to collect modifications that could not be resolved. + /// A string used to identify decoy proteins (default: "DECOY"). + /// + /// A constructed object containing all parsed and resolved information, + /// or null if the entry is incomplete. + /// public void ParseElement(string elementName, XmlReader xml) { int outValue; @@ -137,7 +154,9 @@ public void ParseElement(string elementName, XmlReader xml) PropertyTypes.Add(xml.GetAttribute("type")); PropertyValues.Add(xml.GetAttribute("value")); break; - + case "location": + LocationSequenceId = xml.GetAttribute("sequence"); + break; case "position": OneBasedFeaturePosition = int.Parse(xml.GetAttribute("position")); break; @@ -161,9 +180,12 @@ public void ParseElement(string elementName, XmlReader xml) } /// - /// Parses the attributes of the current element from the provided XmlReader. - /// Extracts and stores the values for dataset, created, modified, version, and xmlns attributes. + /// Parses and stores key metadata attributes from the current <entry> element in the XML. + /// This includes dataset, creation date, modification date, version, and XML namespace information. + /// The extracted values are assigned to the corresponding properties of the instance. + /// This method is typically called when the parser encounters the start of a protein entry in a UniProt or similar XML file. /// + /// The positioned at the <entry> element whose attributes are to be read. private void ParseEntryAttributes(XmlReader xml) { DatasetEntryTag = xml.GetAttribute("dataset"); @@ -268,10 +290,12 @@ private static UniProtSequenceAttributes.FragmentType ParseFragmentType(string f return UniProtSequenceAttributes.FragmentType.unspecified; } - // Helper method to compute the monoisotopic mass of a sequence. /// - /// Computes the monoisotopic mass of the given sequence. - /// Returns 0 if the sequence is empty. + /// Computes the monoisotopic mass of a protein or nucleic acid sequence without modifications. + /// If the input sequence is null or empty, returns 0. + /// Internally, constructs a using the provided sequence and an empty modification dictionary, + /// then returns the rounded monoisotopic mass as an integer. + /// This method is used to populate sequence attributes such as mass during XML parsing. /// private static int ComputeSequenceMass(string sequence) { @@ -280,8 +304,25 @@ private static int ComputeSequenceMass(string sequence) return (int)Math.Round(new PeptideWithSetModifications(sequence, new Dictionary()).MonoisotopicMass); } /// - /// Finish parsing at the end of an element + /// Handles the end of an XML element during protein database parsing, updating the internal state or finalizing objects as needed. + /// Depending on the element name, this method processes and stores feature, subfeature, database reference, gene, and organism information, + /// or, if the end of an <entry> element is reached, constructs and returns a fully populated object. + /// For <feature> and <subfeature> elements, it attaches modifications or proteolytic products. + /// For <dbReference>, it records database cross-references. + /// For <gene> and <organism>, it updates parsing state flags. + /// For <entry>, it aggregates all parsed data, resolves modifications, and returns a new instance, + /// clearing the internal state for the next entry. /// + /// The positioned at the end of the current XML element. + /// A collection of modification types to exclude from the protein. + /// A dictionary to collect modifications that could not be resolved. + /// Indicates whether the protein is a contaminant. + /// The file path or identifier of the protein database source. + /// A string used to identify decoy proteins (default: "DECOY"). + /// + /// A constructed object if the end of an <entry> element is reached and all required data is present; + /// otherwise, null. + /// public Protein ParseEndElement(XmlReader xml, IEnumerable modTypesToExclude, Dictionary unknownModifications, bool isContaminant, string proteinDbLocation, string decoyIdentifier = "DECOY") { @@ -312,7 +353,26 @@ public Protein ParseEndElement(XmlReader xml, IEnumerable modTypesToExcl } return protein; } - + /// + /// Handles the end of an XML element during RNA database parsing, updating the internal state or finalizing objects as needed. + /// Depending on the element name, this method processes and stores feature, subfeature, and database reference information, + /// or, if the end of an <entry> element is reached, constructs and returns a fully populated object. + /// For <feature> and <subfeature> elements, it attaches modifications or truncation products. + /// For <dbReference>, it records database cross-references. + /// For <gene> and <organism>, it updates parsing state flags. + /// For <entry>, it aggregates all parsed data, resolves modifications, and returns a new instance, + /// clearing the internal state for the next entry. + /// + /// The positioned at the end of the current XML element. + /// A collection of modification types to exclude from the RNA. + /// A dictionary to collect modifications that could not be resolved. + /// Indicates whether the RNA is a contaminant. + /// The file path or identifier of the RNA database source. + /// A string used to identify decoy RNAs (default: "DECOY"). + /// + /// A constructed object if the end of an <entry> element is reached and all required data is present; + /// otherwise, null. + /// internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable modTypesToExclude, Dictionary unknownModifications, bool isContaminant, string rnaDbLocation, string decoyIdentifier = "DECOY") @@ -346,8 +406,30 @@ internal RNA ParseRnaEndElement(XmlReader xml, IEnumerable modTypesToExc } /// - /// Finish parsing an entry + /// Finalizes the parsing of a protein XML entry and constructs a object from the accumulated data. + /// This method is called when the end of an <entry> element is reached during XML parsing. + /// It performs several key tasks: + /// + /// Sanitizes the parsed sequence (e.g., replacing invalid amino acids with 'X'). + /// Prunes any sequence variants whose coordinates exceed the sequence length. + /// Resolves and attaches all annotated modifications, excluding those of specified types or unknowns. + /// Determines if the protein is a decoy based on the accession and decoy identifier. + /// Aggregates all parsed data (gene names, proteolysis products, sequence variations, disulfide bonds, splice sites, database references, and sequence attributes) into a new instance. + /// Clears the internal state of the to prepare for parsing the next entry. + /// + /// If either the accession or sequence is missing, returns null. /// + /// The positioned at the end of the <entry> element. + /// Indicates whether the protein is a contaminant. + /// The file path or identifier of the protein database source. + /// A collection of modification types to exclude from the protein. + /// A dictionary to collect modifications that could not be resolved. + /// A string used to identify decoy proteins (default: "DECOY"). + /// + /// A constructed object containing all parsed and resolved information, + /// or null if the entry is incomplete. + /// + public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string proteinDbLocation, IEnumerable modTypesToExclude, Dictionary unknownModifications, string decoyIdentifier = "DECOY") { Protein result = null; @@ -359,6 +441,8 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X'); ParseAnnotatedMods(OneBasedModifications, modTypesToExclude, unknownModifications, AnnotatedMods); + //prune any sequence variants whose coordinates exceed the known sequence length + PruneOutOfRangeSequenceVariants(); if (Accession.StartsWith(decoyIdentifier)) { isDecoy = true; @@ -370,7 +454,30 @@ public Protein ParseEntryEndElement(XmlReader xml, bool isContaminant, string pr Clear(); return result; } - + /// + /// Finalizes the parsing of an RNA XML entry and constructs an object from the accumulated data. + /// This method is called when the end of an <entry> element is reached during XML parsing for RNA records. + /// It performs several key tasks: + /// + /// Sanitizes the parsed sequence (e.g., replacing invalid characters with 'X'). + /// Prunes any sequence variants whose coordinates exceed the sequence length. + /// Resolves and attaches all annotated modifications, excluding those of specified types or unknowns. + /// Determines if the RNA is a decoy based on the accession and decoy identifier. + /// Aggregates all parsed data (gene names, proteolysis products, sequence variations, and other metadata) into a new instance. + /// Clears the internal state of the to prepare for parsing the next entry. + /// + /// If either the accession or sequence is missing, returns null. + /// + /// The positioned at the end of the <entry> element. + /// Indicates whether the RNA is a contaminant. + /// The file path or identifier of the RNA database source. + /// A collection of modification types to exclude from the RNA. + /// A dictionary to collect modifications that could not be resolved. + /// A string used to identify decoy RNAs (default: "DECOY"). + /// + /// A constructed object containing all parsed and resolved information, + /// or null if the entry is incomplete. + /// internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string rnaDbLocation, IEnumerable modTypesToExclude, Dictionary unknownModifications, string decoyIdentifier = "DECOY") { @@ -381,6 +488,8 @@ internal RNA ParseRnaEntryEndElement(XmlReader xml, bool isContaminant, string r // sanitize the sequence to replace unexpected characters with X (unknown amino acid) // sometimes strange characters get added by RNA sequencing software, etc. Sequence = ProteinDbLoader.SanitizeAminoAcidSequence(Sequence, 'X'); + //prune any sequence variants whose coordinates exceed the known sequence length + PruneOutOfRangeSequenceVariants(); if (Accession.StartsWith(decoyIdentifier)) { isDecoy = true; @@ -407,8 +516,19 @@ public void ParseSubFeatureEndElement(XmlReader xml, IEnumerable modType } /// - /// Finish parsing a feature element + /// Processes the end of a <feature> element during XML parsing and updates the internal state with the parsed feature information. + /// Depending on the feature type, this method: + /// + /// Adds modification annotations for "modified residue" and "lipid moiety-binding region" features. + /// Creates and adds objects for proteolytic features such as "peptide", "propeptide", "chain", and "signal peptide". + /// Handles "sequence variant" features by creating objects, including variant-specific modifications, and ensures they apply to the correct sequence or isoform. + /// Creates and adds or objects for their respective feature types, using available position information. + /// + /// After processing, resets feature-related state variables to prepare for the next feature. /// + /// The positioned at the end of the <feature> element. + /// A collection of modification types to exclude from the protein. + /// A dictionary to collect modifications that could not be resolved. public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesToExclude, Dictionary unknownModifications) { if (FeatureType == "modified residue") @@ -445,24 +565,56 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesTo } else { - type += ("null-null"); + type += "null-null"; } } ProteolysisProducts.Add(new TruncationProduct(OneBasedBeginPosition, OneBasedEndPosition, type)); } - else if (FeatureType == "sequence variant" && VariationValue != null && VariationValue != "") // Only keep if there is variant sequence information and position information + else if (FeatureType == "sequence variant" && VariationValue != null && VariationValue != "") { - ParseAnnotatedMods(OneBasedVariantModifications, modTypesToExclude, unknownModifications, AnnotatedVariantMods); - if (OneBasedBeginPosition != null && OneBasedEndPosition != null) + bool appliesToThisSequence = true; + if (!string.IsNullOrEmpty(LocationSequenceId)) { - SequenceVariations.Add(new SequenceVariation((int)OneBasedBeginPosition, (int)OneBasedEndPosition, OriginalValue, VariationValue, FeatureDescription, OneBasedVariantModifications)); + string acc = Accession ?? ""; + appliesToThisSequence = + LocationSequenceId.Equals(acc, StringComparison.OrdinalIgnoreCase) + || (!string.IsNullOrEmpty(acc) && LocationSequenceId.Equals($"{acc}-1", StringComparison.OrdinalIgnoreCase)); } - else if (OneBasedFeaturePosition >= 1) + + if (appliesToThisSequence) { - SequenceVariations.Add(new SequenceVariation(OneBasedFeaturePosition, OriginalValue, VariationValue, FeatureDescription, OneBasedVariantModifications)); + ParseAnnotatedMods(OneBasedVariantModifications, modTypesToExclude, unknownModifications, AnnotatedVariantMods); + + // NOTE: We can NOT validate coordinate vs sequence length here because sequence is usually parsed later. + // Validation is deferred to PruneOutOfRangeSequenceVariants() during ParseEntryEndElement. + + if (OneBasedBeginPosition != null && OneBasedEndPosition != null) + { + SequenceVariations.Add( + new SequenceVariation( + (int)OneBasedBeginPosition, + (int)OneBasedEndPosition, + OriginalValue, + VariationValue, + FeatureDescription, + //variantCallFormatDataString: null, + oneBasedModifications: OneBasedVariantModifications)); + } + else if (OneBasedFeaturePosition >= 1) + { + SequenceVariations.Add( + new SequenceVariation( + OneBasedFeaturePosition, + OriginalValue, + VariationValue, + FeatureDescription, + //variantCallFormatDataString: null, + oneBasedModifications: OneBasedVariantModifications)); + } + + AnnotatedVariantMods = new List<(int, string)>(); + OneBasedVariantModifications = new Dictionary>(); } - AnnotatedVariantMods = new List<(int, string)>(); - OneBasedVariantModifications = new Dictionary>(); } else if (FeatureType == "disulfide bond") { @@ -491,64 +643,7 @@ public void ParseFeatureEndElement(XmlReader xml, IEnumerable modTypesTo OneBasedFeaturePosition = -1; OriginalValue = ""; VariationValue = ""; - } - - private static void ParseAnnotatedMods(Dictionary> destination, IEnumerable modTypesToExclude, - Dictionary unknownModifications, List<(int, string)> annotatedMods) - { - foreach (var annotatedMod in annotatedMods) - { - string annotatedId = annotatedMod.Item2; - int annotatedModLocation = annotatedMod.Item1; - - if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod) - || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod)) - { - // if the list of known mods contains this IdWithMotif - if (!modTypesToExclude.Contains(foundMod.ModificationType)) - { - if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation)) - { - listOfModsAtThisLocation.Add(foundMod); - } - else - { - destination.Add(annotatedModLocation, new List { foundMod }); - } - } - // else - the mod ID was found but the motif didn't fit the annotated location - } - - // no known mod - try looking it up in the dictionary of mods without motif appended - else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods) - || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods)) - { - foreach (Modification mod in mods) - { - if (!modTypesToExclude.Contains(mod.ModificationType)) - { - if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation)) - { - listOfModsAtThisLocation.Add(mod); - } - else - { - destination.Add(annotatedModLocation, new List { mod }); - } - break; - } - } - } - else - { - // could not find the annotated mod's ID in our list of known mods - it's an unknown mod - // I don't think this really does anything... - if (!unknownModifications.ContainsKey(annotatedId)) - { - unknownModifications.Add(annotatedId, new Modification(annotatedId)); - } - } - } + LocationSequenceId = null; } /// @@ -604,6 +699,86 @@ private void Clear() GeneNames = new List>(); ReadingGene = false; ReadingOrganism = false; + LocationSequenceId = null; + AnnotatedVariantMods = new List<(int, string)>(); + OneBasedVariantModifications = new Dictionary>(); + } + /// + /// Resolves and attaches annotated modifications to the specified destination dictionary based on parsed feature or variant annotations. + /// For each annotated modification, attempts to look up the modification by its identifier (with motif) in both protein and RNA modification dictionaries. + /// If found and not excluded by , the modification is added to the destination at the specified position. + /// If not found by identifier, attempts to resolve the modification by possible matches (without motif) and adds the first non-excluded match. + /// If no match is found, records the modification as unknown in to avoid repeated warnings. + /// This method is used to populate the protein or variant modification dictionaries during XML parsing. + /// + /// Dictionary mapping one-based positions to lists of modifications to be populated. + /// A collection of modification types to exclude from assignment. + /// A dictionary to collect modifications that could not be resolved by identifier or type. + /// List of (position, modification identifier) tuples parsed from XML features or subfeatures. + private static void ParseAnnotatedMods( + Dictionary> destination, + IEnumerable modTypesToExclude, + Dictionary unknownModifications, + List<(int position, string originalModificationID)> annotatedMods) + { + foreach (var (annotatedModLocation, annotatedId) in annotatedMods) + { + if (ProteinDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out Modification foundMod) + || RnaDbLoader.IdWithMotifToMod.TryGetValue(annotatedId, out foundMod)) + { + if (!modTypesToExclude.Contains(foundMod.ModificationType)) + { + if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation)) + { + listOfModsAtThisLocation.Add(foundMod); + } + else + { + destination.Add(annotatedModLocation, new List { foundMod }); + } + } + } + else if (ProteinDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out IList mods) + || RnaDbLoader.IdToPossibleMods.TryGetValue(annotatedId, out mods)) + { + foreach (Modification mod in mods) + { + if (!modTypesToExclude.Contains(mod.ModificationType)) + { + if (destination.TryGetValue(annotatedModLocation, out var listOfModsAtThisLocation)) + { + listOfModsAtThisLocation.Add(mod); + } + else + { + destination.Add(annotatedModLocation, new List { mod }); + } + break; + } + } + } + else + { + if (!unknownModifications.ContainsKey(annotatedId)) + { + unknownModifications.Add(annotatedId, new Modification(annotatedId)); + } + } + } + } + private void PruneOutOfRangeSequenceVariants() + { + if (string.IsNullOrEmpty(Sequence) || SequenceVariations.Count == 0) + return; + + int len = Sequence.Length; + int removed = SequenceVariations.RemoveAll(v => + v.OneBasedBeginPosition > len || v.OneBasedEndPosition > len); + + if (removed > 0) + { + Trace.TraceWarning($"Pruned {removed} out-of-range sequence variant(s) for accession {Accession} (protein length {len})."); + } } } } \ No newline at end of file