Skip to content

feat: add vaf calculation for strelka results #109

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions workflow/envs/cyvcf.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
channels:
- conda-forge
- bioconda
dependencies:
- cyvcf2
2 changes: 2 additions & 0 deletions workflow/rules/common.smk
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,8 @@ def get_vaf_status(wildcards):
vaf_benchmark = benchmarks[wildcards.benchmark].get("vaf-field")
if vaf_benchmark is None:
return False
if vaf_benchmark == "tbc":
return True
else:
callsets = get_benchmark_callsets(wildcards.benchmark)
vaf_callsets = [
Expand Down
13 changes: 13 additions & 0 deletions workflow/rules/eval.smk
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,19 @@ rule remove_non_pass:
"v3.3.6/bio/bcftools/view"


rule calculate_vaf:
input:
"results/filtered-variants/{callset}.bcf",
output:
"results/calculate-vaf/{callset}.added-vaf.bcf",
log:
"logs/calculate-vaf/{callset}.log",
conda:
"../envs/cyvcf.yaml"
script:
"../scripts/calc-vaf.py"


rule intersect_calls_with_target_regions:
input:
bcf="results/filtered-variants/{callset}.bcf",
Expand Down
145 changes: 145 additions & 0 deletions workflow/scripts/calc-vaf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
from cyvcf2 import VCF, Writer
import sys
import argparse
import numpy as np

def get_snv_allele_freq(variant):
refCounts = variant.format(variant.REF + "U")
altCounts = variant.format(variant.ALT[0] + "U")

# TODO: check which value is the correct one from the matrix (this leads to many zero VAF)
tier1RefCounts = refCounts[0, 0]
tier1AltCounts = altCounts[0, 0]

vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)

return vaf

def get_indel_allele_freq(variant):
tier1RefCounts = variant.format("TAR")[0,0]
tier1AltCounts = variant.format("TIR")[0,0]

vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts)
return vaf

def calculate_vaf(variant, samples):
"""
Calculates the Variant Allele Frequency (VAF) for each sample and
each alternate allele based on the AD (Allelic Depth) FORMAT field.

Args:
variant (cyvcf2.Variant): The variant object.
samples (list): List of sample names.

Returns:
numpy.ndarray: A numpy array of shape (n_samples, n_alt_alleles)
containing the VAFs. Returns None if AD field is missing.
Missing values or divisions by zero result in np.nan.
"""
try:
# Get Allelic Depths (AD) - shape: (n_samples, n_alleles)
# n_alleles includes the reference allele
ad = variant.format('AD')
except KeyError:
# AD field is not present for this variant
print(f"Warning: AD field missing for variant at {variant.CHROM}:{variant.POS}. Skipping VAF calculation.", file=sys.stderr)
return None

n_samples = len(samples)
n_alleles = len(variant.alleles) # Includes reference
n_alt_alleles = n_alleles - 1

# Initialize VAF array with NaNs
# Shape: (n_samples, n_alt_alleles)
vaf_values = np.full((n_samples, n_alt_alleles), np.nan, dtype=np.float32)

for i in range(n_samples):
sample_ad = ad[i]

# Check for missing AD data for the sample (represented by negative values or could be None depending on VCF)
# cyvcf2 often uses negative numbers for missing integers in FORMAT fields like AD
if np.any(sample_ad < 0):
# Keep VAF as NaN if AD is missing for this sample
continue

# Calculate total depth for this sample
total_depth = np.sum(sample_ad)

if total_depth == 0:
# Avoid division by zero, keep VAFs as NaN
continue

# Calculate VAF for each alternate allele
for j in range(n_alt_alleles):
alt_depth = sample_ad[j + 1] # AD format is [ref_depth, alt1_depth, alt2_depth, ...]
vaf = alt_depth / total_depth
vaf_values[i, j] = vaf

return vaf_values

def add_vaf_to_vcf(input_vcf_path, output_vcf_path):
"""
Reads an input VCF, calculates VAF for each variant/sample, adds it
as a new FORMAT field 'VAF', and writes to an output VCF file.
"""
# Open the input VCF file
try:
vcf_reader = VCF(input_vcf_path)
except Exception as e:
print(f"Error opening input VCF file '{input_vcf_path}': {e}", file=sys.stderr)
sys.exit(1)


# Add the new VAF FORMAT field definition to the header
# Number='A' means one value per alternate allele
# Type='Float' for the frequency value
try:
vcf_reader.add_format_to_header({
'ID': 'VAF',
'Description': 'Variant Allele Frequency calculated from AD field (Alt Depth / Total Depth)',
'Type': 'Float',
'Number': 'A' # One value per alternate allele
})
except ValueError as e:
# Catch error if the field already exists
print(f"Warning: FORMAT field 'VAF' might already exist in header: {e}", file=sys.stderr)


# Create a VCF writer object using the modified header
try:
vcf_writer = Writer(output_vcf_path, vcf_reader)
except Exception as e:
print(f"Error creating output VCF file '{output_vcf_path}': {e}", file=sys.stderr)
vcf_reader.close()
sys.exit(1)

print(f"Processing VCF: {input_vcf_path}")
print(f"Writing output to: {output_vcf_path}")

processed_count = 0
# Iterate through each variant in the VCF
for variant in vcf_reader:
# Calculate VAFs for all samples for the current variant
vaf_array = calculate_vaf(variant, vcf_reader.samples)

# Add the calculated VAFs to the variant's FORMAT fields
# The vaf_array must have shape (n_samples, n_alt_alleles)
if vaf_array is not None:
try:
# Use set_format to add/update the VAF field for all samples
variant.set_format('VAF', vaf_array)
except Exception as e:
print(f"Error setting VAF format for variant at {variant.CHROM}:{variant.POS}: {e}", file=sys.stderr)
# Decide if you want to skip writing this variant or write without VAF
# Here, we'll still write the variant but VAF might be missing/incorrect

# Write the (potentially modified) variant record to the output file
vcf_writer.write_record(variant)
processed_count += 1
if processed_count % 1000 == 0:
print(f"Processed {processed_count} variants...", file=sys.stderr)

# Close the VCF reader and writer
vcf_reader.close()
vcf_writer.close()
Comment on lines +1 to +144
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Add a main block to make the script executable.

The script doesn't have a way to be executed from the command line. Since the file will likely be used as a command-line tool in a Snakemake workflow, you should add a main block to parse arguments and execute the add_vaf_to_vcf function.

import sys
import numpy as np
+import argparse

...existing code...

vcf_reader.close()
vcf_writer.close()
print(f"Finished processing. Total variants processed: {processed_count}")

+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Add VAF information to VCF files")
+    parser.add_argument("input_vcf", help="Input VCF file")
+    parser.add_argument("output_vcf", help="Output VCF file with VAF added")
+    args = parser.parse_args()
+    
+    add_vaf_to_vcf(args.input_vcf, args.output_vcf)

Committable suggestion skipped: line range outside the PR's diff.

🧰 Tools
🪛 Ruff (0.8.2)

3-3: argparse imported but unused

Remove unused import: argparse

(F401)

🤖 Prompt for AI Agents
In workflow/scripts/calc-vaf.py around lines 1 to 144, the script lacks a main
block to allow command-line execution. Add a main block at the end of the file
that uses argparse to parse input and output VCF file paths from command-line
arguments, then calls the add_vaf_to_vcf function with those arguments. This
will enable the script to be run directly and integrated into workflows like
Snakemake.

print(f"Finished processing. Total variants processed: {processed_count}")