From 4bc016df495efbfb6038a363c42563a79e3c83cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Thu, 9 Jan 2025 14:47:44 +0100 Subject: [PATCH 1/4] feat: first steps to add vaf calculation for strelka --- .gitignore | 3 ++- workflow/envs/cyvcf.yaml | 5 +++++ workflow/rules/common.smk | 2 ++ workflow/rules/eval.smk | 13 +++++++++++++ workflow/scripts/calc-vaf.py | 36 ++++++++++++++++++++++++++++++++++++ 5 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 workflow/envs/cyvcf.yaml create mode 100644 workflow/scripts/calc-vaf.py diff --git a/.gitignore b/.gitignore index 3d2ebb6..1cdcbf7 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ resources/** logs logs/** data/** -report/** \ No newline at end of file +report/** +test/** \ No newline at end of file diff --git a/workflow/envs/cyvcf.yaml b/workflow/envs/cyvcf.yaml new file mode 100644 index 0000000..bf54b6e --- /dev/null +++ b/workflow/envs/cyvcf.yaml @@ -0,0 +1,5 @@ +channels: + - conda-forge + - bioconda +dependencies: + - cyvcf2 \ No newline at end of file diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index 12188a9..e4ddb27 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -455,6 +455,8 @@ def get_vaf_status(wildcards): vaf_benchmark = benchmarks[wildcards.benchmark].get("vaf-field") if vaf_benchmark is None: return False + if vaf_benchmark == "tbc": + return True else: callsets = get_benchmark_callsets(wildcards.benchmark) vaf_callsets = [ diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 724f53a..624ff73 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -91,6 +91,19 @@ rule remove_non_pass: "v3.3.6/bio/bcftools/view" +rule calculate_vaf: + input: + "results/filtered-variants/{callset}.bcf" + output: + "results/calculate-vaf/{callset}.added-vaf.bcf" + log: + "logs/calculate-vaf/" + conda: + "../envs/cyvcf.yaml" + script: + "../scripts/calc-vaf.py" + + rule intersect_calls_with_target_regions: input: bcf="results/filtered-variants/{callset}.bcf", diff --git a/workflow/scripts/calc-vaf.py b/workflow/scripts/calc-vaf.py new file mode 100644 index 0000000..4df4474 --- /dev/null +++ b/workflow/scripts/calc-vaf.py @@ -0,0 +1,36 @@ +from cyvcf2 import VCF + +#vcf = "/Users/famke/01-pm4onco/osf-download/pipeline-results-of-imgag-data/qbic/strelka/tumor_5perc_vs_normal_5perc.strelka.somatic_snvs_VEP.ann.vcf" #snakemake.input +#indel = "/Users/famke/01-pm4onco/osf-download/pipeline-results-of-imgag-data/qbic/strelka/tumor_5perc_vs_normal_5perc.strelka.somatic_indels_VEP.ann.vcf.gz" + +# +# SNVS +# + +def get_snv_allele_freq(vcf): + for variant in VCF(vcf): + refCounts = variant.format(variant.REF + "U") + altCounts = variant.format(variant.ALT[0] + "U") + + # TODO: check which value is the correct one from the matrix (this leads to many zero VAF) + tier1RefCounts = refCounts[0, 0] + tier1AltCounts = altCounts[0, 0] + + vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts) + + print(vaf) + + + +# +# INDELs +# + +def get_indel_allele_freq(vcf): + for variant in VCF(vcf): + tier1RefCounts = variant.format("TAR")[0,0] + tier1AltCounts = variant.format("TIR")[0,0] + + vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts) + + print(vaf) From e78b133d94103ae2b6fa40c49eb20676f7146a06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Thu, 9 Jan 2025 14:51:20 +0100 Subject: [PATCH 2/4] fix: snakefmt --- workflow/rules/eval.smk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 624ff73..a82c62d 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -93,11 +93,11 @@ rule remove_non_pass: rule calculate_vaf: input: - "results/filtered-variants/{callset}.bcf" + "results/filtered-variants/{callset}.bcf", output: - "results/calculate-vaf/{callset}.added-vaf.bcf" + "results/calculate-vaf/{callset}.added-vaf.bcf", log: - "logs/calculate-vaf/" + "logs/calculate-vaf/", conda: "../envs/cyvcf.yaml" script: From ab3ffbb4b3d0906b62aad4940350f53b1757ef7b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Mon, 7 Apr 2025 15:16:46 +0200 Subject: [PATCH 3/4] fix: linting --- workflow/rules/eval.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/rules/eval.smk b/workflow/rules/eval.smk index 1f1fe5b..d60fb04 100644 --- a/workflow/rules/eval.smk +++ b/workflow/rules/eval.smk @@ -115,7 +115,7 @@ rule calculate_vaf: output: "results/calculate-vaf/{callset}.added-vaf.bcf", log: - "logs/calculate-vaf/", + "logs/calculate-vaf/{callset}.log", conda: "../envs/cyvcf.yaml" script: From 8c78c46428dc53b0ed3c0c03593645c5b4efba84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Famke=20Ba=CC=88uerle?= Date: Thu, 15 May 2025 15:56:02 +0200 Subject: [PATCH 4/4] fix: extend script --- workflow/scripts/calc-vaf.py | 159 +++++++++++++++++++++++++++++------ 1 file changed, 134 insertions(+), 25 deletions(-) diff --git a/workflow/scripts/calc-vaf.py b/workflow/scripts/calc-vaf.py index 4df4474..fe528aa 100644 --- a/workflow/scripts/calc-vaf.py +++ b/workflow/scripts/calc-vaf.py @@ -1,36 +1,145 @@ -from cyvcf2 import VCF +from cyvcf2 import VCF, Writer +import sys +import argparse +import numpy as np + +def get_snv_allele_freq(variant): + refCounts = variant.format(variant.REF + "U") + altCounts = variant.format(variant.ALT[0] + "U") -#vcf = "/Users/famke/01-pm4onco/osf-download/pipeline-results-of-imgag-data/qbic/strelka/tumor_5perc_vs_normal_5perc.strelka.somatic_snvs_VEP.ann.vcf" #snakemake.input -#indel = "/Users/famke/01-pm4onco/osf-download/pipeline-results-of-imgag-data/qbic/strelka/tumor_5perc_vs_normal_5perc.strelka.somatic_indels_VEP.ann.vcf.gz" + # TODO: check which value is the correct one from the matrix (this leads to many zero VAF) + tier1RefCounts = refCounts[0, 0] + tier1AltCounts = altCounts[0, 0] -# -# SNVS -# - -def get_snv_allele_freq(vcf): - for variant in VCF(vcf): - refCounts = variant.format(variant.REF + "U") - altCounts = variant.format(variant.ALT[0] + "U") + vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts) - # TODO: check which value is the correct one from the matrix (this leads to many zero VAF) - tier1RefCounts = refCounts[0, 0] - tier1AltCounts = altCounts[0, 0] + return vaf - vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts) +def get_indel_allele_freq(variant): + tier1RefCounts = variant.format("TAR")[0,0] + tier1AltCounts = variant.format("TIR")[0,0] - print(vaf) + vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts) + return vaf +def calculate_vaf(variant, samples): + """ + Calculates the Variant Allele Frequency (VAF) for each sample and + each alternate allele based on the AD (Allelic Depth) FORMAT field. + Args: + variant (cyvcf2.Variant): The variant object. + samples (list): List of sample names. -# -# INDELs -# + Returns: + numpy.ndarray: A numpy array of shape (n_samples, n_alt_alleles) + containing the VAFs. Returns None if AD field is missing. + Missing values or divisions by zero result in np.nan. + """ + try: + # Get Allelic Depths (AD) - shape: (n_samples, n_alleles) + # n_alleles includes the reference allele + ad = variant.format('AD') + except KeyError: + # AD field is not present for this variant + print(f"Warning: AD field missing for variant at {variant.CHROM}:{variant.POS}. Skipping VAF calculation.", file=sys.stderr) + return None -def get_indel_allele_freq(vcf): - for variant in VCF(vcf): - tier1RefCounts = variant.format("TAR")[0,0] - tier1AltCounts = variant.format("TIR")[0,0] + n_samples = len(samples) + n_alleles = len(variant.alleles) # Includes reference + n_alt_alleles = n_alleles - 1 - vaf = tier1AltCounts / (tier1AltCounts + tier1RefCounts) + # Initialize VAF array with NaNs + # Shape: (n_samples, n_alt_alleles) + vaf_values = np.full((n_samples, n_alt_alleles), np.nan, dtype=np.float32) - print(vaf) + for i in range(n_samples): + sample_ad = ad[i] + + # Check for missing AD data for the sample (represented by negative values or could be None depending on VCF) + # cyvcf2 often uses negative numbers for missing integers in FORMAT fields like AD + if np.any(sample_ad < 0): + # Keep VAF as NaN if AD is missing for this sample + continue + + # Calculate total depth for this sample + total_depth = np.sum(sample_ad) + + if total_depth == 0: + # Avoid division by zero, keep VAFs as NaN + continue + + # Calculate VAF for each alternate allele + for j in range(n_alt_alleles): + alt_depth = sample_ad[j + 1] # AD format is [ref_depth, alt1_depth, alt2_depth, ...] + vaf = alt_depth / total_depth + vaf_values[i, j] = vaf + + return vaf_values + +def add_vaf_to_vcf(input_vcf_path, output_vcf_path): + """ + Reads an input VCF, calculates VAF for each variant/sample, adds it + as a new FORMAT field 'VAF', and writes to an output VCF file. + """ + # Open the input VCF file + try: + vcf_reader = VCF(input_vcf_path) + except Exception as e: + print(f"Error opening input VCF file '{input_vcf_path}': {e}", file=sys.stderr) + sys.exit(1) + + + # Add the new VAF FORMAT field definition to the header + # Number='A' means one value per alternate allele + # Type='Float' for the frequency value + try: + vcf_reader.add_format_to_header({ + 'ID': 'VAF', + 'Description': 'Variant Allele Frequency calculated from AD field (Alt Depth / Total Depth)', + 'Type': 'Float', + 'Number': 'A' # One value per alternate allele + }) + except ValueError as e: + # Catch error if the field already exists + print(f"Warning: FORMAT field 'VAF' might already exist in header: {e}", file=sys.stderr) + + + # Create a VCF writer object using the modified header + try: + vcf_writer = Writer(output_vcf_path, vcf_reader) + except Exception as e: + print(f"Error creating output VCF file '{output_vcf_path}': {e}", file=sys.stderr) + vcf_reader.close() + sys.exit(1) + + print(f"Processing VCF: {input_vcf_path}") + print(f"Writing output to: {output_vcf_path}") + + processed_count = 0 + # Iterate through each variant in the VCF + for variant in vcf_reader: + # Calculate VAFs for all samples for the current variant + vaf_array = calculate_vaf(variant, vcf_reader.samples) + + # Add the calculated VAFs to the variant's FORMAT fields + # The vaf_array must have shape (n_samples, n_alt_alleles) + if vaf_array is not None: + try: + # Use set_format to add/update the VAF field for all samples + variant.set_format('VAF', vaf_array) + except Exception as e: + print(f"Error setting VAF format for variant at {variant.CHROM}:{variant.POS}: {e}", file=sys.stderr) + # Decide if you want to skip writing this variant or write without VAF + # Here, we'll still write the variant but VAF might be missing/incorrect + + # Write the (potentially modified) variant record to the output file + vcf_writer.write_record(variant) + processed_count += 1 + if processed_count % 1000 == 0: + print(f"Processed {processed_count} variants...", file=sys.stderr) + + # Close the VCF reader and writer + vcf_reader.close() + vcf_writer.close() + print(f"Finished processing. Total variants processed: {processed_count}") \ No newline at end of file