diff --git a/download_and_create_reference_datasets/v02/create_ht__cadd.py b/download_and_create_reference_datasets/v02/create_ht__cadd.py deleted file mode 100755 index 7b90c7c34..000000000 --- a/download_and_create_reference_datasets/v02/create_ht__cadd.py +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env python3 - -from kubernetes.shell_utils import simple_run as run - -run(( - "python3 gcloud_dataproc/v02/run_script.py " - "--cluster create-ht-cadd " - "download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py")) diff --git a/download_and_create_reference_datasets/v02/create_ht__clinvar.py b/download_and_create_reference_datasets/v02/create_ht__clinvar.py deleted file mode 100755 index cd5928121..000000000 --- a/download_and_create_reference_datasets/v02/create_ht__clinvar.py +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env python3 - -from kubernetes.shell_utils import simple_run as run - -run(( - "python3 gcloud_dataproc/v02/run_script.py " - "--cluster create-ht-clinvar " - "download_and_create_reference_datasets/v02/hail_scripts/write_clinvar_ht.py")) diff --git a/download_and_create_reference_datasets/v02/create_ht__combined_reference_data.py b/download_and_create_reference_datasets/v02/create_ht__combined_reference_data.py deleted file mode 100644 index 482e5298f..000000000 --- a/download_and_create_reference_datasets/v02/create_ht__combined_reference_data.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python3 - -import argparse -from kubernetes.shell_utils import simple_run as run - -parser = argparse.ArgumentParser() -parser.add_argument('-b', '--build', help='Reference build, 37 or 38', choices=["37", "38"], required=True) -args = parser.parse_args() - -run(( - "python3 gcloud_dataproc/v02/run_script.py " - "--cluster create-ht-combined-reference-data " - "download_and_create_reference_datasets/v02/hail_scripts/write_combined_reference_data_ht.py " - f"--build {args.build}")) diff --git 
a/download_and_create_reference_datasets/v02/create_ht__eigen.py b/download_and_create_reference_datasets/v02/create_ht__eigen.py deleted file mode 100644 index 122a21631..000000000 --- a/download_and_create_reference_datasets/v02/create_ht__eigen.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python3 - -from kubernetes.shell_utils import simple_run as run - -for genome_version, vcf_path in [ - ("37", "gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.vcf.gz"), - ("38", "gs://seqr-reference-data/GRCh38/eigen/EIGEN_coding_noncoding.liftover_grch38.vcf.gz"), -]: - run(("python3 gcloud_dataproc/v02/run_script.py " - "--cluster create-ht-eigen " - "hail_scripts/v02/convert_vcf_to_hail.py " - "--output-sites-only-ht " - f"--genome-version {genome_version} " - f"{vcf_path}")) diff --git a/download_and_create_reference_datasets/v02/create_ht__mpc.py b/download_and_create_reference_datasets/v02/create_ht__mpc.py deleted file mode 100755 index 030b15613..000000000 --- a/download_and_create_reference_datasets/v02/create_ht__mpc.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python3 - -from kubernetes.shell_utils import simple_run as run - -for genome_version, vcf_path in [ - ("37", "gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.vcf.gz"), - ("38", "gs://seqr-reference-data/GRCh38/MPC/fordist_constraint_official_mpc_values.liftover.GRCh38.vcf.gz"), -]: - run(("python3 gcloud_dataproc/v02/run_script.py " - "--cluster create-ht-mpc " - "hail_scripts/v02/convert_vcf_to_hail.py " - "--output-sites-only-ht " - f"--genome-version {genome_version} " - f"{vcf_path}")) diff --git a/download_and_create_reference_datasets/v02/create_ht__primate_ai.py b/download_and_create_reference_datasets/v02/create_ht__primate_ai.py deleted file mode 100644 index 0c97a50d7..000000000 --- a/download_and_create_reference_datasets/v02/create_ht__primate_ai.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python3 - -from kubernetes.shell_utils import 
simple_run as run - -for genome_version, vcf_path in [ - ("37", "gs://seqr-reference-data/GRCh37/primate_ai/PrimateAI_scores_v0.2.vcf.gz"), - ("38", "gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.vcf.gz"), -]: - run(("python3 gcloud_dataproc/v02/run_script.py " - "--cluster create-ht-primate-ai " - "hail_scripts/v02/convert_vcf_to_hail.py " - "--output-sites-only-ht " - f"--genome-version {genome_version} " - f"{vcf_path}")) diff --git a/download_and_create_reference_datasets/v02/create_ht__topmed.py b/download_and_create_reference_datasets/v02/create_ht__topmed.py deleted file mode 100755 index f06937d8e..000000000 --- a/download_and_create_reference_datasets/v02/create_ht__topmed.py +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env python3 - -from kubernetes.shell_utils import simple_run as run - -for genome_version, vcf_path in [ - ("37", "gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.vcf.gz"), - ("38", "gs://seqr-reference-data/GRCh38/TopMed/bravo-dbsnp-all.vcf.gz"), -]: - run(("python3 gcloud_dataproc/v02/run_script.py " - "--cluster create-ht-topmed " - "hail_scripts/v02/convert_vcf_to_hail.py " - "--output-sites-only-ht " - f"--genome-version {genome_version} " - f"{vcf_path}")) diff --git a/download_and_create_reference_datasets/v02/hail_scripts/write_1kg_ht.py b/download_and_create_reference_datasets/v02/hail_scripts/write_1kg_ht.py deleted file mode 100644 index aa4a216af..000000000 --- a/download_and_create_reference_datasets/v02/hail_scripts/write_1kg_ht.py +++ /dev/null @@ -1,71 +0,0 @@ -import logging - -import hail as hl - -from hail_scripts.utils.hail_utils import import_vcf - -logger = logging.getLogger('v02.hail_scripts.create_1kg_ht') - -CONFIG= { - "37": "gs://seqr-reference-data/GRCh37/1kg/1kg.wgs.phase3.20130502.GRCh37_sites.vcf.gz", - "38": "gs://seqr-reference-data/GRCh38/1kg/1kg.wgs.phase3.20170504.GRCh38_sites.vcf.gz" -} - -def vcf_to_mt(path, genome_version): - 
''' - Converts 1kg vcf to mt. The 1kg dataset has multi-allelic variants and duplicates. - This function independently filters the mutli-allelics to split, then unions with - the bi-allelics. - - :param path: vcf path - :param genome_version: genome version - :return: - ''' - # Import but do not split multis here. - mt = import_vcf(path, - genome_version=genome_version, - min_partitions=1000, - split_multi_alleles=False) - - multiallelic_mt = mt.filter_rows(hl.len(mt.alleles) > 2) - multiallelic_mt = hl.split_multi_hts(multiallelic_mt) - - # We annotate some rows manually to conform to the multiallelic_mt (after split). - # Calling split_multi_hts on biallelic to annotate the rows causes problems. - biallelic_mt = mt.filter_rows(hl.len(mt.alleles) == 2) - biallelic_mt = biallelic_mt.annotate_rows(a_index=1, was_split=False) - - all_mt = biallelic_mt.union_rows(multiallelic_mt) - all_mt = all_mt.key_rows_by(all_mt.locus, all_mt.alleles) - - # 37 is known to have some unneeded symbolic alleles, so we filter out. - all_mt = all_mt.filter_rows( - hl.allele_type(all_mt.alleles[0], all_mt.alleles[1]) == 'Symbolic', - keep=False - ) - - return all_mt - -def annotate_mt(mt): - # Annotate POPMAX_AF, which is max of respective fields using a_index for multi-allelics. 
- return mt.annotate_rows(POPMAX_AF=hl.max(mt.info.AFR_AF[mt.a_index-1], - mt.info.AMR_AF[mt.a_index - 1], - mt.info.EAS_AF[mt.a_index - 1], - mt.info.EUR_AF[mt.a_index - 1], - mt.info.SAS_AF[mt.a_index - 1])) - -def run(): - for genome_version, path in CONFIG.items(): - logger.info('reading from input path: %s' % path) - - mt = vcf_to_mt(path, genome_version) - mt = annotate_mt(mt) - - mt.describe() - - output_path = path.replace(".vcf", "").replace(".gz", "").replace(".bgz", "")\ - .replace(".*", "").replace("*", "") + ".ht" - logger.info('writing to output path: %s' % output_path) - mt.rows().write(output_path) - -run() diff --git a/download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py b/download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py deleted file mode 100644 index 1e3585ffc..000000000 --- a/download_and_create_reference_datasets/v02/hail_scripts/write_cadd_ht.py +++ /dev/null @@ -1,49 +0,0 @@ -#!/usr/bin/env python3 - -# combine the pre-computed CADD .tsvs from https://cadd.gs.washington.edu/download into 1 Table for each genome build - -import logging -logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s') -logger = logging.getLogger() -logger.setLevel(logging.INFO) - - -import hail as hl -from hail_scripts.utils.hail_utils import write_ht, import_table - -hl.init() - - -def import_cadd_table(path: str, genome_version: str) -> hl.Table: - if genome_version not in ("37", "38"): - raise ValueError(f"Invalid genome version: {genome_version}") - - column_names = {'f0': 'chrom', 'f1': 'pos', 'f2': 'ref', 'f3': 'alt', 'f4': 'RawScore', 'f5': 'PHRED'} - types = {'f0': hl.tstr, 'f1': hl.tint, 'f4': hl.tfloat32, 'f5': hl.tfloat32} - - cadd_ht = hl.import_table(path, force_bgz=True, comment="#", no_header=True, types=types, min_partitions=10000) - cadd_ht = cadd_ht.rename(column_names) - chrom = hl.format("chr%s", cadd_ht.chrom) if genome_version == "38" else cadd_ht.chrom - locus = hl.locus(chrom, cadd_ht.pos, 
reference_genome=hl.get_reference(f"GRCh{genome_version}")) - alleles = hl.array([cadd_ht.ref, cadd_ht.alt]) - cadd_ht = cadd_ht.transmute(locus=locus, alleles=alleles) - - cadd_union_ht = cadd_ht.head(0) - for contigs in (range(1, 10), list(range(10, 23)) + ["X", "Y", "MT"]): - contigs = ["chr%s" % contig for contig in contigs] if genome_version == "38" else contigs - cadd_ht_subset = cadd_ht.filter(hl.array(list(map(str, contigs))).contains(cadd_ht.locus.contig)) - cadd_union_ht = cadd_union_ht.union(cadd_ht_subset) - - cadd_union_ht = cadd_union_ht.key_by("locus", "alleles") - - cadd_union_ht.describe() - - return cadd_union_ht - -for genome_version in ["37", "38"]: - snvs_ht = import_cadd_table(f"gs://seqr-reference-data/GRCh{genome_version}/CADD/CADD_snvs.v1.6.tsv.gz", genome_version) - indel_ht = import_cadd_table(f"gs://seqr-reference-data/GRCh{genome_version}/CADD/InDels_v1.6.tsv.gz", genome_version) - - ht = snvs_ht.union(indel_ht) - - ht.naive_coalesce(10000).write(f"gs://seqr-reference-data/GRCh{genome_version}/CADD/CADD_snvs_and_indels.v1.6.ht", overwrite=True) diff --git a/download_and_create_reference_datasets/v02/hail_scripts/write_ccREs_ht.py b/download_and_create_reference_datasets/v02/hail_scripts/write_ccREs_ht.py deleted file mode 100644 index c210c10af..000000000 --- a/download_and_create_reference_datasets/v02/hail_scripts/write_ccREs_ht.py +++ /dev/null @@ -1,59 +0,0 @@ -import logging - -import hail as hl - -logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s") -logger = logging.getLogger() -logger.setLevel(logging.INFO) - -CONFIG = {"38": "gs://seqr-reference-data/GRCh38/ccREs/GRCh38-ccREs.bed"} - - -def make_interval_bed_table(ht, reference_genome): - """ - Remove the extra fields from the input ccREs file and mimic a bed import. - - :param ht: ccREs bed file. - :return: Hail table that mimics basic bed file table. 
- """ - ht = ht.select( - interval=hl.locus_interval( - ht["f0"], - ht["f1"]+1, - ht["f2"]+1, - reference_genome=f"GRCh{reference_genome}", - invalid_missing=True, - ), - target=ht["f5"], - ) - ht = ht.transmute(target=ht.target.split(",")) - return ht.key_by("interval") - - -def run(): - for genome_version, path in CONFIG.items(): - logger.info("Reading from input path: %s", path) - - ht = hl.import_table( - path, - no_header=True, - min_partitions=100, - types={ - "f0": hl.tstr, - "f1": hl.tint32, - "f2": hl.tint32, - "f3": hl.tstr, - "f4": hl.tstr, - "f5": hl.tstr, # Hail throws a JSON parse error when using tarray(hl.tstr) so split string later in function - }, - ) - ht = make_interval_bed_table(ht, genome_version) - - ht.describe() - - output_path = path.replace(".bed", "") + ".ht" - logger.info("Writing to output path: %s", output_path) - ht.write(output_path, overwrite=True) - - -run() diff --git a/download_and_create_reference_datasets/v02/hail_scripts/write_clinvar_ht.py b/download_and_create_reference_datasets/v02/hail_scripts/write_clinvar_ht.py deleted file mode 100644 index e0584112e..000000000 --- a/download_and_create_reference_datasets/v02/hail_scripts/write_clinvar_ht.py +++ /dev/null @@ -1,29 +0,0 @@ -import tempfile - -import hail as hl - -from v03_pipeline.lib.model import ReferenceGenome -from v03_pipeline.lib.reference_data.clinvar import ( - download_and_import_latest_clinvar_vcf, - CLINVAR_GOLD_STARS_LOOKUP, -) -from hail_scripts.utils.hail_utils import write_ht - -CLINVAR_PATH = 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_{reference_genome}/clinvar.vcf.gz' -CLINVAR_HT_PATH = 'gs://seqr-reference-data/{reference_genome}/clinvar/clinvar.{reference_genome}.ht' - -for reference_genome in ReferenceGenome: - clinvar_url = CLINVAR_PATH.format(reference_genome=reference_genome.value) - ht = download_and_import_latest_clinvar_vcf(clinvar_url, reference_genome) - timestamp = hl.eval(ht.version) - ht = ht.annotate( - 
gold_stars=CLINVAR_GOLD_STARS_LOOKUP.get(hl.delimit(ht.info.CLNREVSTAT)) - ) - ht.describe() - ht = ht.repartition(100) - write_ht( - ht, - CLINVAR_HT_PATH.format(reference_genome=reference_genome.value).replace(".ht", ".") - + timestamp - + ".ht", - ) diff --git a/download_and_create_reference_datasets/v02/hail_scripts/write_combined_interval_ref_data.py b/download_and_create_reference_datasets/v02/hail_scripts/write_combined_interval_ref_data.py deleted file mode 100644 index 83e4c74b0..000000000 --- a/download_and_create_reference_datasets/v02/hail_scripts/write_combined_interval_ref_data.py +++ /dev/null @@ -1,43 +0,0 @@ -import argparse -import logging - -import hail as hl - -from v03_pipeline.lib.reference_data.dataset_table_operations import join_hts - -VERSION = '2.0.5' -OUTPUT_PATH = "gs://seqr-reference-data/GRCh38/combined_interval_reference_data/combined_interval_reference_data.ht" - -logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", level="INFO") -logger = logging.getLogger(__name__) - - -def run(args): - hl.init(default_reference="GRCh38") - logger.info("Joining the interval reference datasets") - joined_ht = join_hts( - ["gnomad_non_coding_constraint", "screen"], VERSION, reference_genome="38" - ) - - output_path = args.output_path if args.output_path else OUTPUT_PATH - logger.info("Writing to %s", output_path) - joined_ht.write(output_path, overwrite=args.force_write) - logger.info("Done") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "-f", - "--force-write", - help="Overwrite an existing output file", - action="store_true", - ) - parser.add_argument( - "-o", - "--output-path", - help=f"Output path for the combined reference dataset. 
Default is {OUTPUT_PATH}", - ) - args = parser.parse_args() - - run(args) diff --git a/download_and_create_reference_datasets/v02/hail_scripts/write_combined_reference_data_ht.py b/download_and_create_reference_datasets/v02/hail_scripts/write_combined_reference_data_ht.py deleted file mode 100644 index ae456aab5..000000000 --- a/download_and_create_reference_datasets/v02/hail_scripts/write_combined_reference_data_ht.py +++ /dev/null @@ -1,30 +0,0 @@ -import argparse -import os - -import hail as hl - -from v03_pipeline.lib.reference_data.dataset_table_operations import join_hts -from v03_pipeline.lib.reference_data.config import CONFIG - -VERSION = '2.0.4' -OUTPUT_TEMPLATE = 'gs://seqr-reference-data/GRCh{genome_version}/' \ - 'all_reference_data/v2/combined_reference_data_grch{genome_version}-{version}.ht' - -def run(args): - hl._set_flags(no_whole_stage_codegen='1') # hail 0.2.78 hits an error on the join, this flag gets around it - joined_ht = join_hts(['cadd', 'mpc', 'eigen', 'dbnsfp', 'topmed', 'primate_ai', 'splice_ai', 'exac', - 'gnomad_genomes', 'gnomad_exomes', 'geno2mp', 'gnomad_genome_coverage', 'gnomad_exome_coverage'], - VERSION, - args.build,) - output_path = os.path.join(OUTPUT_TEMPLATE.format(genome_version=args.build, version=VERSION)) - print('Writing to %s' % output_path) - joined_ht.write(os.path.join(output_path)) - - -if __name__ == "__main__": - - parser = argparse.ArgumentParser() - parser.add_argument('-b', '--build', help='Reference build, 37 or 38', choices=["37", "38"], required=True) - args = parser.parse_args() - - run(args) diff --git a/download_and_create_reference_datasets/v02/hail_scripts/write_dbnsfp_ht.py b/download_and_create_reference_datasets/v02/hail_scripts/write_dbnsfp_ht.py deleted file mode 100644 index ede40223f..000000000 --- a/download_and_create_reference_datasets/v02/hail_scripts/write_dbnsfp_ht.py +++ /dev/null @@ -1,147 +0,0 @@ -import hail as hl -from hail.expr import tint, tfloat, tstr - -DBNSFP_INFO = { - 
'2.9.3': { - 'reference_genome': '37', - 'source_path': 'gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.chr*.gz', - 'output_path': 'gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.with_new_scores.ht', - }, - '4.2': { - 'reference_genome': '38', - 'source_path': 'gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.chr*.gz', - 'output_path': 'gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', - }, -} - -# Fields from the dataset file. -DBNSFP_SCHEMA = { - '2.9.3': { - '#chr': tstr, - 'pos(1-coor)': tint, - 'ref': tstr, - 'alt': tstr, - 'SIFT_score': tstr, - 'Polyphen2_HDIV_pred': tstr, - 'Polyphen2_HVAR_score': tstr, - 'LRT_pred': tstr, - 'MutationTaster_pred': tstr, - 'MutationAssessor_pred': tstr, - 'FATHMM_pred': tstr, - 'MetaSVM_pred': tstr, - 'MetaLR_pred': tstr, - 'VEST3_score': tstr, - 'VEST3_rankscore': tstr, - 'PROVEAN_pred': tstr, - 'M-CAP_pred': tstr, - 'REVEL_score': tstr, - 'REVEL_rankscore': tstr, - 'MutPred_Top5features': tstr, - 'Eigen-phred': tstr, - 'Eigen-PC-phred': tstr, - 'GERP++_RS': tstr, - 'GERP++_RS_rankscore': tstr, - 'phyloP46way_primate': tstr, - 'phyloP46way_primate_rankscore': tstr, - 'phyloP46way_placental': tstr, - 'phyloP46way_placental_rankscore': tstr, - 'phyloP100way_vertebrate': tstr, - 'phyloP100way_vertebrate_rankscore': tstr, - 'phastCons46way_primate': tstr, - 'phastCons46way_primate_rankscore': tstr, - 'phastCons46way_placental': tstr, - 'phastCons46way_placental_rankscore': tstr, - 'phastCons100way_vertebrate': tstr, - 'phastCons100way_vertebrate_rankscore': tstr, - 'SiPhy_29way_pi': tstr, - 'SiPhy_29way_logOdds_rankscore': tstr, - 'ESP6500_AA_AF': tfloat, - # This space is intentional and in the file. 
- 'ESP6500_EA_AF ': tfloat, - 'ARIC5606_AA_AC': tint, - 'ARIC5606_AA_AF': tfloat, - 'ARIC5606_EA_AC': tint, - 'ARIC5606_EA_AF': tfloat, - }, - '4.2': { - '#chr': tstr, - 'pos(1-based)': tint, - 'ref': tstr, - 'alt': tstr, - 'SIFT_score': tstr, - 'Polyphen2_HVAR_score': tstr, - 'MutationTaster_pred': tstr, - 'FATHMM_pred': tstr, - 'VEST4_score': tstr, - 'MetaSVM_pred': tstr, - 'REVEL_score': tstr, - 'GERP++_RS': tstr, - 'phastCons100way_vertebrate': tstr, - 'fathmm-MKL_coding_score': tfloat, - 'MutPred_score': tstr, - } -} - -def generate_replacement_fields(ht, schema): - ''' - Hail Tables need to have a fields remapping. This function generates a dict from - the new transformed field name (whitespace stripped, dash to underscore) to original - field name. The original field name references the exact attribute of ht, per - hail construct so we can feed it to the select query. - - :param ht: Hail table to reference the original field attribute. - :param schema: schema mapping from original field name to type - :return: dict of new transformed name to old attr from ht - ''' - def transform(field_name): - return field_name.strip(" `#").replace("(1-coor)", "")\ - .replace("(1-based)", "").replace("-", "_").replace("+", "") - return { - transform(field_name): getattr(ht, field_name) for field_name in schema.keys() - } - -def dbnsfp_to_ht(source_path, output_path, reference_genome='37', dbnsfp_version="2.9.3"): - ''' - Runs the conversion from importing the table from the source path, proessing the - fields, and outputing as a matrix table to the output path. - - :param source_path: location of the dbnsfp data - :param output_path: location to put the matrix table - :param dbnsfp_version: version - :return: - ''' - # Import the table using the schema to define the types. - ht = hl.import_table(source_path, - types=DBNSFP_SCHEMA[dbnsfp_version], - missing='.', - force=True, - min_partitions=10000) - # get a attribute map to run a select and remap fields. 
- replacement_fields = generate_replacement_fields(ht, DBNSFP_SCHEMA[dbnsfp_version]) - ht = ht.select(**replacement_fields) - ht = ht.filter(ht.alt == ht.ref, keep=False) #Ask DBSNFP why ref = alt exists if cant find in docs - - # key_by locus and allele needed for matrix table conversion to denote variant data. - chr = ht.chr if reference_genome == '37' else hl.str('chr' + ht.chr) - locus = hl.locus(chr, ht.pos, reference_genome='GRCh%s'%reference_genome) - # We have to upper because 37 is known to have some non uppercases :( - ht = ht.key_by(locus=locus, alleles=[ht.ref.upper(), ht.alt.upper()]) - - - ht = ht.annotate_globals( - sourceFilePath=source_path, - version=dbnsfp_version, - ) - - ht.write(output_path, overwrite=True) - return ht - -def run(): - for dbnsfp_version, config in DBNSFP_INFO.items(): - ht = dbnsfp_to_ht(config["source_path"], - config["output_path"], - config['reference_genome'], - dbnsfp_version) - ht.describe() - -run() diff --git a/download_and_create_reference_datasets/v02/hail_scripts/write_splice_ai_ht.py b/download_and_create_reference_datasets/v02/hail_scripts/write_splice_ai_ht.py deleted file mode 100644 index 051edc2a5..000000000 --- a/download_and_create_reference_datasets/v02/hail_scripts/write_splice_ai_ht.py +++ /dev/null @@ -1,94 +0,0 @@ -import logging -import os - -import hail as hl - -from gnomad.resources.resource_utils import NO_CHR_TO_CHR_CONTIG_RECODING - -CONFIG = { - "37": ( - "gs://seqr-reference-data/GRCh37/spliceai/new-version-2019-10-11/spliceai_scores.masked.snv.hg19.vcf.gz", - "gs://seqr-reference-data/GRCh37/spliceai/new-version-2019-10-11/spliceai_scores.masked.indel.hg19.vcf.gz", - ), - "38": ( - "gs://seqr-reference-data/GRCh38/spliceai/new-version-2019-10-11/spliceai_scores.masked.snv.hg38.vcf.gz", - "gs://seqr-reference-data/GRCh38/spliceai/new-version-2019-10-11/spliceai_scores.masked.indel.hg38.vcf.gz", - ), -} - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - - -def 
vcf_to_mt(splice_ai_snvs_path, splice_ai_indels_path, genome_version): - """ - Loads the snv path and indels source path to a matrix table and returns the table. - - :param splice_ai_snvs_path: source location - :param splice_ai_indels_path: source location - :return: matrix table - """ - - logger.info( - "==> reading in splice_ai vcfs: %s, %s" - % (splice_ai_snvs_path, splice_ai_indels_path) - ) - - # for 37, extract to MT, for 38, MT not included. - interval = "1-MT" if genome_version == "37" else "chr1-chrY" - contig_dict = None - if genome_version == "38": - contig_dict = NO_CHR_TO_CHR_CONTIG_RECODING - - mt = hl.import_vcf( - [splice_ai_snvs_path, splice_ai_indels_path], - reference_genome=f"GRCh{genome_version}", - contig_recoding=contig_dict, - force_bgz=True, - min_partitions=10000, - skip_invalid_loci=True, - ) - interval = [ - hl.parse_locus_interval(interval, reference_genome=f"GRCh{genome_version}") - ] - mt = hl.filter_intervals(mt, interval) - - # Split SpliceAI field by | delimiter. Capture delta score entries and map to floats - delta_scores = mt.info.SpliceAI[0].split(delim="\\|")[2:6] - splice_split = mt.info.annotate( - SpliceAI=hl.map(lambda x: hl.float32(x), delta_scores) - ) - mt = mt.annotate_rows(info=splice_split) - - # Annotate info.max_DS with the max of DS_AG, DS_AL, DS_DG, DS_DL in info. - # delta_score array is |DS_AG|DS_AL|DS_DG|DS_DL - consequences = hl.literal( - ["Acceptor gain", "Acceptor loss", "Donor gain", "Donor loss"] - ) - mt = mt.annotate_rows(info=mt.info.annotate(max_DS=hl.max(mt.info.SpliceAI))) - mt = mt.annotate_rows( - info=mt.info.annotate( - splice_consequence=hl.if_else( - mt.info.max_DS > 0, - consequences[mt.info.SpliceAI.index(mt.info.max_DS)], - "No consequence", - ) - ) - ) - return mt - - -def run(): - for version, config in CONFIG.items(): - logger.info("===> Version %s" % version) - mt = vcf_to_mt(config[0], config[1], version) - - # Write mt as a ht to the same directory as the snv source. 
- dest = os.path.join(os.path.dirname(CONFIG[version][0]), "spliceai_scores.ht") - logger.info("===> Writing to %s" % dest) - ht = mt.rows() - ht.write(dest) - ht.describe() - - -run() diff --git a/download_and_create_reference_datasets/v02/mito/utils.py b/download_and_create_reference_datasets/v02/mito/utils.py deleted file mode 100644 index f7517cc88..000000000 --- a/download_and_create_reference_datasets/v02/mito/utils.py +++ /dev/null @@ -1,92 +0,0 @@ -import argparse -import logging -import json -import tqdm -import tempfile -import os -import zipfile -import requests - -import hail as hl - -logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level='INFO') -logger = logging.getLogger(__name__) - - -def _download_file(url, to_dir=tempfile.gettempdir(), skip_verify=False): - if not (url and url.startswith(("http://", "https://"))): - raise ValueError("Invalid url: {}".format(url)) - - local_file_path = os.path.join(to_dir, os.path.basename(url.rstrip('/'))) - - if not skip_verify: - response = requests.head(url) - size = int(response.headers.get('Content-Length', '0')) - if os.path.isfile(local_file_path) and os.path.getsize(local_file_path) == size: - logger.info("Re-using {} previously downloaded from {}".format(local_file_path, url)) - return local_file_path - - is_gz = url.endswith(".gz") or url.endswith(".zip") - response = requests.get(url, stream=is_gz, verify=not skip_verify) - input_iter = response if is_gz else response.iter_content() - - logger.info("Downloading {} to {}".format(url, local_file_path)) - input_iter = tqdm.tqdm(input_iter, unit=" data" if is_gz else " lines") - - with open(local_file_path, 'wb') as f: - f.writelines(input_iter) - - input_iter.close() - - return local_file_path - - -def _convert_json_to_tsv(json_path): - with open(json_path, 'r') as f: - data = json.load(f) - tsv_path = f'{json_path[:-5]}.tsv' if json_path.endswith('.json') else f'{json_path}.tsv' - with open(tsv_path, 'w') as f: - header = 
'\t'.join(data[0].keys()) - f.write(header + '\n') - for row in data: - f.write('\t'.join([str(v) for v in row.values()]) + '\n') - return tsv_path - - -def _load_mito_ht(config, force_write=True): - logger.info(f'Downloading dataset from {config["input_path"]}.') - dn_path = _download_file(config['input_path'], skip_verify=config.get('skip_verify_ssl')) - - if dn_path.endswith('.zip'): - with zipfile.ZipFile(dn_path, 'r') as zip: - zip.extractall(path=os.path.dirname(dn_path)) - dn_path = dn_path[:-4] - - logger.info(f'Loading hail table from {dn_path}.') - types = config['field_types'] if config.get('field_types') else {} - if config['input_type'] == 'json': - tsv_path = _convert_json_to_tsv(dn_path) - ht = hl.import_table(tsv_path, types=types) - else: - ht = hl.import_table(dn_path, types=types) - - if config.get('annotate'): - ht = ht.annotate(**{field: func(ht) for field, func in config['annotate'].items()}) - - ht = ht.filter(ht.locus.contig == 'chrM') - - ht = ht.key_by('locus', 'alleles') - - logger.info(f'Writing hail table to {config["output_path"]}.') - ht.write(config['output_path'], overwrite=force_write) - logger.info('Done') - - -def load(config): - parser = argparse.ArgumentParser() - parser.add_argument('-f', '--force-write', help='Force write to an existing output file', action='store_true') - args = parser.parse_args() - - hl.init(default_reference='GRCh38') - - _load_mito_ht(config, args.force_write) diff --git a/download_and_create_reference_datasets/v02/mito/write_combined_mito_reference_data_hts.py b/download_and_create_reference_datasets/v02/mito/write_combined_mito_reference_data_hts.py deleted file mode 100644 index 185d352d6..000000000 --- a/download_and_create_reference_datasets/v02/mito/write_combined_mito_reference_data_hts.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import logging - -import hail as hl - -from v03_pipeline.lib.reference_data.dataset_table_operations import join_hts - -VERSION = '2.0.4' 
-OUTPUT_PATH = 'gs://seqr-reference-data/GRCh38/mitochondrial/all_mito_reference_data/combined_reference_data_chrM.ht' - -logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level='INFO') -logger = logging.getLogger(__name__) - - -def run(args): - # If there are out-of-memory errors, such as "java.lang.OutOfMemoryError: GC overhead limit exceeded" - # then you may need to set the environment variable with the following command - # $ export PYSPARK_SUBMIT_ARGS="--driver-memory 4G pyspark-shell" - # "4G" in the environment variable can be bigger if your computer has a larger memory. - # See more information in https://discuss.hail.is/t/java-heap-space-out-of-memory/1575/6 - hl.init(default_reference='GRCh38', min_block_size=128, master='local[32]') - - logger.info('Joining the mitochondrial reference datasets') - joined_ht = join_hts( - ['gnomad_mito', 'mitomap', 'mitimpact', 'hmtvar', 'helix_mito', 'dbnsfp_mito'], - VERSION, - reference_genome='38' - ) - - joined_ht = joined_ht.rename({'dbnsfp_mito': 'dbnsfp'}) - output_path = args.output_path if args.output_path else OUTPUT_PATH - logger.info(f'Writing to {output_path}') - joined_ht.write(output_path, overwrite=args.force_write) - logger.info('Done') - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument('-f', '--force-write', help='Force write to an existing output file', action='store_true') - parser.add_argument('-o', '--output-path', help=f'Output path for the combined reference dataset. 
Default is {OUTPUT_PATH}') - args = parser.parse_args() - - run(args) diff --git a/download_and_create_reference_datasets/v02/mito/write_mito_helix_ht.py b/download_and_create_reference_datasets/v02/mito/write_mito_helix_ht.py deleted file mode 100644 index f0f14a48e..000000000 --- a/download_and_create_reference_datasets/v02/mito/write_mito_helix_ht.py +++ /dev/null @@ -1,19 +0,0 @@ -import hail as hl - -from download_and_create_reference_datasets.v02.mito.utils import load - -CONFIG = { - 'input_path': 'https://helix-research-public.s3.amazonaws.com/mito/HelixMTdb_20200327.tsv', - 'input_type': 'tsv', - 'output_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/Helix/HelixMTdb_20200327.ht', - 'field_types': {'counts_hom': hl.tint32, 'AF_hom': hl.tfloat64, 'counts_het': hl.tint32, - 'AF_het': hl.tfloat64, 'max_ARF': hl.tfloat64, 'alleles': hl.tarray(hl.tstr)}, - 'annotate': { - 'locus': lambda ht: hl.locus('chrM', hl.parse_int32(ht.locus.split(':')[1])), - 'AN': lambda ht: hl.if_else(ht.AF_hom > 0, hl.int32(ht.counts_hom/ht.AF_hom), hl.int32(ht.counts_het/ht.AF_het)) - }, -} - - -if __name__ == "__main__": - load(CONFIG) diff --git a/download_and_create_reference_datasets/v02/mito/write_mito_hmtvar_ht.py b/download_and_create_reference_datasets/v02/mito/write_mito_hmtvar_ht.py deleted file mode 100644 index 05a9974bb..000000000 --- a/download_and_create_reference_datasets/v02/mito/write_mito_hmtvar_ht.py +++ /dev/null @@ -1,19 +0,0 @@ -import hail as hl - -from download_and_create_reference_datasets.v02.mito.utils import load - -CONFIG = { - 'input_path': 'https://www.hmtvar.uniba.it/api/main/', - 'input_type': 'json', - 'skip_verify_ssl': True, # The certificate of the website has expired. - 'output_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar Jan. 
10 2022.ht', - 'annotate': { - 'locus': lambda ht: hl.locus('chrM', hl.parse_int32(ht.nt_start)), - 'alleles': lambda ht: [ht.ref_rCRS, ht.alt], - 'disease_score': lambda ht: hl.parse_float(ht.disease_score), - }, -} - - -if __name__ == "__main__": - load(CONFIG) diff --git a/download_and_create_reference_datasets/v02/mito/write_mito_mitimpact_ht.py b/download_and_create_reference_datasets/v02/mito/write_mito_mitimpact_ht.py deleted file mode 100644 index 0e158b018..000000000 --- a/download_and_create_reference_datasets/v02/mito/write_mito_mitimpact_ht.py +++ /dev/null @@ -1,18 +0,0 @@ -import hail as hl - -from download_and_create_reference_datasets.v02.mito.utils import load - -CONFIG = { - 'input_path': 'https://mitimpact.css-mendel.it/cdn/MitImpact_db_3.1.3.txt.zip', - 'input_type': 'tsv', - 'output_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.1.3.ht', - 'annotate': { - 'locus': lambda ht: hl.locus('chrM', hl.parse_int32(ht.Start)), - 'alleles': lambda ht: [ht.Ref, ht.Alt], - 'APOGEE2_score': lambda ht: hl.parse_float(ht.APOGEE2_score), - }, -} - - -if __name__ == "__main__": - load(CONFIG) diff --git a/download_and_create_reference_datasets/v02/mito/write_mito_mitomap_ht.py b/download_and_create_reference_datasets/v02/mito/write_mito_mitomap_ht.py deleted file mode 100644 index 7ade39494..000000000 --- a/download_and_create_reference_datasets/v02/mito/write_mito_mitomap_ht.py +++ /dev/null @@ -1,20 +0,0 @@ -import hail as hl - -from download_and_create_reference_datasets.v02.mito.utils import load - -CONFIG = { - # The data source is https://www.mitomap.org/foswiki/bin/view/MITOMAP/ConfirmedMutations and it is a regular web - # page. So we download it manually and save the data to a file in tsv format. 
- 'input_path': 'https://storage.googleapis.com/seqr-reference-data/GRCh38/mitochondrial/MITOMAP/Mitomap%20Confirmed%20Mutations%20Feb.%2004%202022.tsv', - 'input_type': 'tsv', - 'output_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/Mitomap Confirmed Mutations Feb. 04 2022.ht', - 'annotate': { - 'locus': lambda ht: hl.locus('chrM', hl.parse_int32(ht.Allele.first_match_in('m.([0-9]+)')[0])), - 'alleles': lambda ht: ht.Allele.first_match_in('m.[0-9]+([ATGC]+)>([ATGC]+)'), - 'pathogenic': lambda ht: True - }, -} - - -if __name__ == "__main__": - load(CONFIG) diff --git a/requirements-dev.in b/requirements-dev.in index 79ef6c802..0c76ef725 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -6,3 +6,4 @@ pip-tools>=6.12.3 responses>=0.23.1 ruff>=0.1.8 shellcheck-py>=0.10.0 +pysam==0.22.1 diff --git a/requirements-dev.txt b/requirements-dev.txt index 017d42de4..65e628a44 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -67,6 +67,8 @@ pygments==2.18.0 # sphinx pyproject-hooks==1.0.0 # via build +pysam==0.22.1 + # via -r requirements-dev.in pyyaml==6.0.2 # via # -c requirements.txt diff --git a/v03_pipeline/bin/rsync_reference_data.bash b/v03_pipeline/bin/rsync_reference_data.bash index 825c583e5..db937c132 100755 --- a/v03_pipeline/bin/rsync_reference_data.bash +++ b/v03_pipeline/bin/rsync_reference_data.bash @@ -39,7 +39,7 @@ else fi fi -gsutil -m rsync -rd "gs://seqr-reference-data/v03/$REFERENCE_GENOME" $REFERENCE_DATASETS_DIR/$REFERENCE_GENOME +gsutil -m rsync -rd "gs://seqr-reference-data/v3.1/$REFERENCE_GENOME" $REFERENCE_DATASETS_DIR/$REFERENCE_GENOME if ! 
[[ $REFERENCE_DATASETS_DIR =~ gs://* ]]; then touch "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS else diff --git a/v03_pipeline/lib/annotations/enums.py b/v03_pipeline/lib/annotations/enums.py index 16033214f..35a61179e 100644 --- a/v03_pipeline/lib/annotations/enums.py +++ b/v03_pipeline/lib/annotations/enums.py @@ -206,6 +206,21 @@ 'NEAREST_TSS', ] +CLINVAR_ASSERTIONS = [ + 'Affects', + 'association', + 'association_not_found', + 'confers_sensitivity', + 'drug_response', + 'low_penetrance', + 'not_provided', + 'other', + 'protective', + 'risk_factor', + 'no_classification_for_the_single_variant', + 'no_classifications_from_unflagged_records', +] + CLINVAR_DEFAULT_PATHOGENICITY = 'No_pathogenic_assertion' # NB: sorted by pathogenicity diff --git a/v03_pipeline/lib/annotations/fields_test.py b/v03_pipeline/lib/annotations/fields_test.py index 904369e23..fcb8a11d3 100644 --- a/v03_pipeline/lib/annotations/fields_test.py +++ b/v03_pipeline/lib/annotations/fields_test.py @@ -6,32 +6,39 @@ from v03_pipeline.lib.annotations.fields import get_fields from v03_pipeline.lib.model import ( DatasetType, - ReferenceDatasetCollection, ReferenceGenome, ) -from v03_pipeline.lib.paths import valid_reference_dataset_collection_path +from v03_pipeline.lib.paths import valid_reference_dataset_path +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase from v03_pipeline.lib.vep import run_vep from v03_pipeline.var.test.vep.mock_vep_data import MOCK_37_VEP_DATA, MOCK_38_VEP_DATA -TEST_INTERVAL_1 = 'v03_pipeline/var/test/reference_data/test_interval_1.ht' GRCH37_TO_GRCH38_LIFTOVER_REF_PATH = ( 'v03_pipeline/var/test/liftover/grch37_to_grch38.over.chain.gz' ) GRCH38_TO_GRCH37_LIFTOVER_REF_PATH = ( 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz' ) +TEST_GNOMAD_NONCODING_CONSTRAINT_38_HT = 
'v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht' +TEST_SCREEN_38_HT = 'v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht' class FieldsTest(MockedDatarootTestCase): def setUp(self) -> None: super().setUp() shutil.copytree( - TEST_INTERVAL_1, - valid_reference_dataset_collection_path( + TEST_GNOMAD_NONCODING_CONSTRAINT_38_HT, + valid_reference_dataset_path( ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.INTERVAL, + ReferenceDataset.gnomad_non_coding_constraint, + ), + ) + shutil.copytree( + TEST_SCREEN_38_HT, + valid_reference_dataset_path( + ReferenceGenome.GRCh38, + ReferenceDataset.screen, ), ) @@ -120,18 +127,17 @@ def test_get_formatting_fields(self, mock_vep: Mock) -> None: reference_genome, ), **{ - f'{rdc.value}_ht': hl.read_table( - valid_reference_dataset_collection_path( + f'{reference_dataset}_ht': hl.read_table( + valid_reference_dataset_path( reference_genome, - DatasetType.SNV_INDEL, - rdc, + reference_dataset, ), ) - for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type( + for reference_dataset in ReferenceDataset.for_reference_genome_dataset_type_annotations( reference_genome, DatasetType.SNV_INDEL, ) - if rdc.requires_annotation + if reference_dataset.is_keyed_by_interval }, **( { diff --git a/v03_pipeline/lib/annotations/mito.py b/v03_pipeline/lib/annotations/mito.py index 49ed0c108..dbf483685 100644 --- a/v03_pipeline/lib/annotations/mito.py +++ b/v03_pipeline/lib/annotations/mito.py @@ -47,14 +47,6 @@ def HL(mt: hl.MatrixTable, **_: Any) -> hl.Expression: # noqa: N802 return hl.if_else(is_called, mt.HL, 0) -def high_constraint_region_mito( - ht: hl.Table, - interval_ht: hl.Table, - **_: Any, -) -> hl.Expression: - return hl.is_defined(interval_ht[ht.locus]) - - def mito_cn(mt: hl.MatrixTable, **_: Any) -> hl.Expression: return hl.int32(mt.mito_cn) diff --git a/v03_pipeline/lib/annotations/rdc_dependencies.py 
b/v03_pipeline/lib/annotations/rdc_dependencies.py deleted file mode 100644 index 1f1e13d7a..000000000 --- a/v03_pipeline/lib/annotations/rdc_dependencies.py +++ /dev/null @@ -1,29 +0,0 @@ -import hail as hl - -from v03_pipeline.lib.model import ( - DatasetType, - ReferenceDatasetCollection, - ReferenceGenome, -) -from v03_pipeline.lib.paths import ( - valid_reference_dataset_collection_path, -) - - -def get_rdc_annotation_dependencies( - dataset_type: DatasetType, - reference_genome: ReferenceGenome, -) -> dict[str, hl.Table]: - deps = {} - for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type( - reference_genome, - dataset_type, - ): - deps[f'{rdc.value}_ht'] = hl.read_table( - valid_reference_dataset_collection_path( - reference_genome, - dataset_type, - rdc, - ), - ) - return deps diff --git a/v03_pipeline/lib/annotations/snv_indel.py b/v03_pipeline/lib/annotations/snv_indel.py index 44a26b044..de1a561c4 100644 --- a/v03_pipeline/lib/annotations/snv_indel.py +++ b/v03_pipeline/lib/annotations/snv_indel.py @@ -73,16 +73,16 @@ def gt_stats( def gnomad_non_coding_constraint( ht: hl.Table, - interval_ht: hl.Table, + gnomad_non_coding_constraint_ht: hl.Table, **_: Any, ) -> hl.Expression: return hl.Struct( z_score=( - interval_ht.index(ht.locus, all_matches=True) + gnomad_non_coding_constraint_ht.index(ht.locus, all_matches=True) .filter( - lambda x: hl.is_defined(x.gnomad_non_coding_constraint['z_score']), + lambda x: hl.is_defined(x['z_score']), ) - .gnomad_non_coding_constraint.z_score.first() + .z_score.first() ), ) @@ -98,16 +98,16 @@ def rg38_locus( def screen( ht: hl.Table, - interval_ht: hl.Table, + screen_ht: hl.Table, **_: Any, ) -> hl.Expression: return hl.Struct( region_type_ids=( - interval_ht.index( + screen_ht.index( ht.locus, all_matches=True, ).flatmap( - lambda x: x.screen['region_type_ids'], + lambda x: x['region_type_ids'], ) ), ) diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py index 
5f5e4400c..77448bb03 100644 --- a/v03_pipeline/lib/misc/validation.py +++ b/v03_pipeline/lib/misc/validation.py @@ -3,17 +3,11 @@ import hail as hl from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, DatasetType, - Env, ReferenceGenome, SampleType, Sex, ) -from v03_pipeline.lib.paths import ( - cached_reference_dataset_query_path, - sex_check_table_path, -) AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown. MIN_ROWS_PER_CONTIG = 100 @@ -24,36 +18,6 @@ class SeqrValidationError(Exception): pass -def get_validation_dependencies( - dataset_type: DatasetType, - reference_genome: ReferenceGenome, - callset_path: str, - skip_check_sex_and_relatedness: bool, - **_: Any, -) -> dict[str, hl.Table]: - deps = {} - deps['coding_and_noncoding_variants_ht'] = hl.read_table( - cached_reference_dataset_query_path( - reference_genome, - dataset_type, - CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, - ), - ) - if ( - Env.CHECK_SEX_AND_RELATEDNESS - and dataset_type.check_sex_and_relatedness - and not skip_check_sex_and_relatedness - ): - deps['sex_check_ht'] = hl.read_table( - sex_check_table_path( - reference_genome, - dataset_type, - callset_path, - ), - ) - return deps - - def validate_allele_type( mt: hl.MatrixTable, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/misc/validation_test.py b/v03_pipeline/lib/misc/validation_test.py index d7b318f18..f32900a99 100644 --- a/v03_pipeline/lib/misc/validation_test.py +++ b/v03_pipeline/lib/misc/validation_test.py @@ -1,5 +1,4 @@ import unittest -from unittest.mock import Mock, patch import hail as hl @@ -111,8 +110,7 @@ def test_validate_allele_type(self) -> None: DatasetType.SNV_INDEL, ) - @patch('v03_pipeline.lib.misc.validation.Env') - def test_validate_imputed_sex_ploidy(self, mock_env: Mock) -> None: + def test_validate_imputed_sex_ploidy(self) -> None: female_sample = 'HG00731_1' male_sample_1 = 'HG00732_1' 
male_sample_2 = 'HG00732_1' @@ -121,7 +119,6 @@ def test_validate_imputed_sex_ploidy(self, mock_env: Mock) -> None: xyy_sample = 'NA20891_1' xxx_sample = 'NA20892_1' - mock_env.CHECK_SEX_AND_RELATEDNESS = True sex_check_ht = hl.read_table(TEST_SEX_CHECK_1) # All calls on X chromosome are valid diff --git a/v03_pipeline/lib/model/__init__.py b/v03_pipeline/lib/model/__init__.py index 1e6605876..bb4325d47 100644 --- a/v03_pipeline/lib/model/__init__.py +++ b/v03_pipeline/lib/model/__init__.py @@ -1,6 +1,3 @@ -from v03_pipeline.lib.model.cached_reference_dataset_query import ( - CachedReferenceDatasetQuery, -) from v03_pipeline.lib.model.dataset_type import DatasetType from v03_pipeline.lib.model.definitions import ( AccessControl, @@ -10,18 +7,13 @@ Sex, ) from v03_pipeline.lib.model.environment import Env -from v03_pipeline.lib.model.reference_dataset_collection import ( - ReferenceDatasetCollection, -) __all__ = [ 'AccessControl', - 'CachedReferenceDatasetQuery', 'DatasetType', 'Env', 'Sex', 'PipelineVersion', - 'ReferenceDatasetCollection', 'ReferenceGenome', 'SampleType', ] diff --git a/v03_pipeline/lib/model/cached_reference_dataset_query.py b/v03_pipeline/lib/model/cached_reference_dataset_query.py deleted file mode 100644 index b950be51b..000000000 --- a/v03_pipeline/lib/model/cached_reference_dataset_query.py +++ /dev/null @@ -1,65 +0,0 @@ -from collections.abc import Callable -from enum import Enum - -import hail as hl - -from v03_pipeline.lib.model.dataset_type import DatasetType -from v03_pipeline.lib.model.definitions import ReferenceGenome -from v03_pipeline.lib.model.reference_dataset_collection import ( - ReferenceDatasetCollection, -) -from v03_pipeline.lib.reference_data.queries import ( - clinvar_path_variants, - gnomad_coding_and_noncoding_variants, - gnomad_qc, - high_af_variants, -) - - -class CachedReferenceDatasetQuery(str, Enum): - CLINVAR_PATH_VARIANTS = 'clinvar_path_variants' - GNOMAD_CODING_AND_NONCODING_VARIANTS = 
'gnomad_coding_and_noncoding_variants' - GNOMAD_QC = 'gnomad_qc' - HIGH_AF_VARIANTS = 'high_af_variants' - - def dataset(self, dataset_type: DatasetType) -> str | None: - return { - CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS: 'clinvar_mito' - if dataset_type == DatasetType.MITO - else 'clinvar', - CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS: 'gnomad_genomes', - CachedReferenceDatasetQuery.GNOMAD_QC: 'gnomad_qc', - CachedReferenceDatasetQuery.HIGH_AF_VARIANTS: 'gnomad_genomes', - }.get(self) - - @property - def reference_dataset_collection(self) -> ReferenceDatasetCollection: - return { - CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS: ReferenceDatasetCollection.COMBINED, - CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS: None, - CachedReferenceDatasetQuery.GNOMAD_QC: None, - CachedReferenceDatasetQuery.HIGH_AF_VARIANTS: ReferenceDatasetCollection.COMBINED, - }[self] - - @property - def query(self) -> Callable[[hl.Table, ReferenceGenome], hl.Table]: - return { - CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS: clinvar_path_variants, - CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS: gnomad_coding_and_noncoding_variants, - CachedReferenceDatasetQuery.GNOMAD_QC: gnomad_qc, - CachedReferenceDatasetQuery.HIGH_AF_VARIANTS: high_af_variants, - }[self] - - @classmethod - def for_reference_genome_dataset_type( - cls, - reference_genome: ReferenceGenome, - dataset_type: DatasetType, - ) -> list['CachedReferenceDatasetQuery']: - return { - (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL): list(cls), - (ReferenceGenome.GRCh38, DatasetType.MITO): [ - CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - ], - (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL): list(cls), - }.get((reference_genome, dataset_type), []) diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index 8bad768bb..281cffd9b 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ 
b/v03_pipeline/lib/model/dataset_type.py @@ -246,7 +246,6 @@ def formatting_annotation_fns( DatasetType.MITO: [ mito.common_low_heteroplasmy, mito.haplogroup, - mito.high_constraint_region_mito, mito.mitotip, mito.rsid, shared.variant_id, diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index 71e8b2821..a87437c8f 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -63,6 +63,13 @@ def optional_contigs(self) -> set[str]: }, }[self] + @property + def mito_contig(self) -> str: + return { + ReferenceGenome.GRCh37: 'MT', + ReferenceGenome.GRCh38: 'chrM', + }[self] + def contig_recoding(self, include_mt: bool = False) -> dict[str, str]: recode = { ReferenceGenome.GRCh37: { diff --git a/v03_pipeline/lib/model/reference_dataset_collection.py b/v03_pipeline/lib/model/reference_dataset_collection.py deleted file mode 100644 index d05e1e328..000000000 --- a/v03_pipeline/lib/model/reference_dataset_collection.py +++ /dev/null @@ -1,110 +0,0 @@ -from enum import Enum - -import hail as hl - -from v03_pipeline.lib.model.dataset_type import DatasetType -from v03_pipeline.lib.model.definitions import AccessControl, ReferenceGenome -from v03_pipeline.lib.model.environment import Env - - -class ReferenceDatasetCollection(str, Enum): - COMBINED = 'combined' - HGMD = 'hgmd' - INTERVAL = 'interval' - - @property - def access_control(self) -> AccessControl: - if self == ReferenceDatasetCollection.HGMD: - return AccessControl.PRIVATE - return AccessControl.PUBLIC - - @property - def requires_annotation(self) -> bool: - return self == ReferenceDatasetCollection.INTERVAL - - def datasets(self, dataset_type: DatasetType) -> list[str]: - return { - (ReferenceDatasetCollection.COMBINED, DatasetType.SNV_INDEL): [ - 'cadd', - 'clinvar', - 'dbnsfp', - 'eigen', - 'exac', - 'gnomad_exomes', - 'gnomad_genomes', - 'mpc', - 'primate_ai', - 'splice_ai', - 'topmed', - ], - (ReferenceDatasetCollection.COMBINED, 
DatasetType.MITO): [ - 'clinvar_mito', - 'dbnsfp_mito', - 'gnomad_mito', - 'helix_mito', - 'hmtvar', - 'mitomap', - 'mitimpact', - 'local_constraint_mito', - ], - (ReferenceDatasetCollection.HGMD, DatasetType.SNV_INDEL): ['hgmd'], - (ReferenceDatasetCollection.INTERVAL, DatasetType.SNV_INDEL): [ - 'gnomad_non_coding_constraint', - 'screen', - ], - (ReferenceDatasetCollection.INTERVAL, DatasetType.MITO): [ - 'high_constraint_region_mito', - ], - }.get((self, dataset_type), []) - - def table_key_type( - self, - reference_genome: ReferenceGenome, - ) -> hl.tstruct: - default_key = hl.tstruct( - locus=hl.tlocus(reference_genome.value), - alleles=hl.tarray(hl.tstr), - ) - return { - ReferenceDatasetCollection.INTERVAL: hl.tstruct( - interval=hl.tinterval(hl.tlocus(reference_genome.value)), - ), - }.get(self, default_key) - - @classmethod - def for_reference_genome_dataset_type( - cls, - reference_genome: ReferenceGenome, - dataset_type: DatasetType, - ) -> list['ReferenceDatasetCollection']: - rdcs = { - (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL): [ - ReferenceDatasetCollection.COMBINED, - ReferenceDatasetCollection.INTERVAL, - ReferenceDatasetCollection.HGMD, - ], - (ReferenceGenome.GRCh38, DatasetType.MITO): [ - ReferenceDatasetCollection.COMBINED, - ReferenceDatasetCollection.INTERVAL, - ], - (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL): [ - ReferenceDatasetCollection.COMBINED, - ReferenceDatasetCollection.HGMD, - ], - }.get((reference_genome, dataset_type), []) - if not Env.ACCESS_PRIVATE_REFERENCE_DATASETS: - return [rdc for rdc in rdcs if rdc.access_control == AccessControl.PUBLIC] - return rdcs - - @classmethod - def for_dataset( - cls, - dataset: str, - dataset_type: DatasetType, - ) -> 'ReferenceDatasetCollection': - for rdc in cls: - if dataset in rdc.datasets(dataset_type): - return rdc - - err_msg = f'Dataset "{dataset}" not found in any reference dataset collection' - raise ValueError(err_msg) diff --git a/v03_pipeline/lib/paths.py 
b/v03_pipeline/lib/paths.py index 0ae158866..68d27bba2 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -4,14 +4,16 @@ from v03_pipeline.lib.model import ( AccessControl, - CachedReferenceDatasetQuery, DatasetType, Env, PipelineVersion, - ReferenceDatasetCollection, ReferenceGenome, SampleType, ) +from v03_pipeline.lib.reference_datasets.reference_dataset import ( + ReferenceDataset, + ReferenceDatasetQuery, +) def _pipeline_prefix( @@ -57,19 +59,24 @@ def _v03_reference_data_prefix( ) -def cached_reference_dataset_query_path( +def _v03_reference_dataset_prefix( + access_control: AccessControl, reference_genome: ReferenceGenome, - dataset_type: DatasetType, - cached_reference_dataset_query: CachedReferenceDatasetQuery, ) -> str: + root = ( + Env.PRIVATE_REFERENCE_DATASETS_DIR + if access_control == AccessControl.PRIVATE + else Env.REFERENCE_DATASETS_DIR + ) + if Env.INCLUDE_PIPELINE_VERSION_IN_PREFIX: + return os.path.join( + root, + PipelineVersion.V3_1.value, + reference_genome.value, + ) return os.path.join( - _v03_reference_data_prefix( - AccessControl.PUBLIC, - reference_genome, - dataset_type, - ), - 'cached_reference_dataset_queries', - f'{cached_reference_dataset_query.value}.ht', + root, + reference_genome.value, ) @@ -283,24 +290,32 @@ def valid_filters_path( ) -def valid_reference_dataset_collection_path( +def valid_reference_dataset_path( + reference_genome: ReferenceGenome, + reference_dataset: ReferenceDataset, +) -> str | None: + return os.path.join( + _v03_reference_dataset_prefix( + reference_dataset.access_control, + reference_genome, + ), + f'{reference_dataset.value}', + f'{reference_dataset.version(reference_genome)}.ht', + ) + + +def valid_reference_dataset_query_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, - reference_dataset_collection: ReferenceDatasetCollection, + reference_dataset_query: ReferenceDatasetQuery, ) -> str | None: - if ( - not Env.ACCESS_PRIVATE_REFERENCE_DATASETS - and 
reference_dataset_collection.access_control == AccessControl.PRIVATE - ): - return None return os.path.join( - _v03_reference_data_prefix( - reference_dataset_collection.access_control, + _v03_reference_dataset_prefix( + reference_dataset_query.access_control, reference_genome, - dataset_type, ), - 'reference_datasets', - f'{reference_dataset_collection.value}.ht', + dataset_type.value, + f'{reference_dataset_query.value}.ht', ) diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index 28de9567b..f49e62b61 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -2,14 +2,11 @@ from unittest.mock import patch from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, DatasetType, - ReferenceDatasetCollection, ReferenceGenome, SampleType, ) from v03_pipeline.lib.paths import ( - cached_reference_dataset_query_path, family_table_path, imported_callset_path, imputed_sex_path, @@ -23,23 +20,12 @@ remapped_and_subsetted_callset_path, sex_check_table_path, valid_filters_path, - valid_reference_dataset_collection_path, validation_errors_for_run_path, variant_annotations_table_path, ) class TestPaths(unittest.TestCase): - def test_cached_reference_dataset_query_path(self) -> None: - self.assertEqual( - cached_reference_dataset_query_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - ), - '/var/seqr/seqr-reference-data/v03/GRCh38/SNV_INDEL/cached_reference_dataset_queries/clinvar_path_variants.ht', - ) - def test_family_table_path(self) -> None: self.assertEqual( family_table_path( @@ -103,26 +89,6 @@ def test_project_table_path(self) -> None: '/var/seqr/seqr-hail-search-data/v3.1/GRCh38/MITO/projects/WES/R0652_pipeline_test.ht', ) - def test_valid_reference_dataset_collection_path(self) -> None: - with patch('v03_pipeline.lib.paths.Env') as mock_env: - mock_env.ACCESS_PRIVATE_REFERENCE_DATASETS = False - self.assertEqual( - 
valid_reference_dataset_collection_path( - ReferenceGenome.GRCh37, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.HGMD, - ), - None, - ) - self.assertEqual( - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.HGMD, - ), - '/var/seqr/seqr-reference-data-private/v03/GRCh38/SNV_INDEL/reference_datasets/hgmd.ht', - ) - def test_lookup_table_path(self) -> None: self.assertEqual( lookup_table_path( diff --git a/v03_pipeline/lib/reference_data/clinvar.py b/v03_pipeline/lib/reference_data/clinvar.py deleted file mode 100644 index 3e482e0b6..000000000 --- a/v03_pipeline/lib/reference_data/clinvar.py +++ /dev/null @@ -1,214 +0,0 @@ -import gzip -import os -import shutil -import tempfile -import urllib - -import hail as hl -import hailtop.fs as hfs -import requests - -from v03_pipeline.lib.annotations.enums import CLINVAR_PATHOGENICITIES_LOOKUP -from v03_pipeline.lib.logger import get_logger -from v03_pipeline.lib.misc.io import write -from v03_pipeline.lib.model import Env -from v03_pipeline.lib.model.definitions import ReferenceGenome -from v03_pipeline.lib.paths import clinvar_dataset_path - -CLINVAR_ASSERTIONS = [ - 'Affects', - 'association', - 'association_not_found', - 'confers_sensitivity', - 'drug_response', - 'low_penetrance', - 'not_provided', - 'other', - 'protective', - 'risk_factor', - 'no_classification_for_the_single_variant', - 'no_classifications_from_unflagged_records', -] -CLINVAR_GOLD_STARS_LOOKUP = hl.dict( - { - 'no_classification_for_the_single_variant': 0, - 'no_classification_provided': 0, - 'no_assertion_criteria_provided': 0, - 'no_classifications_from_unflagged_records': 0, - 'criteria_provided,_single_submitter': 1, - 'criteria_provided,_conflicting_classifications': 1, - 'criteria_provided,_multiple_submitters,_no_conflicts': 2, - 'reviewed_by_expert_panel': 3, - 'practice_guideline': 4, - }, -) -CLINVAR_SUBMISSION_SUMMARY_URL = ( - 
'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz' -) -MIN_HT_PARTITIONS = 2000 -logger = get_logger(__name__) - - -def parsed_clnsig(ht: hl.Table): - return ( - hl.delimit(ht.info.CLNSIG) - .replace( - 'Likely_pathogenic,_low_penetrance', - 'Likely_pathogenic|low_penetrance', - ) - .replace( - '/Pathogenic,_low_penetrance/Established_risk_allele', - '/Established_risk_allele|low_penetrance', - ) - .replace( - '/Pathogenic,_low_penetrance', - '|low_penetrance', - ) - .split(r'\|') - ) - - -def parse_to_count(entry: str): - splt = entry.split( - r'\(', - ) # pattern, count = entry... if destructuring worked on a hail expression! - return hl.Struct( - pathogenicity_id=CLINVAR_PATHOGENICITIES_LOOKUP[splt[0]], - count=hl.int32(splt[1][:-1]), - ) - - -def parsed_and_mapped_clnsigconf(ht: hl.Table): - return ( - hl.delimit(ht.info.CLNSIGCONF) - .replace(',_low_penetrance', '') - .split(r'\|') - .map(parse_to_count) - .group_by(lambda x: x.pathogenicity_id) - .map_values( - lambda values: ( - values.fold( - lambda x, y: x + y.count, - 0, - ) - ), - ) - .items() - .map(lambda x: hl.Struct(pathogenicity_id=x[0], count=x[1])) - ) - - -def get_clinvar_ht( - clinvar_url: str, - reference_genome: ReferenceGenome, -): - etag = requests.head(clinvar_url, timeout=10).headers.get('ETag').strip('"') - clinvar_ht_path = clinvar_dataset_path(reference_genome, etag) - if hfs.exists(clinvar_ht_path): - logger.info(f'Try using cached clinvar ht with etag {etag}') - ht = hl.read_table(clinvar_ht_path) - else: - logger.info('Cached clinvar ht not found, downloading latest clinvar vcf') - hl._set_flags(use_new_shuffle=None, no_whole_stage_codegen='1') # noqa: SLF001 - ht = download_and_import_latest_clinvar_vcf(clinvar_url, reference_genome) - write(ht, clinvar_ht_path, repartition=False) - hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1') # noqa: SLF001 - return ht - - -def download_and_import_latest_clinvar_vcf( - clinvar_url: str, - 
reference_genome: ReferenceGenome, -) -> hl.Table: - version = parse_clinvar_release_date(clinvar_url) - with tempfile.NamedTemporaryFile(suffix='.vcf.gz', delete=False) as tmp_file: - urllib.request.urlretrieve(clinvar_url, tmp_file.name) # noqa: S310 - cached_tmp_file_name = os.path.join( - Env.HAIL_TMP_DIR, - os.path.basename(tmp_file.name), - ) - # In cases where HAIL_TMP_DIR is a remote path, copy the - # file there. If it's local, do nothing. - if tmp_file.name != cached_tmp_file_name: - hfs.copy(tmp_file.name, cached_tmp_file_name) - mt = hl.import_vcf( - cached_tmp_file_name, - reference_genome=reference_genome.value, - drop_samples=True, - skip_invalid_loci=True, - contig_recoding=reference_genome.contig_recoding(include_mt=True), - min_partitions=MIN_HT_PARTITIONS, - force_bgz=True, - ) - mt = mt.annotate_globals(version=version) - return join_to_submission_summary_ht(mt.rows()) - - -def parse_clinvar_release_date(clinvar_url: str) -> str: - response = requests.get(clinvar_url, stream=True, timeout=10) - for byte_line in gzip.GzipFile(fileobj=response.raw): - line = byte_line.decode('ascii').strip() - if not line: - continue - if line.startswith('##fileDate='): - return line.split('=')[-1].strip() - if not line.startswith('#'): - return None - return None - - -def join_to_submission_summary_ht(vcf_ht: hl.Table) -> hl.Table: - # https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/README - submission_summary.txt - logger.info('Getting clinvar submission summary from NCBI FTP server') - ht = download_and_import_clinvar_submission_summary() - return vcf_ht.annotate( - submitters=ht[vcf_ht.rsid].Submitters, - conditions=ht[vcf_ht.rsid].Conditions, - ) - - -def download_and_import_clinvar_submission_summary() -> hl.Table: - with tempfile.NamedTemporaryFile( - suffix='.txt.gz', - delete=False, - ) as tmp_file, tempfile.NamedTemporaryFile( - suffix='.txt', - delete=False, - ) as unzipped_tmp_file: - urllib.request.urlretrieve(CLINVAR_SUBMISSION_SUMMARY_URL, 
tmp_file.name) # noqa: S310 - # Unzip the gzipped file first to fix gzip files being read by hail with single partition - with gzip.open(tmp_file.name, 'rb') as f_in, open( - unzipped_tmp_file.name, - 'wb', - ) as f_out: - shutil.copyfileobj(f_in, f_out) - cached_tmp_file_name = os.path.join( - Env.HAIL_TMP_DIR, - os.path.basename(unzipped_tmp_file.name), - ) - # In cases where HAIL_TMP_DIR is a remote path, copy the - # file there. If it's local, do nothing. - if unzipped_tmp_file.name != cached_tmp_file_name: - hfs.copy(unzipped_tmp_file.name, cached_tmp_file_name) - return import_submission_table(cached_tmp_file_name) - - -def import_submission_table(file_name: str) -> hl.Table: - ht = hl.import_table( - file_name, - force=True, - filter='^(#[^:]*:|^##).*$', # removes all comments except for the header line - types={ - '#VariationID': hl.tstr, - 'Submitter': hl.tstr, - 'ReportedPhenotypeInfo': hl.tstr, - }, - missing='-', - min_partitions=MIN_HT_PARTITIONS, - ) - ht = ht.rename({'#VariationID': 'VariationID'}) - ht = ht.select('VariationID', 'Submitter', 'ReportedPhenotypeInfo') - return ht.group_by('VariationID').aggregate( - Submitters=hl.agg.collect(ht.Submitter), - Conditions=hl.agg.collect(ht.ReportedPhenotypeInfo), - ) diff --git a/v03_pipeline/lib/reference_data/clinvar_test.py b/v03_pipeline/lib/reference_data/clinvar_test.py deleted file mode 100644 index fd8d4e832..000000000 --- a/v03_pipeline/lib/reference_data/clinvar_test.py +++ /dev/null @@ -1,281 +0,0 @@ -import gzip -import unittest -from unittest import mock - -import hail as hl -import responses - -from v03_pipeline.lib.reference_data.clinvar import ( - import_submission_table, - join_to_submission_summary_ht, - parse_clinvar_release_date, - parsed_and_mapped_clnsigconf, - parsed_clnsig, -) - -CLINVAR_VCF_DATA = b""" -##fileformat=VCFv4.1 -##fileDate=2024-10-27 -##source=ClinVar -##reference=GRCh37 -##ID= -##INFO= -""" - - -class ClinvarTest(unittest.TestCase): - @responses.activate - def 
test_parse_clinvar_release_date(self): - clinvar_url = ( - 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz' - ) - responses.get( - clinvar_url, - body=gzip.compress(CLINVAR_VCF_DATA), - ) - self.assertEqual( - parse_clinvar_release_date(clinvar_url), - '2024-10-27', - ) - - def test_parsed_clnsig(self): - ht = hl.Table.parallelize( - [ - {'info': hl.Struct(CLNSIG=['Pathogenic|Affects'])}, - { - 'info': hl.Struct( - CLNSIG=[ - 'Pathogenic/Likely_pathogenic/Pathogenic', - '_low_penetrance', - ], - ), - }, - { - 'info': hl.Struct( - CLNSIG=[ - 'Likely_pathogenic/Pathogenic', - '_low_penetrance|association|protective', - ], - ), - }, - {'info': hl.Struct(CLNSIG=['Likely_pathogenic', '_low_penetrance'])}, - {'info': hl.Struct(CLNSIG=['association|protective'])}, - { - 'info': hl.Struct( - CLNSIG=[ - 'Pathogenic/Likely_pathogenic/Pathogenic', - '_low_penetrance/Established_risk_allele', - ], - ), - }, - ], - hl.tstruct(info=hl.tstruct(CLNSIG=hl.tarray(hl.tstr))), - ) - self.assertListEqual( - parsed_clnsig(ht).collect(), - [ - ['Pathogenic', 'Affects'], - ['Pathogenic/Likely_pathogenic', 'low_penetrance'], - ['Likely_pathogenic', 'low_penetrance', 'association', 'protective'], - ['Likely_pathogenic', 'low_penetrance'], - ['association', 'protective'], - [ - 'Pathogenic/Likely_pathogenic/Established_risk_allele', - 'low_penetrance', - ], - ], - ) - - def test_parsed_and_mapped_clnsigconf(self): - ht = hl.Table.parallelize( - [ - {'info': hl.Struct(CLNSIGCONF=hl.missing(hl.tarray(hl.tstr)))}, - { - 'info': hl.Struct( - CLNSIGCONF=[ - 'Pathogenic(8)|Likely_pathogenic(2)|Pathogenic', - '_low_penetrance(1)|Uncertain_significance(1)', - ], - ), - }, - ], - hl.tstruct(info=hl.tstruct(CLNSIGCONF=hl.tarray(hl.tstr))), - ) - self.assertListEqual( - parsed_and_mapped_clnsigconf(ht).collect(), - [ - None, - [ - hl.Struct(count=9, pathogenicity_id=0), - hl.Struct(count=2, pathogenicity_id=5), - hl.Struct(count=1, pathogenicity_id=12), - ], - ], - ) - - 
@mock.patch( - 'v03_pipeline.lib.reference_data.clinvar.hl.import_table', - ) - def test_import_submission_table(self, mock_import_table): - mock_import_table.return_value = hl.Table.parallelize( - [ - { - '#VariationID': '5', - 'Submitter': 'OMIM', - 'ReportedPhenotypeInfo': 'C3661900:not provided', - }, - { - '#VariationID': '5', - 'Submitter': 'Broad Institute Rare Disease Group, Broad Institute', - 'ReportedPhenotypeInfo': 'C0023264:Leigh syndrome', - }, - { - '#VariationID': '5', - 'Submitter': 'PreventionGenetics, part of Exact Sciences', - 'ReportedPhenotypeInfo': 'na:FOXRED1-related condition', - }, - { - '#VariationID': '5', - 'Submitter': 'Invitae', - 'ReportedPhenotypeInfo': 'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19', - }, - { - '#VariationID': '6', - 'Submitter': 'A', - 'ReportedPhenotypeInfo': 'na:B', - }, - ], - ) - ht = import_submission_table('mock_file_name') - self.assertEqual( - ht.collect(), - [ - hl.Struct( - VariationID='5', - Submitters=[ - 'OMIM', - 'Broad Institute Rare Disease Group, Broad Institute', - 'PreventionGenetics, part of Exact Sciences', - 'Invitae', - ], - Conditions=[ - 'C3661900:not provided', - 'C0023264:Leigh syndrome', - 'na:FOXRED1-related condition', - 'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19', - ], - ), - hl.Struct( - VariationID='6', - Submitters=['A'], - Conditions=['na:B'], - ), - ], - ) - - @mock.patch( - 'v03_pipeline.lib.reference_data.clinvar.download_and_import_clinvar_submission_summary', - ) - def test_join_to_submission_summary_ht( - self, - mock_download, - ): - vcf_ht = hl.Table.parallelize( - [ - { - 'locus': hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - 'rsid': '5', - 'info': hl.Struct(ALLELEID=1), - }, - { - 'locus': hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'AC'], - 'rsid': '7', - 'info': hl.Struct(ALLELEID=1), - }, - ], - hl.tstruct( - 
locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - rsid=hl.tstr, - info=hl.tstruct(ALLELEID=hl.tint32), - ), - ) - submitters_ht = hl.Table.parallelize( - [ - { - 'VariationID': '5', - 'Submitters': [ - 'OMIM', - 'Broad Institute Rare Disease Group, Broad Institute', - 'PreventionGenetics, part of Exact Sciences', - 'Invitae', - ], - 'Conditions': [ - 'C3661900:not provided', - 'C0023264:Leigh syndrome', - 'na:FOXRED1-related condition', - 'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19', - ], - }, - {'VariationID': '6', 'Submitters': ['A'], 'Conditions': ['na:B']}, - ], - hl.tstruct( - VariationID=hl.tstr, - Submitters=hl.tarray(hl.tstr), - Conditions=hl.tarray(hl.tstr), - ), - key='VariationID', - ) - expected_clinvar_ht_rows = [ - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - rsid='5', - info=hl.Struct(ALLELEID=1), - submitters=[ - 'OMIM', - 'Broad Institute Rare Disease Group, Broad Institute', - 'PreventionGenetics, part of Exact Sciences', - 'Invitae', - ], - conditions=[ - 'C3661900:not provided', - 'C0023264:Leigh syndrome', - 'na:FOXRED1-related condition', - 'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19', - ], - ), - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - alleles=['A', 'AC'], - rsid='7', - info=hl.Struct(ALLELEID=1), - submitters=None, - conditions=None, - ), - ] - - mock_download.return_value = submitters_ht - ht = join_to_submission_summary_ht(vcf_ht) - self.assertEqual( - ht.collect(), - expected_clinvar_ht_rows, - ) diff --git a/v03_pipeline/lib/reference_data/compare_globals.py b/v03_pipeline/lib/reference_data/compare_globals.py deleted file mode 100644 index bdaf367ae..000000000 --- a/v03_pipeline/lib/reference_data/compare_globals.py +++ /dev/null @@ -1,137 +0,0 @@ -import dataclasses - -import hail as hl - -from v03_pipeline.lib.logger import get_logger -from 
v03_pipeline.lib.model import ( - DatasetType, - ReferenceGenome, -) -from v03_pipeline.lib.reference_data.clinvar import parse_clinvar_release_date -from v03_pipeline.lib.reference_data.config import CONFIG -from v03_pipeline.lib.reference_data.dataset_table_operations import ( - get_all_select_fields, - get_enum_select_fields, - import_ht_from_config_path, -) - -logger = get_logger(__name__) - - -def clinvar_versions_equal( - ht: hl.Table, - reference_genome: ReferenceGenome, - dataset_type: DatasetType, -) -> bool: - dataset = 'clinvar_mito' if dataset_type == DatasetType.MITO else 'clinvar' - return hl.eval(ht.globals.versions[dataset]) == parse_clinvar_release_date( - CONFIG[dataset][reference_genome.v02_value]['source_path'], - ) - - -@dataclasses.dataclass -class Globals: - paths: dict[str, str] - versions: dict[str, str] - enums: dict[str, dict[str, list[str]]] - selects: dict[str, dict[str, hl.dtype]] - - def __getitem__(self, name: str): - return getattr(self, name) - - @classmethod - def from_dataset_configs( - cls, - reference_genome: ReferenceGenome, - datasets: list[str], - ): - paths, versions, enums, selects = {}, {}, {}, {} - for dataset in datasets: - dataset_config = CONFIG[dataset][reference_genome.v02_value] - dataset_ht = import_ht_from_config_path( - dataset_config, - dataset, - reference_genome, - ) - dataset_ht_globals = hl.eval(dataset_ht.globals) - paths[dataset] = dataset_ht_globals.path - versions[dataset] = dataset_ht_globals.version - enums[dataset] = dict(dataset_ht_globals.enums) - dataset_ht = dataset_ht.select( - **get_all_select_fields(dataset_ht, dataset_config), - ) - dataset_ht = dataset_ht.transmute( - **get_enum_select_fields(dataset_ht, dataset_config), - ) - selects[dataset] = { - k: v.dtype - for k, v in dict(dataset_ht.row).items() - if k not in set(dataset_ht.key) - } - return cls(paths, versions, enums, selects) - - @classmethod - def from_ht( - cls, - ht: hl.Table, - datasets: list[str], - ): - rdc_globals_struct = 
hl.eval(ht.globals) - paths = dict(rdc_globals_struct.paths) - versions = dict(rdc_globals_struct.versions) - # enums are nested structs - enums = {k: dict(v) for k, v in rdc_globals_struct.enums.items() if k in paths} - selects = {} - for dataset in datasets: - if dataset in ht.row: - # NB: handle an edge case (mito high constraint) where we annotate a bool from the reference dataset collection - selects[dataset] = ( - {k: v.dtype for k, v in dict(ht[dataset]).items()} - if isinstance(ht[dataset], hl.StructExpression) - else {} - ) - return cls(paths, versions, enums, selects) - - -def validate_selects_types( - ht1_globals: Globals, - ht2_globals: Globals, - dataset: str, -) -> None: - # Assert that all shared annotations have identical types - shared_selects = ( - ht1_globals['selects'][dataset].keys() - & ht2_globals['selects'].get(dataset).keys() - ) - mismatched_select_types = [ - (select, ht2_globals['selects'][dataset][select]) - for select in shared_selects - if ( - ht1_globals['selects'][dataset][select] - != ht2_globals['selects'][dataset][select] - ) - ] - if mismatched_select_types: - msg = f'Unexpected field types detected in {dataset}: {mismatched_select_types}' - raise ValueError(msg) - - -def get_datasets_to_update( - ht1_globals: Globals, - ht2_globals: Globals, - validate_selects: bool = True, -) -> list[str]: - datasets_to_update = set() - for field in dataclasses.fields(Globals): - if field.name == 'selects' and not validate_selects: - continue - datasets_to_update.update( - ht1_globals[field.name].keys() ^ ht2_globals[field.name].keys(), - ) - for dataset in ht1_globals[field.name].keys() & ht2_globals[field.name].keys(): - if field.name == 'selects': - validate_selects_types(ht1_globals, ht2_globals, dataset) - if ht1_globals[field.name][dataset] != ht2_globals[field.name][dataset]: - logger.info(f'{field.name} mismatch for {dataset}') - datasets_to_update.add(dataset) - return sorted(datasets_to_update) diff --git 
a/v03_pipeline/lib/reference_data/compare_globals_test.py b/v03_pipeline/lib/reference_data/compare_globals_test.py deleted file mode 100644 index 786964fcb..000000000 --- a/v03_pipeline/lib/reference_data/compare_globals_test.py +++ /dev/null @@ -1,321 +0,0 @@ -import unittest -from unittest import mock - -import hail as hl - -from v03_pipeline.lib.model import ( - ReferenceGenome, -) -from v03_pipeline.lib.reference_data.compare_globals import ( - Globals, - get_datasets_to_update, -) - -CONFIG = { - 'a': { - '38': { - 'custom_import': None, - 'source_path': 'a_path', # 'a' has a custom import - 'select': { - 'test_select': 'info.test_select', - 'test_enum': 'test_enum', - }, - 'version': 'a_version', - 'enum_select': {'test_enum': ['A', 'B']}, - }, - }, - 'b': { # b is missing version - '38': { - 'path': 'b_path', - 'select': { - 'test_select': 'info.test_select', - 'test_enum': 'test_enum', - }, - 'enum_select': {'test_enum': ['C', 'D']}, - 'custom_select': lambda ht: {'field2': ht.info.test_select_2}, - }, - }, -} - -B_TABLE = hl.Table.parallelize( - [], - schema=hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct( - test_select=hl.tint, - test_select_2=hl.tint, - ), - test_enum=hl.tstr, - ), - globals=hl.Struct( - version='b_version', - path='b_path', - enums=hl.Struct(test_enum=['C', 'D']), - ), - key=['locus', 'alleles'], -) - - -class CompareGlobalsTest(unittest.TestCase): - @mock.patch.dict('v03_pipeline.lib.reference_data.compare_globals.CONFIG', CONFIG) - @mock.patch( - 'v03_pipeline.lib.reference_data.compare_globals.import_ht_from_config_path', - ) - def test_create_globals_from_dataset_configs( - self, - mock_import_dataset_ht, - ): - mock_import_dataset_ht.side_effect = [ - hl.Table.parallelize( - [], - schema=hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct( - test_select=hl.tint, - ), - test_enum=hl.tstr, - ), - globals=hl.Struct( - version='a_version', - path='a_path', - 
enums=hl.Struct(test_enum=['A', 'B']), - ), - key=['locus', 'alleles'], - ), - B_TABLE, - ] - dataset_config_globals = Globals.from_dataset_configs( - reference_genome=ReferenceGenome.GRCh38, - datasets=['a', 'b'], - ) - self.assertTrue( - dataset_config_globals.versions == {'a': 'a_version', 'b': 'b_version'}, - ) - self.assertTrue( - dataset_config_globals.paths == {'a': 'a_path', 'b': 'b_path'}, - ) - self.assertTrue( - dataset_config_globals.enums - == {'a': {'test_enum': ['A', 'B']}, 'b': {'test_enum': ['C', 'D']}}, - ) - self.assertTrue( - dataset_config_globals.selects - == { - 'a': { - 'test_select': hl.tint32, - 'test_enum_id': hl.tint32, - }, - 'b': { - 'test_select': hl.tint32, - 'field2': hl.tint32, - 'test_enum_id': hl.tint32, - }, - }, - ) - - @mock.patch.dict('v03_pipeline.lib.reference_data.compare_globals.CONFIG', CONFIG) - @mock.patch( - 'v03_pipeline.lib.reference_data.dataset_table_operations.hl.read_table', - ) - def test_create_globals_from_dataset_configs_single_dataset(self, mock_read_table): - # by mocking hl.read_table() (only possible for a dataset without a custom import), - # we can test the code inside import_ht_from_config_path() - mock_read_table.return_value = B_TABLE - - dataset_config_globals = Globals.from_dataset_configs( - reference_genome=ReferenceGenome.GRCh38, - datasets=['b'], - ) - - self.assertTrue( - dataset_config_globals.versions == {'b': 'b_version'}, - ) - self.assertTrue( - dataset_config_globals.paths == {'b': 'b_path'}, - ) - self.assertTrue( - dataset_config_globals.enums == {'b': {'test_enum': ['C', 'D']}}, - ) - self.assertTrue( - dataset_config_globals.selects - == { - 'b': { - 'test_select': hl.tint32, - 'field2': hl.tint32, - 'test_enum_id': hl.tint32, - }, - }, - ) - - def test_from_rdc_or_annotations_ht(self): - rdc_ht = hl.Table.parallelize( - [], - schema=hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - gnomad_non_coding_constraint=hl.tstruct( - z_score=hl.tfloat32, - ), - 
screen=hl.tstruct( - region_type_ids=hl.tarray(hl.tint32), - ), - ), - globals=hl.Struct( - paths=hl.Struct( - gnomad_non_coding_constraint='gnomad_non_coding_constraint_path', - screen='screen_path', - ), - versions=hl.Struct( - gnomad_non_coding_constraint='v1', - screen='v2', - ), - enums=hl.Struct( - screen=hl.Struct(region_type=['C', 'D']), - ), - ), - ) - rdc_globals = Globals.from_ht( - rdc_ht, - ['gnomad_non_coding_constraint', 'screen'], - ) - self.assertTrue( - rdc_globals.versions - == {'gnomad_non_coding_constraint': 'v1', 'screen': 'v2'}, - ) - self.assertTrue( - rdc_globals.paths - == { - 'gnomad_non_coding_constraint': 'gnomad_non_coding_constraint_path', - 'screen': 'screen_path', - }, - ) - self.assertTrue( - rdc_globals.enums == {'screen': {'region_type': ['C', 'D']}}, - ) - self.assertTrue( - rdc_globals.selects - == { - 'gnomad_non_coding_constraint': {'z_score': hl.tfloat32}, - 'screen': {'region_type_ids': hl.tarray(hl.tint32)}, - }, - ) - - def test_get_datasets_to_update_version_different(self): - result = get_datasets_to_update( - ht1_globals=Globals( - paths={'a': 'a_path', 'b': 'b_path'}, - # 'a' has a different version, 'c' is missing version in ht2_globals - versions={'a': 'v2', 'b': 'v2', 'c': 'v1'}, - enums={'a': {}, 'b': {}, 'c': {}}, - selects={'a': {}, 'b': {}}, - ), - ht2_globals=Globals( - paths={'a': 'a_path', 'b': 'b_path'}, - versions={'a': 'v1', 'b': 'v2'}, - enums={'a': {}, 'b': {}}, - selects={'a': {}, 'b': {}}, - ), - ) - self.assertTrue(result == ['a', 'c']) - - def test_get_datasets_to_update_path_different(self): - result = get_datasets_to_update( - ht1_globals=Globals( - # 'b' has a different path, 'c' is missing path in ht2_globals - paths={'a': 'a_path', 'b': 'old_b_path', 'c': 'extra_c_path'}, - versions={'a': 'v1', 'b': 'v2'}, - enums={'a': {}, 'b': {}}, - selects={'a': {}, 'b': {}}, - ), - ht2_globals=Globals( - paths={'a': 'a_path', 'b': 'b_path'}, - versions={'a': 'v1', 'b': 'v2'}, - enums={'a': {}, 'b': {}}, - 
selects={'a': {}, 'b': {}}, - ), - ) - self.assertTrue(result == ['b', 'c']) - - def test_get_datasets_to_update_enum_different(self): - result = get_datasets_to_update( - ht1_globals=Globals( - paths={'a': 'a_path', 'b': 'b_path'}, - versions={'a': 'v1', 'b': 'v2'}, - # 'a' has different enum values, 'b' has different enum key, 'c' is missing enum in ht2_globals - enums={ - 'a': {'test_enum': ['A', 'B']}, - 'b': {'enum_key_1': []}, - 'c': {}, - }, - selects={'a': {}, 'b': {}}, - ), - ht2_globals=Globals( - paths={'a': 'a_path', 'b': 'b_path'}, - versions={'a': 'v1', 'b': 'v2'}, - enums={'a': {'test_enum': ['C', 'D']}, 'b': {'enum_key_2': []}}, - selects={'a': {}, 'b': {}}, - ), - ) - self.assertTrue(result == ['a', 'b', 'c']) - - def test_get_datasets_to_update_select_different(self): - result = get_datasets_to_update( - ht1_globals=Globals( - paths={'a': 'a_path', 'b': 'b_path'}, - versions={'a': 'v1', 'b': 'v2'}, - enums={'a': {}, 'b': {}}, - # 'a' has extra select, 'b' has different select, 'c' is missing select in ht2_globals - selects={ - 'a': {'field1': hl.tint32, 'field2': hl.tint32}, - 'b': {'test_select': hl.tint32}, - 'c': {'test_select': hl.tint32}, - }, - ), - ht2_globals=Globals( - paths={'a': 'a_path', 'b': 'b_path'}, - versions={'a': 'v1', 'b': 'v2'}, - enums={'a': {}, 'b': {}}, - selects={'a': {'field1': hl.tint32}, 'b': {'test_select_2': hl.tint32}}, - ), - ) - self.assertTrue(result == ['a', 'b', 'c']) - - def test_get_datasets_to_update_select_type_validation(self): - self.assertRaisesRegex( - ValueError, - "Unexpected field types detected in a: \\[\\('field1', dtype\\('int32'\\)\\)\\]", - get_datasets_to_update, - ht1_globals=Globals( - paths={'a': 'a_path'}, - versions={'a': 'v1'}, - enums={'a': {}}, - selects={ - 'a': {'field1': hl.tarray(hl.tint32)}, - }, - ), - ht2_globals=Globals( - paths={'a': 'a_path'}, - versions={'a': 'v1'}, - enums={'a': {}}, - selects={'a': {'field1': hl.tint32, 'field2': hl.tint32}}, - ), - ) - result = 
get_datasets_to_update( - ht1_globals=Globals( - paths={'a': 'a_path'}, - versions={'a': 'v1'}, - enums={'a': {}}, - selects={ - 'a': {'field1': hl.tarray(hl.tint32)}, - }, - ), - ht2_globals=Globals( - paths={'a': 'a_path'}, - versions={'a': 'v1'}, - enums={'a': {}}, - selects={'a': {'field1': hl.tarray(hl.tint32), 'field2': hl.tint32}}, - ), - ) - self.assertTrue(result == ['a']) diff --git a/v03_pipeline/lib/reference_data/config.py b/v03_pipeline/lib/reference_data/config.py deleted file mode 100644 index 047532c0c..000000000 --- a/v03_pipeline/lib/reference_data/config.py +++ /dev/null @@ -1,549 +0,0 @@ -from typing import Any - -import hail as hl - -from v03_pipeline.lib.annotations.enums import ( - CLINVAR_DEFAULT_PATHOGENICITY, - CLINVAR_PATHOGENICITIES, - CLINVAR_PATHOGENICITIES_LOOKUP, -) -from v03_pipeline.lib.model.definitions import ReferenceGenome -from v03_pipeline.lib.reference_data.clinvar import ( - CLINVAR_ASSERTIONS, - CLINVAR_GOLD_STARS_LOOKUP, - get_clinvar_ht, - parsed_and_mapped_clnsigconf, - parsed_clnsig, -) -from v03_pipeline.lib.reference_data.hgmd import download_and_import_hgmd_vcf -from v03_pipeline.lib.reference_data.mito import ( - download_and_import_local_constraint_tsv, -) - - -def import_locus_intervals( - url: str, - reference_genome: ReferenceGenome, -) -> hl.Table: - return hl.import_locus_intervals(url, reference_genome.value) - - -def import_matrix_table( - url: str, - _: Any, -) -> hl.Table: - return hl.read_matrix_table(url).rows() - - -def predictor_parse(field: hl.StringExpression): - return field.split(';').find(lambda p: p != '.') - - -def clinvar_custom_select(ht): - selects = {} - clnsigs = parsed_clnsig(ht) - selects['pathogenicity'] = hl.if_else( - CLINVAR_PATHOGENICITIES_LOOKUP.contains(clnsigs[0]), - clnsigs[0], - CLINVAR_DEFAULT_PATHOGENICITY, - ) - selects['assertion'] = hl.if_else( - CLINVAR_PATHOGENICITIES_LOOKUP.contains(clnsigs[0]), - clnsigs[1:], - clnsigs, - ) - # NB: the `enum_select` does not support 
mapping a list of tuples - # so there's a hidden enum-mapping inside this clinvar function. - selects['conflictingPathogenicities'] = parsed_and_mapped_clnsigconf(ht) - selects['goldStars'] = CLINVAR_GOLD_STARS_LOOKUP.get(hl.delimit(ht.info.CLNREVSTAT)) - selects['submitters'] = ht.submitters - selects['conditions'] = hl.map( - lambda p: p.split(r':')[1], - ht.conditions, - ) # assumes the format 'MedGen#:condition', e.g.'C0023264:Leigh syndrome' - return selects - - -def dbnsfp_custom_select(ht): - selects = {} - selects['REVEL_score'] = hl.parse_float32(ht.REVEL_score) - selects['SIFT_score'] = hl.parse_float32(predictor_parse(ht.SIFT_score)) - selects['Polyphen2_HVAR_score'] = hl.parse_float32( - predictor_parse(ht.Polyphen2_HVAR_score), - ) - selects['MutationTaster_pred'] = predictor_parse(ht.MutationTaster_pred) - return selects - - -def dbnsfp_custom_select_38(ht): - selects = dbnsfp_custom_select(ht) - selects['VEST4_score'] = hl.parse_float32(predictor_parse(ht.VEST4_score)) - selects['MutPred_score'] = hl.parse_float32(ht.MutPred_score) - selects['fathmm_MKL_coding_score'] = hl.float32(ht.fathmm_MKL_coding_score) - return selects - - -def dbnsfp_mito_custom_select(ht): - selects = {} - selects['SIFT_score'] = hl.parse_float32(predictor_parse(ht.SIFT_score)) - selects['MutationTaster_pred'] = predictor_parse(ht.MutationTaster_pred) - return selects - - -def custom_gnomad_mito(ht): - selects = {} - selects['AN'] = hl.int32(ht.AN) - selects['AC_hom'] = hl.int32(ht.AC_hom) - selects['AC_het'] = hl.int32(ht.AC_het) - selects['AF_hom'] = ht.AF_hom - selects['AF_het'] = ht.AF_het - selects['max_hl'] = ht.max_hl - return selects - - -def custom_gnomad_select_v2(ht): - """ - Custom select for public gnomad v2 dataset (which we did not generate). Extracts fields like - 'AF', 'AN', and generates 'hemi'. 
- :param ht: hail table - :return: select expression dict - """ - selects = {} - global_idx = hl.eval(ht.globals.freq_index_dict['gnomad']) - selects['AF'] = hl.float32(ht.freq[global_idx].AF) - selects['AN'] = ht.freq[global_idx].AN - selects['AC'] = ht.freq[global_idx].AC - selects['Hom'] = ht.freq[global_idx].homozygote_count - - selects['AF_POPMAX_OR_GLOBAL'] = hl.float32( - hl.or_else( - ht.popmax[ht.globals.popmax_index_dict['gnomad']].AF, - ht.freq[global_idx].AF, - ), - ) - selects['FAF_AF'] = hl.float32(ht.faf[ht.globals.popmax_index_dict['gnomad']].faf95) - selects['Hemi'] = hl.if_else( - ht.locus.in_autosome_or_par(), - 0, - ht.freq[ht.globals.freq_index_dict['gnomad_male']].AC, - ) - return selects - - -def custom_gnomad_select_v4(ht): - """ - Custom select for public gnomad v4 dataset (which we did not generate). Extracts fields like - 'AF', 'AN', and generates 'hemi'. - :param ht: hail table - :return: select expression dict - """ - selects = {} - global_idx = hl.eval(ht.globals.freq_index_dict['adj']) - selects['AF'] = hl.float32(ht.freq[global_idx].AF) - selects['AN'] = ht.freq[global_idx].AN - selects['AC'] = ht.freq[global_idx].AC - selects['Hom'] = ht.freq[global_idx].homozygote_count - - grpmax_af = ht.grpmax['gnomad'].AF if hasattr(ht.grpmax, 'gnomad') else ht.grpmax.AF - selects['AF_POPMAX_OR_GLOBAL'] = hl.float32( - hl.or_else(grpmax_af, ht.freq[global_idx].AF), - ) - selects['FAF_AF'] = hl.float32(ht.faf[ht.globals.faf_index_dict['adj']].faf95) - selects['Hemi'] = hl.if_else( - ht.locus.in_autosome_or_par(), - 0, - ht.freq[ht.globals.freq_index_dict['XY_adj']].AC, - ) - return selects - - -def custom_mpc_select(ht): - selects = {} - selects['MPC'] = hl.parse_float32(ht.info.MPC) - return selects - - -""" -Configurations of dataset to combine. 
-Format: -'': { - '': { - 'path': 'gs://path/to/hailtable.ht', - 'select': '', - 'custom_select': '', - 'enum_select': '' - 'custom_import': '', - 'source_path': '' - }, -""" -CONFIG = { - 'cadd': { - '37': { - 'version': 'v1.6', - 'path': 'gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - 'select': ['PHRED'], - }, - '38': { - 'version': 'v1.6', - 'path': 'gs://seqr-reference-data/GRCh38/CADD/CADD_snvs_and_indels.v1.6.ht', - 'select': ['PHRED'], - }, - }, - 'clinvar': { - '37': { - 'custom_import': get_clinvar_ht, - 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', - 'select': {'alleleId': 'info.ALLELEID'}, - 'custom_select': clinvar_custom_select, - 'enum_select': { - 'pathogenicity': CLINVAR_PATHOGENICITIES, - 'assertion': CLINVAR_ASSERTIONS, - }, - 'filter': lambda ht: ht.locus.contig != 'MT', - }, - '38': { - 'custom_import': get_clinvar_ht, - 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', - 'select': {'alleleId': 'info.ALLELEID'}, - 'custom_select': clinvar_custom_select, - 'enum_select': { - 'pathogenicity': CLINVAR_PATHOGENICITIES, - 'assertion': CLINVAR_ASSERTIONS, - }, - 'filter': lambda ht: ht.locus.contig != 'chrM', - }, - }, - 'dbnsfp': { - '37': { - 'version': '2.9.3', - 'path': 'gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.with_new_scores.ht', - 'custom_select': dbnsfp_custom_select, - 'enum_select': { - 'MutationTaster_pred': ['D', 'A', 'N', 'P'], - }, - 'filter': lambda ht: ht.locus.contig != 'MT', - }, - '38': { - 'version': '4.2', - 'path': 'gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', - 'custom_select': dbnsfp_custom_select_38, - 'enum_select': { - 'MutationTaster_pred': ['D', 'A', 'N', 'P'], - }, - 'filter': lambda ht: ht.locus.contig != 'chrM', - }, - }, - 'eigen': { - '37': { - 'path': 'gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', - 'select': {'Eigen_phred': 
'info.Eigen-phred'}, - }, - '38': { - 'path': 'gs://seqr-reference-data/GRCh38/eigen/EIGEN_coding_noncoding.liftover_grch38.ht', - 'select': {'Eigen_phred': 'info.Eigen-phred'}, - }, - }, - 'hgmd': { - '37': { - 'custom_import': download_and_import_hgmd_vcf, - 'version': 'HGMD_Pro_2023', - 'source_path': 'gs://seqr-reference-data-private/GRCh37/HGMD/HGMD_Pro_2023.1_hg19.vcf.gz', - 'select': {'accession': 'rsid', 'class': 'info.CLASS'}, - 'enum_select': { - 'class': [ - 'DM', - 'DM?', - 'DP', - 'DFP', - 'FP', - 'R', - ], - }, - }, - '38': { - 'custom_import': download_and_import_hgmd_vcf, - 'version': 'HGMD_Pro_2023', - 'source_path': 'gs://seqr-reference-data-private/GRCh38/HGMD/HGMD_Pro_2023.1_hg38.vcf.gz', - 'select': {'accession': 'rsid', 'class': 'info.CLASS'}, - 'enum_select': { - 'class': [ - 'DM', - 'DM?', - 'DP', - 'DFP', - 'FP', - 'R', - ], - }, - }, - }, - 'mpc': { - '37': { - 'path': 'gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.ht', - 'custom_select': custom_mpc_select, - }, - '38': { - 'path': 'gs://seqr-reference-data/GRCh38/MPC/fordist_constraint_official_mpc_values.liftover.GRCh38.ht', - 'custom_select': custom_mpc_select, - }, - }, - 'primate_ai': { - '37': { - 'version': 'v0.2', - 'path': 'gs://seqr-reference-data/GRCh37/primate_ai/PrimateAI_scores_v0.2.ht', - 'select': {'score': 'info.score'}, - }, - '38': { - 'version': 'v0.2', - 'path': 'gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.ht', - 'select': {'score': 'info.score'}, - }, - }, - 'splice_ai': { - '37': { - 'path': 'gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.ht', - 'select': { - 'delta_score': 'info.max_DS', - 'splice_consequence': 'info.splice_consequence', - }, - 'enum_select': { - 'splice_consequence': [ - 'Acceptor gain', - 'Acceptor loss', - 'Donor gain', - 'Donor loss', - 'No consequence', - ], - }, - }, - '38': { - 'path': 'gs://seqr-reference-data/GRCh38/spliceai/spliceai_scores.ht', - 'select': { - 
'delta_score': 'info.max_DS', - 'splice_consequence': 'info.splice_consequence', - }, - 'enum_select': { - 'splice_consequence': [ - 'Acceptor gain', - 'Acceptor loss', - 'Donor gain', - 'Donor loss', - 'No consequence', - ], - }, - }, - }, - 'topmed': { - '37': { - 'path': 'gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.ht', - 'select': { - 'AC': 'info.AC#', - 'AF': 'info.AF#', - 'AN': 'info.AN', - 'Hom': 'info.Hom#', - 'Het': 'info.Het#', - }, - }, - '38': { - 'path': 'gs://seqr-reference-data/GRCh38/TopMed/freeze8/TOPMed.all.ht', - 'select': { - 'AC': 'info.AC', - 'AF': 'info.AF', - 'AN': 'info.AN', - 'Hom': 'info.Hom', - 'Het': 'info.Het', - }, - }, - }, - 'gnomad_exomes': { - '37': { - 'version': 'r2.1.1', - 'path': 'gs://gcp-public-data--gnomad/release/2.1.1/ht/exomes/gnomad.exomes.r2.1.1.sites.ht', - 'custom_select': custom_gnomad_select_v2, - }, - '38': { - 'version': '4.1', - 'path': 'gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht', - 'custom_select': custom_gnomad_select_v4, - }, - }, - 'gnomad_genomes': { - '37': { - 'version': 'r2.1.1', - 'path': 'gs://gcp-public-data--gnomad/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht', - 'custom_select': custom_gnomad_select_v2, - }, - '38': { - 'version': '4.1', - 'path': 'gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht', - 'custom_select': custom_gnomad_select_v4, - }, - }, - 'gnomad_qc': { - '37': { - 'version': 'v2', - 'custom_import': import_matrix_table, - # Note: copied from 'gs://gnomad/sample_qc/mt/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt' - 'source_path': 'gs://seqr-reference-data/gnomad_qc/GRCh37/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt', - }, - '38': { - 'version': '4.0', - 'path': 'gs://gcp-public-data--gnomad/release/4.0/pca/gnomad.v4.0.pca_loadings.ht', - }, - }, - 'exac': { - '37': { - 'path': 
'gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', - 'select': { - 'AF_POPMAX': 'info.AF_POPMAX', - 'AF': 'info.AF#', - 'AC_Adj': 'info.AC_Adj#', - 'AC_Het': 'info.AC_Het#', - 'AC_Hom': 'info.AC_Hom#', - 'AC_Hemi': 'info.AC_Hemi#', - 'AN_Adj': 'info.AN_Adj', - }, - }, - '38': { - 'path': 'gs://seqr-reference-data/GRCh38/gnomad/ExAC.r1.sites.liftover.b38.ht', - 'select': { - 'AF_POPMAX': 'info.AF_POPMAX', - 'AF': 'info.AF#', - 'AC_Adj': 'info.AC_Adj#', - 'AC_Het': 'info.AC_Het#', - 'AC_Hom': 'info.AC_Hom#', - 'AC_Hemi': 'info.AC_Hemi#', - 'AN_Adj': 'info.AN_Adj', - }, - }, - }, - 'gnomad_non_coding_constraint': { - '38': { - 'path': 'gs://seqr-reference-data/GRCh38/gnomad_nc_constraint/gnomad_non-coding_constraint_z_scores.ht', - 'select': {'z_score': 'target'}, - }, - }, - 'screen': { - '38': { - 'path': 'gs://seqr-reference-data/GRCh38/ccREs/GRCh38-ccREs.ht', - 'select': {'region_type': 'target'}, - 'enum_select': { - 'region_type': [ - 'CTCF-bound', - 'CTCF-only', - 'DNase-H3K4me3', - 'PLS', - 'dELS', - 'pELS', - 'DNase-only', - 'low-DNase', - ], - }, - }, - }, - 'clinvar_mito': { - '37': { - 'custom_import': get_clinvar_ht, - 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', - 'select': {'alleleId': 'info.ALLELEID'}, - 'custom_select': clinvar_custom_select, - 'enum_select': { - 'pathogenicity': CLINVAR_PATHOGENICITIES, - 'assertion': CLINVAR_ASSERTIONS, - }, - 'filter': lambda ht: ht.locus.contig == 'MT', - }, - '38': { - 'custom_import': get_clinvar_ht, - 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', - 'select': {'alleleId': 'info.ALLELEID'}, - 'custom_select': clinvar_custom_select, - 'enum_select': { - 'pathogenicity': CLINVAR_PATHOGENICITIES, - 'assertion': CLINVAR_ASSERTIONS, - }, - 'filter': lambda ht: ht.locus.contig == 'chrM', - }, - }, - 'dbnsfp_mito': { - '37': { - 'version': '2.9.3', - 'path': 
'gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.with_new_scores.ht', - 'custom_select': dbnsfp_mito_custom_select, - 'enum_select': { - 'MutationTaster_pred': ['D', 'A', 'N', 'P'], - }, - 'filter': lambda ht: ht.locus.contig == 'MT', - }, - '38': { - 'version': '4.2', - 'path': 'gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', - 'custom_select': dbnsfp_mito_custom_select, - 'enum_select': { - 'MutationTaster_pred': ['D', 'A', 'N', 'P'], - }, - 'filter': lambda ht: ht.locus.contig == 'chrM', - }, - }, - 'gnomad_mito': { - '38': { - 'version': 'v3.1', - 'path': 'gs://gcp-public-data--gnomad/release/3.1/ht/genomes/gnomad.genomes.v3.1.sites.chrM.ht', - 'custom_select': custom_gnomad_mito, - }, - }, - 'mitomap': { - '38': { - 'version': 'Feb. 04 2022', - 'path': 'gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht', - 'select': ['pathogenic'], - }, - }, - 'mitimpact': { - '38': { - 'version': '3.1.3', - 'path': 'gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.1.3.ht', - 'select': {'score': 'APOGEE2_score'}, - }, - }, - 'hmtvar': { - '38': { - 'version': 'Jan. 
10 2022', - 'path': 'gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht', - 'select': {'score': 'disease_score'}, - }, - }, - 'helix_mito': { - '38': { - 'version': '20200327', - 'path': 'gs://seqr-reference-data/GRCh38/mitochondrial/Helix/HelixMTdb_20200327.ht', - 'select': { - 'AC_hom': 'counts_hom', - 'AF_hom': 'AF_hom', - 'AC_het': 'counts_het', - 'AF_het': 'AF_het', - 'AN': 'AN', - 'max_hl': 'max_ARF', - }, - }, - }, - 'high_constraint_region_mito': { - '38': { - 'version': 'Feb-15-2022', - 'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', - 'custom_import': import_locus_intervals, - }, - }, - 'local_constraint_mito': { - '38': { - 'version': '2024-07-24', - # Originally sourced from https://www.biorxiv.org/content/10.1101/2022.12.16.520778v2.supplementary-material - # Supplementary Table 7. - 'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', - 'custom_import': download_and_import_local_constraint_tsv, - 'select': {'score': 'MLC_score'}, - }, - }, -} diff --git a/v03_pipeline/lib/reference_data/dataset_table_operations.py b/v03_pipeline/lib/reference_data/dataset_table_operations.py deleted file mode 100644 index 7d4d44b67..000000000 --- a/v03_pipeline/lib/reference_data/dataset_table_operations.py +++ /dev/null @@ -1,218 +0,0 @@ -from datetime import datetime -from types import FunctionType - -import hail as hl -import pytz - -from v03_pipeline.lib.misc.nested_field import parse_nested_field -from v03_pipeline.lib.model import ( - DatasetType, - ReferenceDatasetCollection, - ReferenceGenome, -) -from v03_pipeline.lib.reference_data.config import CONFIG - - -def update_or_create_joined_ht( - reference_dataset_collection: ReferenceDatasetCollection, - dataset_type: DatasetType, - reference_genome: ReferenceGenome, - datasets: list[str], - joined_ht: hl.Table, -) -> hl.Table: - for dataset in datasets: - # Drop the dataset if it exists. 
- if dataset in joined_ht.row: - joined_ht = joined_ht.drop(dataset) - joined_ht = joined_ht.annotate_globals( - paths=joined_ht.paths.drop(dataset), - versions=joined_ht.versions.drop(dataset), - enums=joined_ht.enums.drop(dataset), - ) - - # Handle cases where a dataset has been dropped OR renamed. - if dataset not in CONFIG: - continue - - # Join the new one! - dataset_ht = get_dataset_ht(dataset, reference_genome) - joined_ht = joined_ht.join(dataset_ht, 'outer') - joined_ht = annotate_dataset_globals(joined_ht, dataset, dataset_ht) - - return joined_ht.filter( - hl.any( - [ - ~hl.is_missing(joined_ht[dataset]) - for dataset in reference_dataset_collection.datasets(dataset_type) - ], - ), - ) - - -def get_dataset_ht( - dataset: str, - reference_genome: ReferenceGenome, -) -> hl.Table: - config = CONFIG[dataset][reference_genome.v02_value] - ht = import_ht_from_config_path(config, dataset, reference_genome) - if hasattr(ht, 'locus'): - ht = ht.filter( - hl.set(reference_genome.standard_contigs).contains(ht.locus.contig), - ) - - ht = ht.filter(config['filter'](ht)) if 'filter' in config else ht - ht = ht.select(**get_all_select_fields(ht, config)) - ht = ht.transmute(**get_enum_select_fields(ht, config)) - return ht.select(**{dataset: ht.row.drop(*ht.key)}).distinct() - - -def get_ht_path(config: dict) -> str: - return config['source_path'] if 'custom_import' in config else config['path'] - - -def import_ht_from_config_path( - config: dict, - dataset: str, - reference_genome: ReferenceGenome, -) -> hl.Table: - path = get_ht_path(config) - ht = ( - config['custom_import'](path, reference_genome) - if 'custom_import' in config - else hl.read_table(path) - ) - return ht.annotate_globals( - path=path, - version=parse_dataset_version(ht, dataset, config), - enums=hl.Struct( - **config.get( - 'enum_select', - hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr))), - ), - ), - ) - - -def get_select_fields(selects: list | dict | None, base_ht: hl.Table) -> dict: - """ - 
Generic function that takes in a select config and base_ht and generates a - select dict that is generated from traversing the base_ht and extracting the right - annotation. If '#' is included at the end of a select field, the appropriate - biallelic position will be selected (e.g. 'x#' -> x[base_ht.a_index-1]. - :param selects: mapping or list of selections - :param base_ht: base_ht to traverse - :return: select mapping from annotation name to base_ht annotation - """ - select_fields = {} - if selects is None: - return select_fields - if isinstance(selects, list): - select_fields = {selection: base_ht[selection] for selection in selects} - elif isinstance(selects, dict): - for key, val in selects.items(): - expression = parse_nested_field(base_ht, val) - # Parse float64s into float32s to save space! - if expression.dtype == hl.tfloat64: - expression = hl.float32(expression) - select_fields[key] = expression - return select_fields - - -def get_custom_select_fields(custom_select: FunctionType | None, ht: hl.Table) -> dict: - if custom_select is None: - return {} - return custom_select(ht) - - -def get_all_select_fields( - ht: hl.Table, - config: dict, -) -> dict: - return { - **get_select_fields(config.get('select'), ht), - **get_custom_select_fields(config.get('custom_select'), ht), - } - - -def get_enum_select_fields(ht: hl.Table, config: dict) -> dict: - enum_selects = config.get('enum_select') - enum_select_fields = {} - if enum_selects is None: - return enum_select_fields - for field_name, values in enum_selects.items(): - lookup = hl.dict( - hl.enumerate(values, index_first=False).extend( - # NB: adding missing values here allows us to - # hard fail if a mapped key is present and has an unexpected value - # but propagate missing values. - [(hl.missing(hl.tstr), hl.missing(hl.tint32))], - ), - ) - # NB: this conditioning on type is "outside" the hail expression context. 
- if ( - isinstance(ht[field_name].dtype, hl.tarray | hl.tset) - and ht[field_name].dtype.element_type == hl.tstr - ): - enum_select_fields[f'{field_name}_ids'] = ht[field_name].map( - lambda x: lookup[x], # noqa: B023 - ) - else: - enum_select_fields[f'{field_name}_id'] = lookup[ht[field_name]] - return enum_select_fields - - -def parse_dataset_version( - ht: hl.Table, - dataset: str, - config: dict, -) -> hl.StringExpression: - annotated_version = ht.globals.get('version', hl.missing(hl.tstr)) - config_version = config.get('version', hl.missing(hl.tstr)) - return ( - hl.case() - .when(hl.is_missing(config_version), annotated_version) - .when(hl.is_missing(annotated_version), config_version) - .when(annotated_version == config_version, config_version) - .or_error( - hl.format( - 'found mismatching versions for dataset %s. config version: %s, ht version: %s', - dataset, - config_version, - annotated_version, - ), - ) - ) - - -def annotate_dataset_globals(joined_ht: hl.Table, dataset: str, dataset_ht: hl.Table): - return joined_ht.select_globals( - paths=joined_ht.paths.annotate(**{dataset: dataset_ht.index_globals().path}), - versions=joined_ht.versions.annotate( - **{dataset: dataset_ht.index_globals().version}, - ), - enums=joined_ht.enums.annotate(**{dataset: dataset_ht.index_globals().enums}), - date=datetime.now(tz=pytz.timezone('US/Eastern')).isoformat(), - ) - - -def join_hts( - reference_genome: ReferenceGenome, - dataset_type: DatasetType, - reference_dataset_collection: ReferenceDatasetCollection, -): - key_type = reference_dataset_collection.table_key_type(reference_genome) - joined_ht = hl.Table.parallelize( - [], - key_type, - key=key_type.fields, - globals=hl.Struct( - paths=hl.Struct(), - versions=hl.Struct(), - enums=hl.Struct(), - ), - ) - for dataset in reference_dataset_collection.datasets(dataset_type): - dataset_ht = get_dataset_ht(dataset, reference_genome) - joined_ht = joined_ht.join(dataset_ht, 'outer') - joined_ht = 
annotate_dataset_globals(joined_ht, dataset, dataset_ht) - return joined_ht diff --git a/v03_pipeline/lib/reference_data/dataset_table_operations_test.py b/v03_pipeline/lib/reference_data/dataset_table_operations_test.py deleted file mode 100644 index f1376c8ff..000000000 --- a/v03_pipeline/lib/reference_data/dataset_table_operations_test.py +++ /dev/null @@ -1,585 +0,0 @@ -import unittest -from datetime import datetime -from unittest import mock - -import hail as hl -import pytz - -from v03_pipeline.lib.model import ( - DatasetType, - ReferenceDatasetCollection, - ReferenceGenome, -) -from v03_pipeline.lib.reference_data.config import ( - dbnsfp_custom_select, - dbnsfp_mito_custom_select, -) -from v03_pipeline.lib.reference_data.dataset_table_operations import ( - get_dataset_ht, - get_enum_select_fields, - update_or_create_joined_ht, -) - -MOCK_CONFIG = { - 'a': { - '38': { - 'path': '', - 'select': [ - 'd', - ], - }, - }, - 'b': { - '38': { - 'path': '', - 'select': [ - 'e', - ], - 'enum_select': {}, - }, - }, -} -MOCK_JOINED_REFERENCE_DATA_HT = hl.Table.parallelize( - [ - { - 'locus': hl.Locus( - contig='chr1', - position=1, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - 'a': hl.Struct(d=1), - 'b': hl.Struct(e=2), - }, - { - 'locus': hl.Locus( - contig='chr1', - position=2, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - 'a': hl.Struct(d=3), - 'b': hl.Struct(e=4), - }, - ], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - a=hl.tstruct(d=hl.tint32), - b=hl.tstruct(e=hl.tint32), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - paths=hl.Struct( - a='a_path', - b='b_path', - ), - versions=hl.Struct( - a='a_version', - b='b_version', - ), - enums=hl.Struct( - a=hl.Struct(), - b=hl.Struct(), - ), - ), -) -MOCK_A_DATASET_HT = hl.Table.parallelize( - [ - { - 'locus': hl.Locus( - contig='chr1', - position=1, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - 'a': hl.Struct(d=1), - }, - { - 'locus': 
hl.Locus( - contig='chr1', - position=2, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - 'a': hl.Struct(d=3), - }, - ], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - a=hl.tstruct(d=hl.tint32), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - path='a_path', - version='a_version', - enums=hl.Struct(), - ), -) -MOCK_B_DATASET_HT = hl.Table.parallelize( - [ - { - 'locus': hl.Locus( - contig='chr1', - position=1, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - 'b': hl.Struct(e=5, f=1), - }, - { - 'locus': hl.Locus( - contig='chr1', - position=3, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - 'b': hl.Struct(e=7, f=2), - }, - ], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - b=hl.tstruct(e=hl.tint32, f=hl.tint32), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - path='b_new_path', - version='b_new_version', - enums=hl.Struct( - enum_1=[ - 'D', - 'F', - ], - ), - ), -) -EXPECTED_JOINED_DATA = [ - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=1, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - a=hl.Struct(d=1), - b=hl.Struct(e=5, f=1), - ), - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=2, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - a=hl.Struct(d=3), - b=None, - ), - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=3, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - a=None, - b=hl.Struct(e=7, f=2), - ), -] -EXPECTED_GLOBALS = [ - hl.Struct( - date='2023-04-19T16:43:39.361110-04:56', - paths=hl.Struct( - a='a_path', - b='b_new_path', - ), - versions=hl.Struct( - a='a_version', - b='b_new_version', - ), - enums=hl.Struct( - a=hl.Struct(), - b=hl.Struct( - enum_1=[ - 'D', - 'F', - ], - ), - ), - ), -] - -MOCK_DATETIME = datetime( - 2023, - 4, - 19, - 16, - 43, - 39, - 361110, - tzinfo=pytz.timezone('US/Eastern'), -) - -PATH_TO_FILE_UNDER_TEST = 'v03_pipeline.lib.reference_data.dataset_table_operations' - - 
-class DatasetTableOperationsTest(unittest.TestCase): - def test_get_enum_select_fields(self): - ht = hl.Table.parallelize( - [ - {'variant': ['1', '2'], 'sv_type': 'a', 'sample_fix': '1'}, - { - 'variant': ['1', '3', '2'], - 'sv_type': 'b', - 'sample_fix': '2', - }, - {'variant': ['1', '3'], 'sv_type': 'c', 'sample_fix': '3'}, - {'variant': ['4'], 'sv_type': 'd', 'sample_fix': '4'}, - ], - hl.tstruct( - variant=hl.dtype('array'), - sv_type=hl.dtype('str'), - sample_fix=hl.dtype('str'), - ), - ) - enum_select_fields = get_enum_select_fields( - ht, - { - 'enum_select': { - 'variant': ['1', '2', '3', '4'], - 'sv_type': ['a', 'b', 'c', 'd'], - }, - }, - ) - mapped_ht = ht.transmute(**enum_select_fields) - self.assertListEqual( - mapped_ht.collect(), - [ - hl.Struct(variant_ids=[0, 1], sv_type_id=0, sample_fix='1'), - hl.Struct(variant_ids=[0, 2, 1], sv_type_id=1, sample_fix='2'), - hl.Struct(variant_ids=[0, 2], sv_type_id=2, sample_fix='3'), - hl.Struct(variant_ids=[3], sv_type_id=3, sample_fix='4'), - ], - ) - - enum_select_fields = get_enum_select_fields( - ht, - { - 'enum_select': {'sv_type': ['d']}, - }, - ) - mapped_ht = ht.select(**enum_select_fields) - self.assertRaises(Exception, mapped_ht.collect) - - @mock.patch.dict( - f'{PATH_TO_FILE_UNDER_TEST}.CONFIG', - { - 'mock_dbnsfp': { - '38': { - 'path': '', - 'select': [ - 'fathmm_MKL_coding_pred', - ], - 'custom_select': dbnsfp_custom_select, - 'enum_select': { - 'MutationTaster_pred': ['D', 'A', 'N', 'P'], - 'fathmm_MKL_coding_pred': ['D', 'N'], - }, - }, - }, - 'mock_dbnsfp_mito': { - '38': { - 'path': '', - 'custom_select': dbnsfp_mito_custom_select, - 'enum_select': { - 'MutationTaster_pred': ['D', 'A', 'N', 'P'], - }, - 'filter': lambda ht: ht.locus.contig == 'chrM', - }, - }, - }, - ) - @mock.patch(f'{PATH_TO_FILE_UNDER_TEST}.hl.read_table') - def test_dbnsfp_select_and_filter(self, mock_read_table): - mock_read_table.return_value = hl.Table.parallelize( - [ - { - 'locus': hl.Locus( - contig='chr1', - 
position=1, - reference_genome='GRCh38', - ), - 'REVEL_score': hl.missing(hl.tstr), - 'SIFT_score': '.;0.082', - 'Polyphen2_HVAR_score': '.;0.401', - 'MutationTaster_pred': 'P', - 'fathmm_MKL_coding_pred': 'N', - }, - { - 'locus': hl.Locus( - contig='chrM', - position=2, - reference_genome='GRCh38', - ), - 'REVEL_score': '0.052', - 'SIFT_score': '.;0.082', - 'Polyphen2_HVAR_score': '.;0.401', - 'MutationTaster_pred': 'P', - 'fathmm_MKL_coding_pred': 'D', - }, - ], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - REVEL_score=hl.tstr, - SIFT_score=hl.tstr, - Polyphen2_HVAR_score=hl.tstr, - MutationTaster_pred=hl.tstr, - fathmm_MKL_coding_pred=hl.tstr, - ), - key='locus', - ) - ht = get_dataset_ht( - 'mock_dbnsfp', - ReferenceGenome.GRCh38, - ) - self.assertCountEqual( - ht.collect(), - [ - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=1, - reference_genome='GRCh38', - ), - mock_dbnsfp=hl.Struct( - REVEL_score=None, - SIFT_score=hl.eval(hl.float32(0.082)), - Polyphen2_HVAR_score=hl.eval(hl.float32(0.401)), - MutationTaster_pred_id=3, - fathmm_MKL_coding_pred_id=1, - ), - ), - hl.Struct( - locus=hl.Locus( - contig='chrM', - position=2, - reference_genome='GRCh38', - ), - mock_dbnsfp=hl.Struct( - REVEL_score=hl.eval(hl.float32(0.052)), - SIFT_score=hl.eval(hl.float32(0.082)), - Polyphen2_HVAR_score=hl.eval(hl.float32(0.401)), - MutationTaster_pred_id=3, - fathmm_MKL_coding_pred_id=0, - ), - ), - ], - ) - ht = get_dataset_ht( - 'mock_dbnsfp_mito', - ReferenceGenome.GRCh38, - ) - self.assertCountEqual( - ht.collect(), - [ - hl.Struct( - locus=hl.Locus( - contig='chrM', - position=2, - reference_genome='GRCh38', - ), - mock_dbnsfp_mito=hl.Struct( - SIFT_score=hl.eval(hl.float32(0.0820000022649765)), - MutationTaster_pred_id=3, - ), - ), - ], - ) - - @mock.patch.dict( - f'{PATH_TO_FILE_UNDER_TEST}.CONFIG', - { - 'a': { - '38': { - 'path': 'gs://a.com', - 'select': ['b'], - 'version': '2.2.2', - }, - }, - }, - ) - 
@mock.patch(f'{PATH_TO_FILE_UNDER_TEST}.hl.read_table') - def test_parse_version(self, mock_read_table): - ht = hl.Table.parallelize( - [ - { - 'locus': hl.Locus( - contig='chr1', - position=1, - reference_genome='GRCh38', - ), - 'b': 1, - }, - { - 'locus': hl.Locus( - contig='chr1', - position=2, - reference_genome='GRCh38', - ), - 'b': 2, - }, - ], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - b=hl.tint32, - ), - key=['locus'], - globals=hl.Struct( - version='2.2.2', - ), - ) - mock_read_table.return_value = ht - self.assertCountEqual( - get_dataset_ht( - 'a', - ReferenceGenome.GRCh38, - ).globals.collect(), - [ - hl.Struct( - path='gs://a.com', - version='2.2.2', - enums=hl.Struct(), - ), - ], - ) - mock_read_table.return_value = ht.annotate_globals(version=hl.missing(hl.tstr)) - - self.assertCountEqual( - get_dataset_ht( - 'a', - ReferenceGenome.GRCh38, - ).globals.collect(), - [ - hl.Struct( - path='gs://a.com', - version='2.2.2', - enums=hl.Struct(), - ), - ], - ) - - mock_read_table.return_value = ht.annotate_globals(version='1.2.3') - ht = get_dataset_ht( - 'a', - ReferenceGenome.GRCh38, - ) - self.assertRaises(Exception, ht.globals.collect) - - @mock.patch.dict(f'{PATH_TO_FILE_UNDER_TEST}.CONFIG', MOCK_CONFIG) - @mock.patch(f'{PATH_TO_FILE_UNDER_TEST}.get_dataset_ht') - @mock.patch(f'{PATH_TO_FILE_UNDER_TEST}.datetime', wraps=datetime) - @mock.patch.object(ReferenceDatasetCollection, 'datasets') - def test_update_or_create_joined_ht_one_dataset( - self, - mock_reference_dataset_collection_datasets, - mock_datetime, - mock_get_dataset_ht, - ): - mock_reference_dataset_collection_datasets.return_value = ['a', 'b'] - mock_datetime.now.return_value = MOCK_DATETIME - mock_get_dataset_ht.return_value = MOCK_B_DATASET_HT - - ht = update_or_create_joined_ht( - ReferenceDatasetCollection.INTERVAL, - DatasetType.SNV_INDEL, - ReferenceGenome.GRCh38, - datasets=['b'], - joined_ht=MOCK_JOINED_REFERENCE_DATA_HT, - ) - self.assertCountEqual( - ht.collect(), - 
EXPECTED_JOINED_DATA, - ) - self.assertCountEqual(ht.globals.collect(), EXPECTED_GLOBALS) - - @mock.patch.dict(f'{PATH_TO_FILE_UNDER_TEST}.CONFIG', MOCK_CONFIG) - @mock.patch(f'{PATH_TO_FILE_UNDER_TEST}.get_dataset_ht') - @mock.patch(f'{PATH_TO_FILE_UNDER_TEST}.datetime', wraps=datetime) - @mock.patch.object(ReferenceDatasetCollection, 'datasets') - def test_update_or_create_joined_ht_all_datasets( - self, - mock_reference_dataset_collection_datasets, - mock_datetime, - mock_get_dataset_ht, - ): - mock_reference_dataset_collection_datasets.return_value = ['a', 'b'] - mock_datetime.now.return_value = MOCK_DATETIME - mock_get_dataset_ht.side_effect = [MOCK_A_DATASET_HT, MOCK_B_DATASET_HT] - - empty_ht = hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus(ReferenceGenome.GRCh38.value), - alleles=hl.tarray(hl.tstr), - ), - key=('locus', 'alleles'), - globals=hl.Struct( - paths=hl.Struct(), - versions=hl.Struct(), - enums=hl.Struct(), - ), - ) - - ht = update_or_create_joined_ht( - ReferenceDatasetCollection.COMBINED, - DatasetType.SNV_INDEL, - ReferenceGenome.GRCh38, - datasets=['a', 'b'], - joined_ht=empty_ht, - ) - self.assertCountEqual( - ht.collect(), - EXPECTED_JOINED_DATA, - ) - self.assertCountEqual(ht.globals.collect(), EXPECTED_GLOBALS) - - @mock.patch.dict(f'{PATH_TO_FILE_UNDER_TEST}.CONFIG', MOCK_CONFIG) - @mock.patch.object(ReferenceDatasetCollection, 'datasets') - def test_update_or_create_joined_ht_drop_a_dataset( - self, - mock_reference_dataset_collection_datasets, - ): - mock_reference_dataset_collection_datasets.return_value = ['b'] - ht = hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus(ReferenceGenome.GRCh38.value), - alleles=hl.tarray(hl.tstr), - c=hl.tint32, - b=hl.tint32, - ), - key=('locus', 'alleles'), - globals=hl.Struct( - paths=hl.Struct(c='abc', b='123'), - versions=hl.Struct(c='def', b='456'), - enums=hl.Struct(c=hl.Struct(d=['a', 'b'])), - ), - ) - ht = update_or_create_joined_ht( - ReferenceDatasetCollection.COMBINED, 
- DatasetType.SNV_INDEL, - ReferenceGenome.GRCh38, - datasets=['c'], - joined_ht=ht, - ) - self.assertCountEqual( - ht.globals.collect(), - [ - hl.Struct( - paths=hl.Struct(b='123'), - versions=hl.Struct(b='456'), - enums=hl.Struct(), - ), - ], - ) diff --git a/v03_pipeline/lib/reference_data/hgmd.py b/v03_pipeline/lib/reference_data/hgmd.py deleted file mode 100644 index d71825888..000000000 --- a/v03_pipeline/lib/reference_data/hgmd.py +++ /dev/null @@ -1,18 +0,0 @@ -import hail as hl - -from v03_pipeline.lib.model.definitions import ReferenceGenome - - -def download_and_import_hgmd_vcf( - hgmd_url: str, - reference_genome: ReferenceGenome, -) -> hl.Table: - mt = hl.import_vcf( - hgmd_url, - reference_genome=reference_genome.value, - force=True, - min_partitions=100, - skip_invalid_loci=True, - contig_recoding=reference_genome.contig_recoding(), - ) - return mt.rows() diff --git a/v03_pipeline/lib/reference_data/hgmd_test.py b/v03_pipeline/lib/reference_data/hgmd_test.py deleted file mode 100644 index cac7d9be5..000000000 --- a/v03_pipeline/lib/reference_data/hgmd_test.py +++ /dev/null @@ -1,12 +0,0 @@ -import unittest - -from v03_pipeline.lib.model import ReferenceGenome -from v03_pipeline.lib.reference_data.hgmd import download_and_import_hgmd_vcf - -TEST_HGMD_VCF = 'v03_pipeline/var/test/reference_data/test_hgmd.vcf' - - -class HGMDTest(unittest.TestCase): - def test_import_hgmd_vcf(self): - ht = download_and_import_hgmd_vcf(TEST_HGMD_VCF, ReferenceGenome.GRCh38) - self.assertEqual(ht.count(), 1) diff --git a/v03_pipeline/lib/reference_data/mito.py b/v03_pipeline/lib/reference_data/mito.py deleted file mode 100644 index 7df647324..000000000 --- a/v03_pipeline/lib/reference_data/mito.py +++ /dev/null @@ -1,16 +0,0 @@ -import hail as hl - -from v03_pipeline.lib.model.definitions import ReferenceGenome - - -def download_and_import_local_constraint_tsv( - url: str, - reference_genome: ReferenceGenome, -) -> hl.Table: - ht = hl.import_table(url, types={'Position': 
hl.tint32, 'MLC_score': hl.tfloat32}) - ht = ht.select( - locus=hl.locus('chrM', ht.Position, reference_genome.value), - alleles=[ht.Reference, ht.Alternate], - MLC_score=ht.MLC_score, - ) - return ht.key_by('locus', 'alleles') diff --git a/v03_pipeline/lib/reference_data/__init__.py b/v03_pipeline/lib/reference_datasets/__init__.py similarity index 100% rename from v03_pipeline/lib/reference_data/__init__.py rename to v03_pipeline/lib/reference_datasets/__init__.py diff --git a/v03_pipeline/lib/reference_datasets/clinvar.py b/v03_pipeline/lib/reference_datasets/clinvar.py new file mode 100644 index 000000000..89e70e47b --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/clinvar.py @@ -0,0 +1,172 @@ +import gzip +import shutil +import tempfile + +import hail as hl +import requests + +from v03_pipeline.lib.annotations.enums import ( + CLINVAR_ASSERTIONS, + CLINVAR_DEFAULT_PATHOGENICITY, + CLINVAR_PATHOGENICITIES, + CLINVAR_PATHOGENICITIES_LOOKUP, +) +from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.reference_datasets.misc import vcf_to_ht + +CLINVAR_GOLD_STARS_LOOKUP = hl.dict( + { + 'no_classification_for_the_single_variant': 0, + 'no_classification_provided': 0, + 'no_assertion_criteria_provided': 0, + 'no_classifications_from_unflagged_records': 0, + 'criteria_provided,_single_submitter': 1, + 'criteria_provided,_conflicting_classifications': 1, + 'criteria_provided,_multiple_submitters,_no_conflicts': 2, + 'reviewed_by_expert_panel': 3, + 'practice_guideline': 4, + }, +) +CLINVAR_SUBMISSION_SUMMARY_URL = ( + 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/submission_summary.txt.gz' +) + +ENUMS = { + 'assertion': CLINVAR_ASSERTIONS, + 'pathogenicity': CLINVAR_PATHOGENICITIES, +} + + +def parsed_clnsig(ht: hl.Table): + return ( + hl.delimit(ht.info.CLNSIG) + .replace( + 'Likely_pathogenic,_low_penetrance', + 'Likely_pathogenic|low_penetrance', + ) + .replace( + '/Pathogenic,_low_penetrance/Established_risk_allele', 
+ '/Established_risk_allele|low_penetrance', + ) + .replace( + '/Pathogenic,_low_penetrance', + '|low_penetrance', + ) + .split(r'\|') + ) + + +def parse_to_count(entry: str): + splt = entry.split( + r'\(', + ) # pattern, count = entry... if destructuring worked on a hail expression! + return hl.Struct( + pathogenicity_id=CLINVAR_PATHOGENICITIES_LOOKUP[splt[0]], + count=hl.int32(splt[1][:-1]), + ) + + +def parsed_and_mapped_clnsigconf(ht: hl.Table): + return ( + hl.delimit(ht.info.CLNSIGCONF) + .replace(',_low_penetrance', '') + .split(r'\|') + .map(parse_to_count) + .group_by(lambda x: x.pathogenicity_id) + .map_values( + lambda values: ( + values.fold( + lambda x, y: x + y.count, + 0, + ) + ), + ) + .items() + .map(lambda x: hl.Struct(pathogenicity_id=x[0], count=x[1])) + ) + + +def parse_clinvar_release_date(clinvar_url: str) -> str: + response = requests.get(clinvar_url, stream=True, timeout=10) + for byte_line in gzip.GzipFile(fileobj=response.raw): + line = byte_line.decode('ascii').strip() + if not line: + continue + if line.startswith('##fileDate='): + return line.split('=')[-1].strip() + if not line.startswith('#'): + return None + return None + + +def get_submission_summary_ht() -> hl.Table: + with tempfile.NamedTemporaryFile( + suffix='.txt.gz', + delete=False, + ) as tmp_file, requests.get( + CLINVAR_SUBMISSION_SUMMARY_URL, + stream=True, + timeout=10, + ) as r: + shutil.copyfileobj(r.raw, tmp_file) + ht = hl.import_table( + tmp_file.name, + force=True, + filter='^(#[^:]*:|^##).*$', # removes all comments except for the header line + types={ + '#VariationID': hl.tstr, + 'Submitter': hl.tstr, + 'ReportedPhenotypeInfo': hl.tstr, + }, + missing='-', + ) + ht = ht.rename({'#VariationID': 'VariationID'}) + ht = ht.select('VariationID', 'Submitter', 'ReportedPhenotypeInfo') + return ht.group_by('VariationID').aggregate( + Submitters=hl.agg.collect(ht.Submitter), + Conditions=hl.agg.collect(ht.ReportedPhenotypeInfo), + ) + + +def select_fields(ht): + clnsigs = 
parsed_clnsig(ht) + return ht.select( + alleleId=ht.info.ALLELEID, + pathogenicity=hl.if_else( + CLINVAR_PATHOGENICITIES_LOOKUP.contains(clnsigs[0]), + clnsigs[0], + CLINVAR_DEFAULT_PATHOGENICITY, + ), + assertion=hl.if_else( + CLINVAR_PATHOGENICITIES_LOOKUP.contains(clnsigs[0]), + clnsigs[1:], + clnsigs, + ), + # NB: there's a hidden enum-mapping inside this clinvar function. + conflictingPathogenicities=parsed_and_mapped_clnsigconf(ht), + goldStars=CLINVAR_GOLD_STARS_LOOKUP.get(hl.delimit(ht.info.CLNREVSTAT)), + submitters=ht.submitters, + # assumes the format 'MedGen#:condition', e.g.'C0023264:Leigh syndrome' + conditions=hl.map( + lambda p: p.split(r':')[1], + ht.conditions, + ), + ) + + +def get_ht( + clinvar_url: str, + reference_genome: ReferenceGenome, +) -> hl.Table: + with tempfile.NamedTemporaryFile( + suffix='.vcf.gz', + delete=False, + ) as tmp_file, requests.get(clinvar_url, stream=True, timeout=10) as r: + shutil.copyfileobj(r.raw, tmp_file) + ht = vcf_to_ht(tmp_file.name, reference_genome) + submitters_ht = get_submission_summary_ht() + ht = ht.annotate( + submitters=submitters_ht[ht.rsid].Submitters, + conditions=submitters_ht[ht.rsid].Conditions, + ) + return select_fields(ht) diff --git a/v03_pipeline/lib/reference_datasets/clinvar_path_variants.py b/v03_pipeline/lib/reference_datasets/clinvar_path_variants.py new file mode 100644 index 000000000..f77fa4726 --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/clinvar_path_variants.py @@ -0,0 +1,38 @@ +import hail as hl + +from v03_pipeline.lib.annotations.enums import ( + CLINVAR_PATHOGENICITIES_LOOKUP, +) + +CLINVAR_PATH_RANGE = ('Pathogenic', 'Pathogenic/Likely_risk_allele') +CLINVAR_LIKELY_PATH_RANGE = ('Pathogenic/Likely_pathogenic', 'Likely_risk_allele') + + +def get_ht( + ht: hl.Table, + *_, +) -> hl.Table: + ht = ht.select_globals() + ht = ht.select( + is_pathogenic=( + ( + ht.pathogenicity_id + >= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_PATH_RANGE[0]] + ) + & ( + ht.pathogenicity_id + 
<= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_PATH_RANGE[1]] + ) + ), + is_likely_pathogenic=( + ( + ht.pathogenicity_id + >= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_LIKELY_PATH_RANGE[0]] + ) + & ( + ht.pathogenicity_id + <= CLINVAR_PATHOGENICITIES_LOOKUP[CLINVAR_LIKELY_PATH_RANGE[1]] + ) + ), + ) + return ht.filter(ht.is_pathogenic | ht.is_likely_pathogenic) diff --git a/v03_pipeline/lib/reference_datasets/clinvar_test.py b/v03_pipeline/lib/reference_datasets/clinvar_test.py new file mode 100644 index 000000000..e62e34e2d --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/clinvar_test.py @@ -0,0 +1,171 @@ +import unittest + +import hail as hl +import responses + +from v03_pipeline.lib.annotations.enums import ( + CLINVAR_ASSERTIONS, + CLINVAR_PATHOGENICITIES, +) +from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.reference_datasets.clinvar import ( + parsed_and_mapped_clnsigconf, + parsed_clnsig, +) +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset +from v03_pipeline.lib.test.mock_clinvar_urls import mock_clinvar_urls + + +class ClinvarTest(unittest.TestCase): + @responses.activate + def test_get_clinvar_version(self): + with mock_clinvar_urls(): + self.assertEqual( + ReferenceDataset.clinvar.version(ReferenceGenome.GRCh38), + '2024-11-11', + ) + + def test_parsed_clnsig(self): + ht = hl.Table.parallelize( + [ + {'info': hl.Struct(CLNSIG=['Pathogenic|Affects'])}, + { + 'info': hl.Struct( + CLNSIG=[ + 'Pathogenic/Likely_pathogenic/Pathogenic', + '_low_penetrance', + ], + ), + }, + { + 'info': hl.Struct( + CLNSIG=[ + 'Likely_pathogenic/Pathogenic', + '_low_penetrance|association|protective', + ], + ), + }, + {'info': hl.Struct(CLNSIG=['Likely_pathogenic', '_low_penetrance'])}, + {'info': hl.Struct(CLNSIG=['association|protective'])}, + { + 'info': hl.Struct( + CLNSIG=[ + 'Pathogenic/Likely_pathogenic/Pathogenic', + '_low_penetrance/Established_risk_allele', + ], + ), + }, + ], + 
hl.tstruct(info=hl.tstruct(CLNSIG=hl.tarray(hl.tstr))), + ) + self.assertListEqual( + parsed_clnsig(ht).collect(), + [ + ['Pathogenic', 'Affects'], + ['Pathogenic/Likely_pathogenic', 'low_penetrance'], + ['Likely_pathogenic', 'low_penetrance', 'association', 'protective'], + ['Likely_pathogenic', 'low_penetrance'], + ['association', 'protective'], + [ + 'Pathogenic/Likely_pathogenic/Established_risk_allele', + 'low_penetrance', + ], + ], + ) + + def test_parsed_and_mapped_clnsigconf(self): + ht = hl.Table.parallelize( + [ + {'info': hl.Struct(CLNSIGCONF=hl.missing(hl.tarray(hl.tstr)))}, + { + 'info': hl.Struct( + CLNSIGCONF=[ + 'Pathogenic(8)|Likely_pathogenic(2)|Pathogenic', + '_low_penetrance(1)|Uncertain_significance(1)', + ], + ), + }, + ], + hl.tstruct(info=hl.tstruct(CLNSIGCONF=hl.tarray(hl.tstr))), + ) + self.assertListEqual( + parsed_and_mapped_clnsigconf(ht).collect(), + [ + None, + [ + hl.Struct(count=9, pathogenicity_id=0), + hl.Struct(count=2, pathogenicity_id=5), + hl.Struct(count=1, pathogenicity_id=12), + ], + ], + ) + + @responses.activate + def test_get_ht(self): + with mock_clinvar_urls(): + ht = ReferenceDataset.clinvar.get_ht( + ReferenceGenome.GRCh38, + ) + self.assertEqual( + ht.globals.collect()[0], + hl.Struct( + version='2024-11-11', + enums=hl.Struct( + assertion=CLINVAR_ASSERTIONS, + pathogenicity=CLINVAR_PATHOGENICITIES, + ), + ), + ) + self.assertEqual( + ht.collect()[:3], + [ + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=69134, + reference_genome='GRCh38', + ), + alleles=['A', 'G'], + alleleId=2193183, + conflictingPathogenicities=None, + goldStars=1, + submitters=None, + conditions=None, + pathogenicity_id=0, + assertion_ids=[], + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=69314, + reference_genome='GRCh38', + ), + alleles=['T', 'G'], + alleleId=3374047, + conflictingPathogenicities=None, + goldStars=1, + submitters=['Paris Brain Institute, Inserm - ICM', 'OMIM'], + conditions=[ + 'Hereditary spastic 
paraplegia 48', + 'Hereditary spastic paraplegia 48', + ], + pathogenicity_id=12, + assertion_ids=[], + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=69423, + reference_genome='GRCh38', + ), + alleles=['G', 'A'], + alleleId=3374048, + conflictingPathogenicities=None, + goldStars=1, + submitters=['OMIM'], + conditions=['Hereditary spastic paraplegia 48'], + pathogenicity_id=12, + assertion_ids=[], + ), + ], + ) diff --git a/v03_pipeline/lib/reference_datasets/dbnsfp.py b/v03_pipeline/lib/reference_datasets/dbnsfp.py new file mode 100644 index 000000000..b011cf034 --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/dbnsfp.py @@ -0,0 +1,83 @@ +import hail as hl + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.reference_datasets.misc import ( + download_zip_file, + key_by_locus_alleles, +) + +SHARED_TYPES = { + 'REVEL_score': hl.tfloat32, + 'fathmm-MKL_coding_score': hl.tfloat32, + 'MutPred_score': hl.tfloat32, + 'PrimateAI_score': hl.tfloat32, +} +TYPES = { + ReferenceGenome.GRCh37: { + **SHARED_TYPES, + 'pos(1-based)': hl.tint, + 'CADD_phred_hg19': hl.tfloat32, + }, + ReferenceGenome.GRCh38: { + **SHARED_TYPES, + 'hg19_pos(1-based)': hl.tint, + 'CADD_phred': hl.tfloat32, + }, +} + +SHARED_RENAME = { + 'fathmm-MKL_coding_score': 'fathmm_MKL_coding_score', +} +RENAME = { + ReferenceGenome.GRCh37: { + **SHARED_RENAME, + 'hg19_chr': 'chrom', + 'hg19_pos(1-based)': 'pos', + }, + ReferenceGenome.GRCh38: { + **SHARED_RENAME, + '#chr': 'chrom', + 'pos(1-based)': 'pos', + }, +} + +PREDICTOR_SCORES = { + 'REVEL_score', + 'SIFT_score', + 'Polyphen2_HVAR_score', + 'VEST4_score', + 'MPC_score', +} +PREDICTOR_FIELDS = ['MutationTaster_pred'] + + +def predictor_parse(field: hl.StringExpression) -> hl.StringExpression: + return field.split(';').find(lambda p: p != '.') + + +def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table: + types = TYPES[reference_genome] + rename = RENAME[reference_genome] + + with 
download_zip_file(path) as unzipped_dir: + ht = hl.import_table( + f'{unzipped_dir}/dbNSFP*_variant.chr*.gz', + types=types, + missing='.', + force=True, + ) + select_fields = {'ref', 'alt', *types.keys(), *rename.keys()} + ht = ht.select( + *select_fields, + **{k: hl.parse_float32(predictor_parse(ht[k])) for k in PREDICTOR_SCORES}, + **{k: predictor_parse(ht[k]) for k in PREDICTOR_FIELDS}, + ) + ht = ht.rename(**rename) + + return key_by_locus_alleles(ht, reference_genome) + + +def select(_: ReferenceGenome, dataset_type: DatasetType, ht: hl.Table) -> hl.Table: + if dataset_type == DatasetType.MITO: + return ht.select(ht.SIFT_score, ht.MutationTaster_pred_id) + return ht diff --git a/v03_pipeline/lib/reference_datasets/eigen.py b/v03_pipeline/lib/reference_datasets/eigen.py new file mode 100644 index 000000000..5e56cfdca --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/eigen.py @@ -0,0 +1,6 @@ +import hail as hl + + +def get_ht(path: str, *_) -> hl.Table: + ht = hl.read_table(path) + return ht.select(Eigen_phred=ht.info['Eigen-phred']) diff --git a/v03_pipeline/lib/reference_datasets/exac.py b/v03_pipeline/lib/reference_datasets/exac.py new file mode 100644 index 000000000..45e8e6edb --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/exac.py @@ -0,0 +1,22 @@ +import hail as hl + +from v03_pipeline.lib.misc.nested_field import parse_nested_field +from v03_pipeline.lib.model import ReferenceGenome +from v03_pipeline.lib.reference_datasets.misc import vcf_to_ht + +SELECT = { + 'AF_POPMAX': 'info.POPMAX', + 'AF': 'info.AF#', + 'AC_Adj': 'info.AC_Adj#', + 'AC_Het': 'info.AC_Het#', + 'AC_Hom': 'info.AC_Hom#', + 'AC_Hemi': 'info.AC_Hemi#', + 'AN_Adj': 'info.AN_Adj', +} + + +def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table: + ht = vcf_to_ht(path, reference_genome, split_multi=True) + return ht.select( + **{k: parse_nested_field(ht, v) for k, v in SELECT.items()}, + ) diff --git a/v03_pipeline/lib/reference_datasets/exac_test.py 
b/v03_pipeline/lib/reference_datasets/exac_test.py new file mode 100644 index 000000000..c5256c0bf --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/exac_test.py @@ -0,0 +1,54 @@ +import unittest +from unittest.mock import patch + +import hail as hl + +from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset + +EXAC_PATH = 'v03_pipeline/var/test/reference_datasets/raw/exac.vcf' + + +class ExacTest(unittest.TestCase): + def test_exac(self): + with patch.object( + ReferenceDataset, + 'path', + return_value=EXAC_PATH, + ): + ht = ReferenceDataset.exac.get_ht(ReferenceGenome.GRCh38) + self.assertEqual( + ht.collect(), + [ + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=1046973, + reference_genome='GRCh38', + ), + alleles=['G', 'A'], + AF_POPMAX=['NA', 'NFE'], + AF=1.702e-05, + AC_Adj=0, + AC_Het=0, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=27700, + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=1046973, + reference_genome='GRCh38', + ), + alleles=['G', 'T'], + AF_POPMAX=['NA', 'NFE'], + AF=1.702e-05, + AC_Adj=1, + AC_Het=1, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=27700, + ), + ], + ) diff --git a/v03_pipeline/lib/reference_data/gencode/__init__.py b/v03_pipeline/lib/reference_datasets/gencode/__init__.py similarity index 100% rename from v03_pipeline/lib/reference_data/gencode/__init__.py rename to v03_pipeline/lib/reference_datasets/gencode/__init__.py diff --git a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids.py b/v03_pipeline/lib/reference_datasets/gencode/mapping_gene_ids.py similarity index 100% rename from v03_pipeline/lib/reference_data/gencode/mapping_gene_ids.py rename to v03_pipeline/lib/reference_datasets/gencode/mapping_gene_ids.py diff --git a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py b/v03_pipeline/lib/reference_datasets/gencode/mapping_gene_ids_tests.py similarity index 97% rename from 
GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD = 0.90
TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP = hl.dict(
    hl.enumerate(TRANSCRIPT_CONSEQUENCE_TERMS, index_first=False),
)


def get_ht(
    path: str,
    reference_genome: ReferenceGenome,
) -> hl.Table:
    """Flag high-AF chromosome 1 variants as coding and/or noncoding.

    Reads the table at *path*, restricts it to chromosome 1, keeps rows whose
    first freq entry has AF above GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD,
    and classifies each row by the rank of its worst transcript consequence.
    Rows that are neither coding nor noncoding are dropped.
    """
    # GRCh38 contigs carry the 'chr' prefix; the other build does not.
    contig = '1' if reference_genome != ReferenceGenome.GRCh38 else 'chr1'
    chr1_interval = hl.parse_locus_interval(
        contig,
        reference_genome=reference_genome.value,
    )
    ht = hl.filter_intervals(hl.read_table(path), [chr1_interval])
    ht = ht.filter(ht.freq[0].AF > GNOMAD_CODING_NONCODING_HIGH_AF_THRESHOLD)

    ht = ht.annotate(
        sorted_transaction_consequences=(
            get_expr_for_vep_sorted_transcript_consequences_array(
                ht.vep,
                omit_consequences=[],
            )
        ),
    )
    ht = ht.annotate(
        main_transcript=(
            get_expr_for_worst_transcript_consequence_annotations_struct(
                ht.sorted_transaction_consequences,
            )
        ),
    )

    rank = ht.main_transcript.major_consequence_rank
    synonymous_rank = TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP['synonymous_variant']
    downstream_rank = TRANSCRIPT_CONSEQUENCE_TERM_RANK_LOOKUP[
        'downstream_gene_variant'
    ]
    ht = ht.select(
        coding=(rank <= synonymous_rank),
        noncoding=(rank >= downstream_rank),
    )
    return ht.filter(ht.coding | ht.noncoding)
def af_popmax_expression(
    ht: hl.Table,
    reference_genome: ReferenceGenome,
) -> hl.Expression:
    """Return the build-specific population-max AF expression for *ht*.

    GRCh37 tables are indexed through the 'gnomad' entry of the
    popmax_index_dict global; other builds read grpmax.AF directly.
    """
    if reference_genome != ReferenceGenome.GRCh37:
        return ht.grpmax.AF
    gnomad_idx = ht.globals.popmax_index_dict['gnomad']
    return ht.popmax[gnomad_idx].AF


def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table:
    """Build the table via the shared gnomad_utils loader, using this module's popmax expression."""
    return _get_ht(path, reference_genome, af_popmax_expression)
index 000000000..a620ab88c --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/gnomad_genomes_test.py @@ -0,0 +1,72 @@ +import unittest +from unittest.mock import patch + +import hail as hl + +from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset + +GNOMAD_GENOMES_37_PATH = ( + 'v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht' +) +GNOMAD_GENOMES_38_PATH = ( + 'v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht' +) + + +class GnomadTest(unittest.TestCase): + def test_gnomad_genomes_37(self): + with patch.object( + ReferenceDataset, + 'path', + return_value=GNOMAD_GENOMES_37_PATH, + ): + ht = ReferenceDataset.gnomad_genomes.get_ht(ReferenceGenome.GRCh37) + self.assertEqual( + ht.collect(), + [ + hl.Struct( + locus=hl.Locus( + contig='1', + position=10131, + reference_genome='GRCh37', + ), + alleles=['CT', 'C'], + AF=3.6635403375839815e-05, + AN=27296, + AC=1, + Hom=0, + AF_POPMAX_OR_GLOBAL=3.6635403375839815e-05, + FAF_AF=0.0, + Hemi=0, + ), + ], + ) + + def test_gnomad_genomes_38(self): + with patch.object( + ReferenceDataset, + 'path', + return_value=GNOMAD_GENOMES_38_PATH, + ): + ht = ReferenceDataset.gnomad_genomes.get_ht(ReferenceGenome.GRCh38) + self.assertEqual( + ht.collect(), + [ + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=10057, + reference_genome='GRCh38', + ), + alleles=['A', 'C'], + AF=2.642333674884867e-05, + AN=113536, + AC=3, + Hom=0, + AF_POPMAX_OR_GLOBAL=3.779861071961932e-05, + FAF_AF=7.019999884505523e-06, + Hemi=0, + ), + ], + ) diff --git a/v03_pipeline/lib/reference_datasets/gnomad_mito.py b/v03_pipeline/lib/reference_datasets/gnomad_mito.py new file mode 100644 index 000000000..bc5e64954 --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/gnomad_mito.py @@ -0,0 +1,14 @@ +import hail as hl + + +def get_ht(path: str, *_) -> hl.Table: + ht = hl.read_table(path) + ht = ht.select( + AN=hl.int32(ht.AN), 
def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table:
    """Import the constraint table at *path* and key it by interval.

    The 'start'/'end' columns are parsed as int32 and 'z' as float32; 'z' is
    exposed as 'z_score' on the interval-keyed result.
    """
    column_types = {
        'start': hl.tint32,
        'end': hl.tint32,
        'z': hl.tfloat32,
    }
    ht = hl.import_table(path, types=column_types, force_bgz=True)
    extra_fields = {'z_score': ht['z']}
    return select_for_interval_reference_dataset(ht, reference_genome, extra_fields)
# Source column name -> output field name.
RENAME = {
    'counts_hom': 'AC_hom',
    'counts_het': 'AC_het',
    'max_ARF': 'max_hl',
}


def get_ht(
    url: str,
    reference_genome: ReferenceGenome,
) -> hl.Table:
    """Download the TSV at *url* and return it keyed by locus and alleles.

    AN is not present in the source columns used here; it is derived from the
    homozygous count and frequency when AF_hom is positive, otherwise from the
    heteroplasmic count and frequency.

    Raises:
        requests.HTTPError: if the download returns an error status.
    """
    with requests.get(url, stream=True, timeout=10) as r:
        # Fail before creating the temp file, so an HTTP error page is never
        # written to disk and parsed as TSV.
        r.raise_for_status()
        # delete=False: the file has to outlive this block because
        # hl.import_table below reads it by name after it is closed.
        with tempfile.NamedTemporaryFile(
            suffix='.tsv',
            delete=False,
        ) as tmp_file:
            shutil.copyfileobj(r.raw, tmp_file)
    ht = hl.import_table(
        tmp_file.name,
        types={
            'counts_hom': hl.tint32,
            'counts_het': hl.tint32,
            'max_ARF': hl.tfloat32,
            'AF_het': hl.tfloat32,
            'AF_hom': hl.tfloat32,
            'alleles': hl.tarray(hl.tstr),
        },
    )
    ht = ht.rename(RENAME)
    ht = ht.select(
        *RENAME.values(),
        locus=hl.locus(
            'chrM',
            # The source 'locus' column is split on ':' and its second part is
            # used as the position.
            hl.parse_int32(ht.locus.split(':')[1]),
            reference_genome.value,
        ),
        alleles=ht.alleles,
        AN=hl.if_else(
            ht.AF_hom > 0,
            hl.int32(ht.AC_hom / ht.AF_hom),
            hl.int32(ht.AC_het / ht.AF_het),
        ),
        AF_hom=ht.AF_hom,
        AF_het=ht.AF_het,
    )
    return ht.key_by('locus', 'alleles')
def get_ht(
    url: str,
    reference_genome: ReferenceGenome,
) -> hl.Table:
    """Fetch the JSON records at *url* and return them keyed by locus and alleles.

    Each record contributes its nt_start position on the mitochondrial contig,
    its ref_rCRS/alt alleles and its disease_score (exposed as 'score').

    Raises:
        requests.HTTPError: if the download returns an error status.
    """
    # The context manager releases the connection; the status check stops an
    # error response from being handed to the JSON decoder.
    with requests.get(url, timeout=10) as response:
        response.raise_for_status()
        data = response.json()
    ht = hl.Table.parallelize(data)
    ht = ht.select(
        locus=hl.locus(
            reference_genome.mito_contig,
            ht.nt_start,
            reference_genome.value,
        ),
        alleles=hl.array([ht.ref_rCRS, ht.alt]),
        score=ht.disease_score,
    )
    return ht.key_by('locus', 'alleles')
def get_enum_select_fields(
    ht: hl.Table,
    enums: dict | None,
) -> dict[str, hl.Expression]:
    """Build expressions that replace enum-valued string fields with indices.

    For every ``field_name -> values`` entry in *enums*, a string field maps
    to ``<field_name>_id`` and an array/set of strings maps to
    ``<field_name>_ids``, each value replaced by its position in *values*.
    Fields already present in ``_id``/``_ids`` form are skipped.

    Raises:
        ValueError: if an enum names a field that is absent from *ht* in both
            its raw and its mapped form.
    """
    selects = {}
    if not enums:
        return selects

    for field_name, values in enums.items():
        if not hasattr(ht, field_name):
            already_mapped = hasattr(ht, f'{field_name}_id') or hasattr(
                ht,
                f'{field_name}_ids',
            )
            if already_mapped:
                continue
            error = f'Unused enum {field_name}'
            raise ValueError(error)

        lookup = hl.dict(
            hl.enumerate(values, index_first=False).extend(
                # NB: adding missing values here allows us to
                # hard fail if a mapped key is present and has an unexpected value
                # but propagate missing values.
                [(hl.missing(hl.tstr), hl.missing(hl.tint32))],
            ),
        )
        field = ht[field_name]
        # NB: this conditioning on type is "outside" the hail expression context.
        is_string_collection = (
            isinstance(field.dtype, hl.tarray | hl.tset)
            and field.dtype.element_type == hl.tstr
        )
        if not is_string_collection:
            selects[f'{field_name}_id'] = lookup[field]
            continue
        selects[f'{field_name}_ids'] = field.map(
            lambda x: lookup[x],  # noqa: B023
        )
    return selects
def copyfileobj(fsrc, fdst, decode_content, length=16 * 1024):
    """Copy data from file-like object fsrc to file-like object fdst.

    Unlike shutil.copyfileobj, every read forwards *decode_content* as a
    keyword argument to ``fsrc.read``. Data is moved in chunks of at most
    *length* bytes until a read returns an empty result.
    """
    while buf := fsrc.read(length, decode_content=decode_content):
        fdst.write(buf)


@contextlib.contextmanager
def download_zip_file(url, suffix='.zip', decode_content=False):
    """Download the zip archive at *url*, extract it and yield the directory.

    The archive is extracted into a fresh directory of its own, so the yielded
    path contains only this archive's members.

    Raises:
        requests.HTTPError: if the download returns an error status.
    """
    with tempfile.NamedTemporaryFile(
        suffix=suffix,
    ) as tmp_file, requests.get(url, stream=True, timeout=10) as r:
        r.raise_for_status()
        copyfileobj(r.raw, tmp_file, decode_content)
        # The archive is re-opened by name below, so buffered bytes must reach
        # the disk first; a zip's central directory sits at the end of the file.
        tmp_file.flush()
        # NOTE(review): the directory is deliberately not removed on exit --
        # callers appear to build lazily-evaluated Hail tables over the
        # extracted files and use them after leaving this context; confirm
        # before adding cleanup.
        extract_dir = tempfile.mkdtemp()
        with zipfile.ZipFile(tmp_file.name, 'r') as zipf:
            zipf.extractall(extract_dir)
        yield extract_dir
+1,76 @@ +import unittest + +import hail as hl + +from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.reference_datasets.misc import get_enum_select_fields, vcf_to_ht + +EXAC_PATH = 'v03_pipeline/var/test/reference_datasets/raw/exac.vcf' + + +class MiscTest(unittest.TestCase): + def test_get_enum_select_fields(self): + ht = hl.Table.parallelize( + [ + {'variant': ['1', '2'], 'sv_type': 'a', 'sample_fix': '1'}, + { + 'variant': ['1', '3', '2'], + 'sv_type': 'b', + 'sample_fix': '2', + }, + {'variant': ['1', '3'], 'sv_type': 'c', 'sample_fix': '3'}, + {'variant': ['4'], 'sv_type': 'd', 'sample_fix': '4'}, + ], + hl.tstruct( + variant=hl.dtype('array'), + sv_type=hl.dtype('str'), + sample_fix=hl.dtype('str'), + ), + ) + enum_select_fields = get_enum_select_fields( + ht, + { + 'variant': ['1', '2', '3', '4'], + 'sv_type': ['a', 'b', 'c', 'd'], + }, + ) + mapped_ht = ht.transmute(**enum_select_fields) + self.assertListEqual( + mapped_ht.collect(), + [ + hl.Struct(variant_ids=[0, 1], sv_type_id=0, sample_fix='1'), + hl.Struct(variant_ids=[0, 2, 1], sv_type_id=1, sample_fix='2'), + hl.Struct(variant_ids=[0, 2], sv_type_id=2, sample_fix='3'), + hl.Struct(variant_ids=[3], sv_type_id=3, sample_fix='4'), + ], + ) + + mapped_enum_select_fields = get_enum_select_fields( + mapped_ht, + { + 'variant': ['1', '2', '3', '4'], + 'sv_type': ['a', 'b', 'c', 'd'], + }, + ) + self.assertDictEqual(mapped_enum_select_fields, {}) + + enum_select_fields = get_enum_select_fields( + ht, + {'sv_type': ['d']}, + ) + mapped_ht = ht.select(**enum_select_fields) + self.assertRaises(Exception, mapped_ht.collect) + + with self.assertRaises(ValueError) as cm: + get_enum_select_fields(ht, {'variant_renamed': ['1', '2', '3', '4']}) + self.assertEqual(str(cm.exception), 'Unused enum variant_renamed') + + self.assertDictEqual(get_enum_select_fields(ht, None), {}) + + def test_vcf_to_ht_throw_multiallelic(self): + self.assertRaises( + ValueError, + vcf_to_ht, + EXAC_PATH, 
def get_ht(
    url: str,
    reference_genome: ReferenceGenome,
) -> hl.Table:
    """Download the zipped table at *url* and return it keyed by locus and alleles.

    The member to import is named after the last path component of *url* with
    its '.zip' suffix removed. Start, Ref/Alt and APOGEE2_score become the
    locus position, the alleles and 'score' respectively.
    """
    extracted_filename = url.removesuffix('.zip').split('/')[-1]
    with download_zip_file(url, suffix='.txt.zip') as unzipped_dir:
        ht = hl.import_table(
            os.path.join(
                unzipped_dir,
                extracted_filename,
            ),
        )
    ht = ht.select(
        # Pass the genome name (.value), as the sibling mito loaders do,
        # rather than the enum member itself.
        locus=hl.locus('chrM', hl.parse_int32(ht.Start), reference_genome.value),
        alleles=[ht.Ref, ht.Alt],
        score=hl.parse_float32(ht.APOGEE2_score),
    )
    return ht.key_by('locus', 'alleles')
hl + +from v03_pipeline.lib.model import ReferenceGenome +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset + +TEST_MITOMAP_CSV = 'v03_pipeline/var/test/reference_datasets/raw/test_mitomap.csv' + + +class MitomapTest(unittest.TestCase): + def test_mitomap(self): + with patch.object( + ReferenceDataset, + 'path', + return_value=TEST_MITOMAP_CSV, + ): + ht = ReferenceDataset.mitomap.get_ht(ReferenceGenome.GRCh38) + self.assertEqual( + ht.collect(), + [ + hl.Struct( + locus=hl.Locus( + contig='chrM', + position=583, + reference_genome='GRCh38', + ), + alleles=['G', 'A'], + pathogenic=True, + ), + hl.Struct( + locus=hl.Locus( + contig='chrM', + position=591, + reference_genome='GRCh38', + ), + alleles=['C', 'T'], + pathogenic=True, + ), + hl.Struct( + locus=hl.Locus( + contig='chrM', + position=616, + reference_genome='GRCh38', + ), + alleles=['T', 'C'], + pathogenic=True, + ), + ], + ) diff --git a/v03_pipeline/lib/reference_data/queries.py b/v03_pipeline/lib/reference_datasets/queries.py similarity index 100% rename from v03_pipeline/lib/reference_data/queries.py rename to v03_pipeline/lib/reference_datasets/queries.py diff --git a/v03_pipeline/lib/reference_datasets/reference_dataset.py b/v03_pipeline/lib/reference_datasets/reference_dataset.py new file mode 100644 index 000000000..bc5d42f0f --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/reference_dataset.py @@ -0,0 +1,412 @@ +import importlib +import types +from collections.abc import Callable +from enum import Enum +from typing import Union + +import hail as hl + +from v03_pipeline.lib.model import AccessControl, DatasetType, Env, ReferenceGenome +from v03_pipeline.lib.reference_datasets import clinvar, dbnsfp +from v03_pipeline.lib.reference_datasets.misc import ( + filter_contigs, + filter_mito_contigs, + get_enum_select_fields, +) + +DATASET_TYPES = 'dataset_types' +ENUMS = 'enums' +EXCLUDE_FROM_ANNOTATIONS = 'exclude_from_annotations' +FILTER = 'filter' +IS_INTERVAL = 
class BaseReferenceDataset:
    """Behaviour shared by the reference dataset enums, driven by the module-level CONFIG mapping."""

    @classmethod
    def for_reference_genome_dataset_type(
        cls,
        reference_genome: ReferenceGenome,
        dataset_type: DatasetType,
    ) -> set[Union['ReferenceDataset', 'ReferenceDatasetQuery']]:
        """Return the datasets configured for this genome and dataset type.

        Non-public datasets are dropped unless
        Env.ACCESS_PRIVATE_REFERENCE_DATASETS is truthy.
        """
        configured = {
            dataset
            for dataset, config in CONFIG.items()
            if dataset_type in config.get(reference_genome, {}).get(DATASET_TYPES, [])
        }
        if Env.ACCESS_PRIVATE_REFERENCE_DATASETS:
            return configured
        return {
            dataset
            for dataset in configured
            if dataset.access_control == AccessControl.PUBLIC
        }

    @classmethod
    def for_reference_genome_dataset_type_annotations(
        cls,
        reference_genome: ReferenceGenome,
        dataset_type: DatasetType,
    ) -> set['ReferenceDataset']:
        """Same as for_reference_genome_dataset_type, minus datasets flagged EXCLUDE_FROM_ANNOTATIONS."""
        available = cls.for_reference_genome_dataset_type(
            reference_genome,
            dataset_type,
        )
        return {
            dataset
            for dataset in available
            if not CONFIG[dataset].get(EXCLUDE_FROM_ANNOTATIONS, False)
        }

    @property
    def is_keyed_by_interval(self) -> bool:
        """Whether CONFIG marks this dataset as interval-keyed (default False)."""
        return CONFIG[self].get(IS_INTERVAL, False)

    @property
    def access_control(self) -> AccessControl:
        """PRIVATE for hgmd, PUBLIC for every other dataset."""
        is_private = self == ReferenceDataset.hgmd
        return AccessControl.PRIVATE if is_private else AccessControl.PUBLIC

    def version(self, reference_genome: ReferenceGenome) -> str:
        """Return the configured version, calling it with the dataset path when it is a function."""
        configured = CONFIG[self][reference_genome][VERSION]
        if not isinstance(configured, types.FunctionType):
            return configured
        return configured(
            self.path(reference_genome),
        )

    @property
    def enums(self) -> dict | None:
        """The ENUMS entry of this dataset's config, or None when absent."""
        return CONFIG[self].get(ENUMS)

    @property
    def enum_globals(self) -> hl.Struct:
        """The enums as an hl.Struct, or a missing struct value when there are none."""
        if not self.enums:
            return hl.missing(hl.tstruct(hl.tstr, hl.tarray(hl.tstr)))
        return hl.Struct(**self.enums)

    @property
    def filter(  # noqa: A003
        self,
    ) -> Callable[[ReferenceGenome, DatasetType, hl.Table], hl.Table] | None:
        """The FILTER callable from this dataset's config, if any."""
        return CONFIG[self].get(FILTER)

    @property
    def select(
        self,
    ) -> Callable[[ReferenceGenome, DatasetType, hl.Table], hl.Table] | None:
        """The SELECT callable from this dataset's config, if any."""
        return CONFIG[self].get(SELECT)

    def path(self, reference_genome: ReferenceGenome) -> str | list[str]:
        """The configured source path(s) for *reference_genome*."""
        return CONFIG[self][reference_genome][PATH]

    def get_ht(
        self,
        reference_genome: ReferenceGenome,
    ) -> hl.Table:
        """Load this dataset through the module named after it.

        The module's get_ht result has its enum fields mapped to ids, is
        filtered by filter_contigs, and is annotated with the version and
        enums globals.
        """
        dataset_module = importlib.import_module(
            f'v03_pipeline.lib.reference_datasets.{self.name}',
        )
        source_path = self.path(reference_genome)
        ht = dataset_module.get_ht(source_path, reference_genome)
        if enum_selects := get_enum_select_fields(ht, self.enums):
            ht = ht.transmute(**enum_selects)
        ht = filter_contigs(ht, reference_genome)
        # NB: we do not filter with "filter" here
        # ReferenceDatasets are DatasetType agnostic and that
        # filter is only used at annotation time.
        return ht.annotate_globals(
            version=self.version(reference_genome),
            enums=self.enum_globals,
        )
f'v03_pipeline.lib.reference_datasets.{self.name}', + ) + ht = module.get_ht(reference_dataset_ht) + if self.filter: + ht = self.filter(reference_genome, dataset_type, ht) + return ht.annotate_globals( + version=self.version(reference_genome), + ) + + +CONFIG = { + ReferenceDataset.dbnsfp: { + ENUMS: { + 'MutationTaster_pred': ['D', 'A', 'N', 'P'], + }, + FILTER: filter_mito_contigs, + SELECT: dbnsfp.select, + ReferenceGenome.GRCh37: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'https://dbnsfp.s3.amazonaws.com/dbNSFP4.7a.zip', + }, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL, DatasetType.MITO]), + VERSION: '1.0', + PATH: 'https://dbnsfp.s3.amazonaws.com/dbNSFP4.7a.zip', + }, + }, + ReferenceDataset.eigen: { + ReferenceGenome.GRCh37: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + # NB: The download link on the Eigen website (http://www.columbia.edu/~ii2135/download.html) is broken + # as of 11/15/24 so we will host the data + PATH: 'gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', + }, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://seqr-reference-data/GRCh38/eigen/EIGEN_coding_noncoding.liftover_grch38.ht', + }, + }, + ReferenceDataset.clinvar: { + ENUMS: clinvar.ENUMS, + FILTER: filter_mito_contigs, + ReferenceGenome.GRCh37: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: clinvar.parse_clinvar_release_date, + PATH: 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + }, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL, DatasetType.MITO]), + VERSION: clinvar.parse_clinvar_release_date, + PATH: 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', + }, + }, + ReferenceDataset.exac: { + ReferenceGenome.GRCh37: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 
'gs://gcp-public-data--gnomad/legacy/exacv1_downloads/release1/ExAC.r1.sites.vep.vcf.gz', + }, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + # NB: Exac is only available on GRCh37 so we host a lifted over version + PATH: 'gs://seqr-reference-data/GRCh38/gnomad/ExAC.r1.sites.liftover.b38.vcf.gz', + }, + }, + ReferenceDataset.helix_mito: { + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.MITO]), + VERSION: '1.0', + PATH: 'https://helix-research-public.s3.amazonaws.com/mito/HelixMTdb_20200327.tsv', + }, + }, + ReferenceDataset.splice_ai: { + ENUMS: { + 'splice_consequence': [ + 'Acceptor gain', + 'Acceptor loss', + 'Donor gain', + 'Donor loss', + 'No consequence', + ], + }, + ReferenceGenome.GRCh37: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: [ + 'gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.masked.snv.hg19.vcf.gz', + 'gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.masked.indel.hg19.vcf.gz', + ], + }, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + # NB: SpliceAI data is only available to download for authenticated Illumina users, so we will host the data + PATH: [ + 'gs://seqr-reference-data/GRCh38/spliceai/spliceai_scores.masked.snv.hg38.vcf.gz', + 'gs://seqr-reference-data/GRCh38/spliceai/spliceai_scores.masked.indel.hg38.vcf.gz', + ], + }, + }, + ReferenceDataset.topmed: { + ReferenceGenome.GRCh37: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.vcf.gz', + }, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + # NB: TopMed data is available to download via https://legacy.bravo.sph.umich.edu/freeze8/hg38/downloads/vcf/ + # However, users must be authenticated and accept TOS to access it so for now we will host a 
copy of the data + PATH: 'gs://seqr-reference-data/GRCh38/TopMed/bravo-dbsnp-all.vcf.gz', + }, + }, + ReferenceDataset.hmtvar: { + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.MITO]), + VERSION: '1.0', + # NB: https://www.hmtvar.uniba.it is unavailable as of 11/15/24 so we will host the data + PATH: 'https://storage.googleapis.com/seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.json', + }, + }, + ReferenceDataset.mitimpact: { + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.MITO]), + VERSION: '1.0', + PATH: 'https://mitimpact.css-mendel.it/cdn/MitImpact_db_3.1.3.txt.zip', + }, + }, + ReferenceDataset.hgmd: { + ENUMS: {'class': ['DM', 'DM?', 'DP', 'DFP', 'FP', 'R']}, + ReferenceGenome.GRCh37: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://seqr-reference-data-private/GRCh37/HGMD/HGMD_Pro_2023.1_hg19.vcf.gz', + }, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://seqr-reference-data-private/GRCh38/HGMD/HGMD_Pro_2023.1_hg38.vcf.gz', + }, + }, + ReferenceDataset.gnomad_exomes: { + ReferenceGenome.GRCh37: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://gcp-public-data--gnomad/release/2.1.1/ht/exomes/gnomad.exomes.r2.1.1.sites.ht', + }, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht', + }, + }, + ReferenceDataset.gnomad_genomes: { + ReferenceGenome.GRCh37: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://gcp-public-data--gnomad/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht', + }, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht', + }, + }, + 
ReferenceDataset.gnomad_qc: { + EXCLUDE_FROM_ANNOTATIONS: True, + ReferenceGenome.GRCh37: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://seqr-reference-data/gnomad_qc/GRCh37/gnomad.joint.high_callrate_common_biallelic_snps.pruned.mt', + }, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://gcp-public-data--gnomad/release/4.0/pca/gnomad.v4.0.pca_loadings.ht', + }, + }, + ReferenceDataset.mitomap: { + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.MITO]), + VERSION: '1.0', + # Downloaded via https://www.mitomap.org/foswiki/bin/view/MITOMAP/ConfirmedMutations + PATH: 'gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap_confirmed_mutations_nov_2024.csv', + }, + }, + ReferenceDataset.gnomad_mito: { + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.MITO]), + VERSION: '1.0', + PATH: 'gs://gcp-public-data--gnomad/release/3.1/ht/genomes/gnomad.genomes.v3.1.sites.chrM.ht', + }, + }, + ReferenceDataset.gnomad_non_coding_constraint: { + IS_INTERVAL: True, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'gs://gcp-public-data--gnomad/release/3.1/secondary_analyses/genomic_constraint/constraint_z_genome_1kb.qc.download.txt.gz', + }, + }, + ReferenceDataset.screen: { + ENUMS: { + 'region_type': [ + 'CTCF-bound', + 'CTCF-only', + 'DNase-H3K4me3', + 'PLS', + 'dELS', + 'pELS', + 'DNase-only', + 'low-DNase', + ], + }, + IS_INTERVAL: True, + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.SNV_INDEL]), + VERSION: '1.0', + PATH: 'https://downloads.wenglab.org/V3/GRCh38-cCREs.bed', + }, + }, + ReferenceDataset.local_constraint_mito: { + ReferenceGenome.GRCh38: { + DATASET_TYPES: frozenset([DatasetType.MITO]), + VERSION: '1.0', + PATH: 'https://www.biorxiv.org/content/biorxiv/early/2023/01/27/2022.12.16.520778/DC3/embed/media-3.zip', + }, + }, +} 
+CONFIG[ReferenceDatasetQuery.clinvar_path_variants] = { + EXCLUDE_FROM_ANNOTATIONS: True, + **CONFIG[ReferenceDataset.clinvar], +} +CONFIG[ReferenceDataset.gnomad_coding_and_noncoding] = { + EXCLUDE_FROM_ANNOTATIONS: True, + **CONFIG[ReferenceDataset.gnomad_genomes], +} +CONFIG[ReferenceDatasetQuery.high_af_variants] = { + EXCLUDE_FROM_ANNOTATIONS: True, + **CONFIG[ReferenceDataset.gnomad_genomes], +} diff --git a/v03_pipeline/lib/reference_datasets/screen.py b/v03_pipeline/lib/reference_datasets/screen.py new file mode 100644 index 000000000..4960e7adf --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/screen.py @@ -0,0 +1,38 @@ +import shutil +import tempfile + +import hail as hl +import requests + +from v03_pipeline.lib.model import ReferenceGenome +from v03_pipeline.lib.reference_datasets.misc import ( + select_for_interval_reference_dataset, +) + + +def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table: + with tempfile.NamedTemporaryFile( + suffix='.bed', + delete=False, + ) as tmp_file, requests.get( + path, + stream=True, + timeout=10, + ) as r: + shutil.copyfileobj(r.raw, tmp_file) + ht = hl.import_table( + tmp_file.name, + no_header=True, + types={ + 'f1': hl.tint32, + 'f2': hl.tint32, + }, + ) + return select_for_interval_reference_dataset( + ht, + reference_genome, + {'region_type': ht['f5'].split(',')}, + chrom_field='f0', + start_field='f1', + end_field='f2', + ) diff --git a/v03_pipeline/lib/reference_datasets/splice_ai.py b/v03_pipeline/lib/reference_datasets/splice_ai.py new file mode 100644 index 000000000..e0e3f6db1 --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/splice_ai.py @@ -0,0 +1,33 @@ +import hail as hl + +from v03_pipeline.lib.model import ReferenceGenome +from v03_pipeline.lib.reference_datasets.misc import vcf_to_ht + + +def get_ht( + paths: list[str], + reference_genome: ReferenceGenome, +) -> hl.Table: + ht = vcf_to_ht(paths, reference_genome) + + # SpliceAI INFO field description from the VCF header: 
SpliceAIv1.3 variant annotation. These include + # delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and + # donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL + ds_start_index = 2 + ds_end_index = 6 + num_delta_scores = ds_end_index - ds_start_index + ht = ht.select( + delta_scores=ht.info.SpliceAI[0] + .split(delim='\\|')[ds_start_index:ds_end_index] + .map(hl.float32), + ) + ht = ht.annotate(delta_score=hl.max(ht.delta_scores)) + return ht.annotate( + splice_consequence_id=hl.if_else( + ht.delta_score > 0, + # Splice Consequence enum ID is the index of the max score + ht.delta_scores.index(ht.delta_score), + # If no score, use the last index for "No Consequence" + num_delta_scores, + ), + ).drop('delta_scores') diff --git a/v03_pipeline/lib/reference_datasets/topmed.py b/v03_pipeline/lib/reference_datasets/topmed.py new file mode 100644 index 000000000..4e0fddf11 --- /dev/null +++ b/v03_pipeline/lib/reference_datasets/topmed.py @@ -0,0 +1,20 @@ +import hail as hl + +from v03_pipeline.lib.misc.nested_field import parse_nested_field +from v03_pipeline.lib.model import ReferenceGenome +from v03_pipeline.lib.reference_datasets.misc import vcf_to_ht + +SELECT = { + 'AC': 'info.AC#', + 'AF': 'info.AF#', + 'AN': 'info.AN', + 'Hom': 'info.Hom#', + 'Het': 'info.Het#', +} + + +def get_ht(path: str, reference_genome: ReferenceGenome) -> hl.Table: + ht = vcf_to_ht(path, reference_genome) + return ht.select( + **{k: parse_nested_field(ht, v) for k, v in SELECT.items()}, + ) diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index 31c718034..cca50e609 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -2,30 +2,25 @@ import luigi from v03_pipeline.lib.annotations.misc import 
annotate_enums -from v03_pipeline.lib.annotations.rdc_dependencies import ( - get_rdc_annotation_dependencies, -) -from v03_pipeline.lib.model import ( - ReferenceDatasetCollection, -) from v03_pipeline.lib.paths import ( + valid_reference_dataset_path, variant_annotations_table_path, ) +from v03_pipeline.lib.reference_datasets.reference_dataset import ( + BaseReferenceDataset, + ReferenceDatasetQuery, +) from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget -from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import ( - UpdateCachedReferenceDatasetQueries, +from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset import ( + UpdatedReferenceDatasetTask, ) -from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import ( - UpdatedReferenceDatasetCollectionTask, +from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_query import ( + UpdatedReferenceDatasetQueryTask, ) class BaseUpdateVariantAnnotationsTableTask(BaseUpdateTask): - @property - def rdc_annotation_dependencies(self) -> dict[str, hl.Table]: - return get_rdc_annotation_dependencies(self.dataset_type, self.reference_genome) - def output(self) -> luigi.Target: return GCSorLocalTarget( variant_annotations_table_path( @@ -35,20 +30,26 @@ def output(self) -> luigi.Target: ) def requires(self) -> list[luigi.Task]: - requirements = [ - self.clone(UpdateCachedReferenceDatasetQueries), - ] - requirements.extend( - self.clone( - UpdatedReferenceDatasetCollectionTask, - reference_dataset_collection=rdc, - ) - for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type( - self.reference_genome, - self.dataset_type, - ) - ) - return requirements + reqs = [] + for reference_dataset in BaseReferenceDataset.for_reference_genome_dataset_type( + self.reference_genome, + self.dataset_type, + ): + if isinstance(reference_dataset, ReferenceDatasetQuery): + 
reqs.append( + self.clone( + UpdatedReferenceDatasetQueryTask, + reference_dataset_query=reference_dataset, + ), + ) + else: + reqs.append( + self.clone( + UpdatedReferenceDatasetTask, + reference_dataset=reference_dataset, + ), + ) + return reqs def initialize_table(self) -> hl.Table: key_type = self.dataset_type.table_key_type(self.reference_genome) @@ -57,7 +58,6 @@ def initialize_table(self) -> hl.Table: key_type, key=key_type.fields, globals=hl.Struct( - paths=hl.Struct(), versions=hl.Struct(), enums=hl.Struct(), updates=hl.empty_set( @@ -79,28 +79,27 @@ def annotate_globals( ht: hl.Table, ) -> hl.Table: ht = ht.annotate_globals( - paths=hl.Struct(), versions=hl.Struct(), enums=hl.Struct(), ) - for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type( + for ( + reference_dataset + ) in BaseReferenceDataset.for_reference_genome_dataset_type_annotations( self.reference_genome, self.dataset_type, ): - rdc_ht = self.rdc_annotation_dependencies[f'{rdc.value}_ht'] - rdc_globals = rdc_ht.index_globals() + rd_ht = hl.read_table( + valid_reference_dataset_path(self.reference_genome, reference_dataset), + ) + rd_ht_globals = rd_ht.index_globals() ht = ht.select_globals( - paths=hl.Struct( - **ht.globals.paths, - **rdc_globals.paths, - ), versions=hl.Struct( **ht.globals.versions, - **rdc_globals.versions, + **{reference_dataset.name: rd_ht_globals.version}, ), enums=hl.Struct( **ht.globals.enums, - **rdc_globals.enums, + **{reference_dataset.name: rd_ht_globals.enums}, ), updates=ht.globals.updates, migrations=ht.globals.migrations, diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py index ce7747768..1b50875e3 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table_test.py @@ -1,83 +1,55 @@ -import shutil -from unittest.mock import patch - 
import hail as hl import luigi.worker +import responses from v03_pipeline.lib.model import ( DatasetType, - ReferenceDatasetCollection, ReferenceGenome, ) -from v03_pipeline.lib.paths import valid_reference_dataset_collection_path +from v03_pipeline.lib.paths import valid_reference_dataset_query_path +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDatasetQuery from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( BaseUpdateVariantAnnotationsTableTask, ) from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget -from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask -from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase - -TEST_COMBINED_1 = 'v03_pipeline/var/test/reference_data/test_combined_1.ht' -TEST_HGMD_1 = 'v03_pipeline/var/test/reference_data/test_hgmd_1.ht' -TEST_INTERVAL_1 = 'v03_pipeline/var/test/reference_data/test_interval_1.ht' - +from v03_pipeline.lib.test.mock_clinvar_urls import mock_clinvar_urls +from v03_pipeline.lib.test.mocked_reference_datasets_testcase import ( + MockedReferenceDatasetsTestCase, +) -class BaseVariantAnnotationsTableTest(MockedDatarootTestCase): - def setUp(self) -> None: - super().setUp() - shutil.copytree( - TEST_COMBINED_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.COMBINED, - ), - ) - shutil.copytree( - TEST_HGMD_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.HGMD, - ), - ) - shutil.copytree( - TEST_INTERVAL_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.INTERVAL, - ), - ) - @patch( - 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdatedReferenceDatasetCollectionTask', - ) - @patch( - 
'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdateCachedReferenceDatasetQueries', - ) +class BaseVariantAnnotationsTableTest(MockedReferenceDatasetsTestCase): + @responses.activate def test_should_create_initialized_table( self, - mock_update_crdqs_task, - mock_update_rdc_task, ) -> None: - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() - vat_task = BaseUpdateVariantAnnotationsTableTask( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - ) - self.assertTrue('annotations.ht' in vat_task.output().path) - self.assertTrue(DatasetType.SNV_INDEL.value in vat_task.output().path) - self.assertFalse(vat_task.output().exists()) - self.assertFalse(vat_task.complete()) - - worker = luigi.worker.Worker() - worker.add(vat_task) - worker.run() - self.assertTrue(GCSorLocalFolderTarget(vat_task.output().path).exists()) - self.assertTrue(vat_task.complete()) - - ht = hl.read_table(vat_task.output().path) - self.assertEqual(ht.count(), 0) - self.assertEqual(list(ht.key.keys()), ['locus', 'alleles']) + with mock_clinvar_urls(ReferenceGenome.GRCh38): + vat_task = BaseUpdateVariantAnnotationsTableTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + ) + self.assertTrue('annotations.ht' in vat_task.output().path) + self.assertFalse(vat_task.output().exists()) + self.assertFalse(vat_task.complete()) + + worker = luigi.worker.Worker() + worker.add(vat_task) + worker.run() + self.assertTrue(GCSorLocalFolderTarget(vat_task.output().path).exists()) + self.assertTrue(vat_task.complete()) + + ht = hl.read_table(vat_task.output().path) + self.assertEqual(ht.count(), 0) + self.assertEqual(list(ht.key.keys()), ['locus', 'alleles']) + self.assertEqual( + hl.eval( + hl.read_table( + valid_reference_dataset_query_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + ReferenceDatasetQuery.clinvar_path_variants, + ), + ).globals.version, + ), 
+ '2024-11-11', + ) diff --git a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py deleted file mode 100644 index dc9c2a17e..000000000 --- a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py +++ /dev/null @@ -1,37 +0,0 @@ -import luigi -import luigi.util - -from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, -) -from v03_pipeline.lib.tasks.base.base_loading_run_params import ( - BaseLoadingRunParams, -) -from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import ( - UpdatedCachedReferenceDatasetQuery, -) - - -@luigi.util.inherits(BaseLoadingRunParams) -class UpdateCachedReferenceDatasetQueries(luigi.Task): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.checked_for_tasks = False - self.dynamic_crdq_tasks = set() - - def complete(self) -> bool: - return self.checked_for_tasks - - def run(self): - self.checked_for_tasks = True - for crdq in CachedReferenceDatasetQuery.for_reference_genome_dataset_type( - self.reference_genome, - self.dataset_type, - ): - self.dynamic_crdq_tasks.add( - UpdatedCachedReferenceDatasetQuery( - **self.param_kwargs, - crdq=crdq, - ), - ) - yield self.dynamic_crdq_tasks diff --git a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py deleted file mode 100644 index d6bf33d36..000000000 --- a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py +++ /dev/null @@ -1,124 +0,0 @@ -import unittest -from unittest import mock - -import luigi - -from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, - DatasetType, - ReferenceGenome, - SampleType, -) -from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import ( - 
UpdateCachedReferenceDatasetQueries, -) -from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask - - -@mock.patch( - 'v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries.UpdatedCachedReferenceDatasetQuery', -) -class UpdateCachedReferenceDatasetQueriesTest(unittest.TestCase): - def test_37_snv_indel(self, mock_crdq_task): - mock_crdq_task.return_value = MockCompleteTask() - worker = luigi.worker.Worker() - kwargs = { - 'sample_type': SampleType.WGS, - 'callset_path': '', - 'project_guids': [], - 'project_remap_paths': [], - 'project_pedigree_paths': [], - 'skip_validation': True, - 'run_id': '1', - } - task = UpdateCachedReferenceDatasetQueries( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - **kwargs, - ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) - call_args_list = mock_crdq_task.call_args_list - self.assertEqual(len(call_args_list), 4) - self.assertEqual( - [x.kwargs['crdq'] for x in call_args_list], - list(CachedReferenceDatasetQuery), - ) - - def test_38_snv_indel(self, mock_crdq_task): - mock_crdq_task.return_value = MockCompleteTask() - worker = luigi.worker.Worker() - kwargs = { - 'sample_type': SampleType.WGS, - 'callset_path': '', - 'project_guids': [], - 'project_remap_paths': [], - 'project_pedigree_paths': [], - 'skip_validation': True, - 'run_id': '2', - } - task = UpdateCachedReferenceDatasetQueries( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - **kwargs, - ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) - call_args_list = mock_crdq_task.call_args_list - self.assertEqual(len(call_args_list), 4) - self.assertEqual( - [x.kwargs['crdq'] for x in call_args_list], - list(CachedReferenceDatasetQuery), - ) - - def test_38_mito(self, mock_crdq_task): - mock_crdq_task.return_value = MockCompleteTask() - worker = luigi.worker.Worker() - kwargs = { - 'sample_type': SampleType.WGS, - 'callset_path': '', 
- 'project_guids': [], - 'project_remap_paths': [], - 'project_pedigree_paths': [], - 'skip_validation': True, - 'run_id': '3', - } - task = UpdateCachedReferenceDatasetQueries( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.MITO, - **kwargs, - ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) - call_args_list = mock_crdq_task.call_args_list - self.assertEqual(len(call_args_list), 1) - self.assertEqual( - next(x.kwargs['crdq'] for x in call_args_list), - CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - ) - - def test_38_sv(self, mock_crdq_task): - mock_crdq_task.return_value = MockCompleteTask() - worker = luigi.worker.Worker() - kwargs = { - 'sample_type': SampleType.WGS, - 'callset_path': '', - 'project_guids': [], - 'project_remap_paths': [], - 'project_pedigree_paths': [], - 'skip_validation': True, - 'run_id': '4', - } - task = UpdateCachedReferenceDatasetQueries( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SV, - **kwargs, - ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) - # assert no crdq tasks for this reference genome and dataset type - mock_crdq_task.assert_has_calls([]) diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py index e505c01f7..002eb8d62 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py @@ -3,15 +3,13 @@ from v03_pipeline.lib.annotations.fields import get_fields from v03_pipeline.lib.logger import get_logger -from v03_pipeline.lib.model import ReferenceDatasetCollection -from v03_pipeline.lib.reference_data.compare_globals import ( - Globals, - clinvar_versions_equal, - get_datasets_to_update, +from 
v03_pipeline.lib.paths import valid_reference_dataset_path +from v03_pipeline.lib.reference_datasets.reference_dataset import ( + BaseReferenceDataset, + ReferenceDataset, ) -from v03_pipeline.lib.reference_data.config import CONFIG -from v03_pipeline.lib.tasks.base.base_loading_run_params import ( - BaseLoadingRunParams, +from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( + BaseLoadingPipelineParams, ) from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( BaseUpdateVariantAnnotationsTableTask, @@ -20,94 +18,89 @@ logger = get_logger(__name__) -@luigi.util.inherits(BaseLoadingRunParams) +@luigi.util.inherits(BaseLoadingPipelineParams) class UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( BaseUpdateVariantAnnotationsTableTask, ): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._datasets_to_update = [] - - @property - def reference_dataset_collections(self) -> list[ReferenceDatasetCollection]: - return ReferenceDatasetCollection.for_reference_genome_dataset_type( - self.reference_genome, - self.dataset_type, - ) + self._datasets_to_update: set[str] = set() def complete(self) -> bool: - logger.info( - 'Checking if UpdateVariantAnnotationsTableWithUpdatedReferenceDataset is complete', - ) - self._datasets_to_update = [] - + reference_dataset_names = { + rd.name + for rd in BaseReferenceDataset.for_reference_genome_dataset_type_annotations( + self.reference_genome, + self.dataset_type, + ) + } if not super().complete(): - for rdc in self.reference_dataset_collections: - self._datasets_to_update.extend( - rdc.datasets( - self.dataset_type, - ), - ) + self._datasets_to_update = reference_dataset_names return False - - datasets_to_check = [ - dataset - for rdc in self.reference_dataset_collections - for dataset in rdc.datasets(self.dataset_type) - ] - - if any( - 'clinvar' in d for d in datasets_to_check - ) and not clinvar_versions_equal( - hl.read_table(self.output().path), - 
self.reference_genome, - self.dataset_type, - ): - datasets_to_check.remove('clinvar') - self._datasets_to_update.append('clinvar') - - annotations_ht_globals = Globals.from_ht( - hl.read_table(self.output().path), - datasets_to_check, + # Find datasets with mismatched versions + annotation_ht_versions = dict( + hl.eval(hl.read_table(self.output().path).globals.versions), ) - rdc_ht_globals = Globals.from_dataset_configs( - self.reference_genome, - datasets_to_check, + self._datasets_to_update = ( + reference_dataset_names ^ annotation_ht_versions.keys() ) - self._datasets_to_update.extend( - get_datasets_to_update( - annotations_ht_globals, - rdc_ht_globals, - ), + for dataset_name in reference_dataset_names & annotation_ht_versions.keys(): + if ( + ReferenceDataset(dataset_name).version(self.reference_genome) + != annotation_ht_versions[dataset_name] + ): + self._datasets_to_update.add(dataset_name) + logger.info( + f"Datasets to update: {', '.join(d for d in self._datasets_to_update)}", ) - logger.info(f'Datasets to update: {self._datasets_to_update}') return not self._datasets_to_update def update_table(self, ht: hl.Table) -> hl.Table: - for dataset in self._datasets_to_update: - if dataset in ht.row: - ht = ht.drop(dataset) - if dataset not in CONFIG: + for dataset_name in self._datasets_to_update: + if dataset_name in ht.row: + ht = ht.drop(dataset_name) + if dataset_name not in set(ReferenceDataset): continue - - rdc = ReferenceDatasetCollection.for_dataset(dataset, self.dataset_type) - rdc_ht = self.rdc_annotation_dependencies[f'{rdc.value}_ht'] - if rdc.requires_annotation: + reference_dataset = ReferenceDataset(dataset_name) + reference_dataset_ht = hl.read_table( + valid_reference_dataset_path(self.reference_genome, reference_dataset), + ) + if reference_dataset.is_keyed_by_interval: formatting_fn = next( x for x in self.dataset_type.formatting_annotation_fns( self.reference_genome, ) - if x.__name__ == dataset + if x.__name__ == reference_dataset.name ) 
ht = ht.annotate( **get_fields( ht, [formatting_fn], - **self.rdc_annotation_dependencies, + **{f'{reference_dataset.name}_ht': reference_dataset_ht}, **self.param_kwargs, ), ) else: - ht = ht.join(rdc_ht.select(dataset), 'left') + if reference_dataset.select: + reference_dataset_ht = reference_dataset.select( + self.reference_genome, + self.dataset_type, + reference_dataset_ht, + ) + if reference_dataset.filter: + reference_dataset_ht = reference_dataset.filter( + self.reference_genome, + self.dataset_type, + reference_dataset_ht, + ) + reference_dataset_ht = reference_dataset_ht.select( + **{ + f'{reference_dataset.name}': hl.Struct( + **reference_dataset_ht.row_value, + ), + }, + ) + ht = ht.join(reference_dataset_ht, 'left') + return self.annotate_globals(ht) diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index e008aaa8f..a0076f946 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -1,11 +1,12 @@ -import shutil -from unittest import mock +from unittest.mock import patch import hail as hl import luigi.worker +import responses from v03_pipeline.lib.annotations.enums import ( BIOTYPES, + CLINVAR_ASSERTIONS, CLINVAR_PATHOGENICITIES, FIVEUTR_CONSEQUENCES, LOF_FILTERS, @@ -17,725 +18,90 @@ ) from v03_pipeline.lib.model import ( DatasetType, - ReferenceDatasetCollection, ReferenceGenome, - SampleType, ) -from v03_pipeline.lib.paths import valid_reference_dataset_collection_path -from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS -from v03_pipeline.lib.reference_data.config import CONFIG +from v03_pipeline.lib.reference_datasets.reference_dataset import ( + 
BaseReferenceDataset, + ReferenceDataset, +) from v03_pipeline.lib.tasks.files import GCSorLocalFolderTarget from v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset import ( UpdateVariantAnnotationsTableWithUpdatedReferenceDataset, ) -from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask -from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase +from v03_pipeline.lib.test.mock_clinvar_urls import mock_clinvar_urls +from v03_pipeline.lib.test.mocked_reference_datasets_testcase import ( + MockedReferenceDatasetsTestCase, +) -TEST_COMBINED_1 = 'v03_pipeline/var/test/reference_data/test_combined_1.ht' -TEST_HGMD_1 = 'v03_pipeline/var/test/reference_data/test_hgmd_1.ht' -TEST_INTERVAL_1 = 'v03_pipeline/var/test/reference_data/test_interval_1.ht' -TEST_COMBINED_MITO_1 = 'v03_pipeline/var/test/reference_data/test_combined_mito_1.ht' -TEST_INTERVAL_MITO_1 = 'v03_pipeline/var/test/reference_data/test_interval_mito_1.ht' -TEST_COMBINED_37 = 'v03_pipeline/var/test/reference_data/test_combined_37.ht' -TEST_HGMD_37 = 'v03_pipeline/var/test/reference_data/test_hgmd_37.ht' TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' -TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' - -MOCK_CADD_CONFIG = { - 'version': 'v1.6', - 'select': ['PHRED'], - 'source_path': 'gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - PHRED=hl.tfloat32, - ), - key=['locus', 'alleles'], - globals=hl.Struct( - version='v1.6', - ), +BASE_ENUMS = { + 'sorted_motif_feature_consequences': hl.Struct( + consequence_term=MOTIF_CONSEQUENCE_TERMS, ), -} -MOCK_CLINVAR_CONFIG = { - **CONFIG['clinvar']['38'], - 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - 
hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct( - ALLELEID=hl.tint32, - CLNSIG=hl.tarray(hl.tstr), - CLNSIGCONF=hl.tarray(hl.tstr), - CLNREVSTAT=hl.tarray(hl.tstr), - ), - submitters=hl.tarray(hl.tstr), - conditions=hl.tarray(hl.tstr), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - version='2023-11-26', - ), + 'sorted_regulatory_feature_consequences': hl.Struct( + biotype=REGULATORY_BIOTYPES, + consequence_term=REGULATORY_CONSEQUENCE_TERMS, ), -} - -MOCK_EIGEN_CONFIG = { - 'select': {'Eigen_phred': 'info.Eigen-phred'}, - 'source_path': 'gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct(**{'Eigen-phred': hl.tfloat32}), + 'sorted_transcript_consequences': hl.Struct( + biotype=BIOTYPES, + consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, + loftee=hl.Struct( + lof_filter=LOF_FILTERS, ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), -} - -MOCK_EXAC_CONFIG = { - **CONFIG['exac']['38'], - 'source_path': 'gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct( - AF_POPMAX=hl.tfloat64, - AF=hl.tarray(hl.tfloat64), - AC_Adj=hl.tarray(hl.tint32), - AC_Het=hl.tarray(hl.tint32), - AC_Hom=hl.tarray(hl.tint32), - AC_Hemi=hl.tarray(hl.tint32), - AN_Adj=hl.tint32, - ), - a_index=hl.tint32, + utrannotator=hl.Struct( + fiveutr_consequence=FIVEUTR_CONSEQUENCES, ), - key=['locus', 'alleles'], - globals=hl.Struct(), ), } -MOCK_MPC_CONFIG = { - **CONFIG['mpc']['38'], - 'source_path': 'gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - 
info=hl.tstruct( - MPC=hl.tstr, - ), - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), -} -MOCK_PRIMATE_AI_CONFIG = { - 'version': 'v0.2', - 'select': {'score': 'info.score'}, - 'source_path': 'gs://seqr-reference-data/GRCh37/primate_ai/PrimateAI_scores_v0.2.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct( - score=hl.tfloat64, - ), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - version='v0.2', - ), - ), -} -MOCK_SPLICE_AI_CONFIG = { - **CONFIG['splice_ai']['38'], - 'source_path': 'gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct( - max_DS=hl.tfloat64, - splice_consequence=hl.tstr, - ), - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), -} -MOCK_TOPMED_CONFIG = { - **CONFIG['topmed']['38'], - 'source_path': 'gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct( - AC=hl.tint32, - AN=hl.tint32, - AF=hl.tfloat64, - Hom=hl.tint32, - Het=hl.tint32, - ), - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), -} -MOCK_CONFIG = { - 'cadd': { - '37': MOCK_CADD_CONFIG, - '38': MOCK_CADD_CONFIG, - }, - 'clinvar': { - '37': MOCK_CLINVAR_CONFIG, - '38': MOCK_CLINVAR_CONFIG, - }, - 'dbnsfp': { - '37': { - **CONFIG['dbnsfp']['37'], - 'version': '2.9.3', - 'source_path': 'gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - REVEL_score=hl.tstr, - SIFT_score=hl.tstr, - Polyphen2_HVAR_score=hl.tstr, - MutationTaster_pred=hl.tstr, - ), - 
key=['locus', 'alleles'], - globals=hl.Struct( - version='2.9.3', - ), - ), - }, - '38': { - **CONFIG['dbnsfp']['38'], - 'version': '2.9.3', - 'source_path': 'gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - REVEL_score=hl.tstr, - SIFT_score=hl.tstr, - Polyphen2_HVAR_score=hl.tstr, - MutationTaster_pred=hl.tstr, - VEST4_score=hl.tstr, - MutPred_score=hl.tstr, - fathmm_MKL_coding_score=hl.tfloat64, - ), - key=['locus', 'alleles'], - globals=hl.Struct( - version='2.9.3', - ), - ), - }, - }, - 'eigen': { - '37': MOCK_EIGEN_CONFIG, - '38': MOCK_EIGEN_CONFIG, - }, - 'exac': { - '37': MOCK_EXAC_CONFIG, - '38': MOCK_EXAC_CONFIG, - }, - 'gnomad_exomes': { - '37': { - **CONFIG['gnomad_exomes']['37'], - 'source_path': 'gs://gcp-public-data--gnomad/release/2.1.1/ht/exomes/gnomad.exomes.r2.1.1.sites.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - freq=hl.tarray( - hl.tstruct( - AF=hl.tfloat64, - AN=hl.tint32, - AC=hl.tint32, - homozygote_count=hl.tint32, - ), - ), - popmax=hl.tarray( - hl.tstruct( - AF=hl.tfloat64, - AN=hl.tint32, - AC=hl.tint32, - homozygote_count=hl.tint32, - pop=hl.tstr, - ), - ), - faf=hl.tarray(hl.tstruct(faf95=hl.tfloat64)), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - freq_index_dict={'gnomad': 0, 'gnomad_male': 1}, - popmax_index_dict={'gnomad': 0}, - ), - ), - }, - '38': { - **CONFIG['gnomad_exomes']['38'], - 'version': '4.1', - 'source_path': 'gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - freq=hl.tarray( - hl.tstruct( - AF=hl.tfloat64, - AN=hl.tint32, - AC=hl.tint32, - homozygote_count=hl.tint32, - ), - ), - grpmax=hl.tstruct( - 
gnomad=hl.tstruct( - AF=hl.tfloat64, - AN=hl.tint32, - AC=hl.tint32, - homozygote_count=hl.tint32, - pop=hl.tstr, - ), - ), - faf=hl.tarray(hl.tstruct(faf95=hl.tfloat64)), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - freq_index_dict={'adj': 0, 'XY_adj': 1}, - faf_index_dict={'adj': 0}, - ), - ), - }, - }, - 'gnomad_genomes': { - '37': { - **CONFIG['gnomad_genomes']['37'], - 'source_path': 'gs://gcp-public-data--gnomad/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - freq=hl.tarray( - hl.tstruct( - AF=hl.tfloat64, - AN=hl.tint32, - AC=hl.tint32, - homozygote_count=hl.tint32, - ), - ), - popmax=hl.tarray( - hl.tstruct( - AF=hl.tfloat64, - AN=hl.tint32, - AC=hl.tint32, - homozygote_count=hl.tint32, - pop=hl.tstr, - ), - ), - faf=hl.tarray(hl.tstruct(faf95=hl.tfloat64)), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - freq_index_dict={'gnomad': 0, 'gnomad_male': 1}, - popmax_index_dict={'gnomad': 0}, - ), - ), - }, - '38': { - **CONFIG['gnomad_genomes']['38'], - 'version': '4.1', - 'source_path': 'gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - freq=hl.tarray( - hl.tstruct( - AF=hl.tfloat64, - AN=hl.tint32, - AC=hl.tint32, - homozygote_count=hl.tint32, +class UpdateVATWithUpdatedReferenceDatasets(MockedReferenceDatasetsTestCase): + @responses.activate + def test_create_empty_annotations_table(self): + with patch.object( + BaseReferenceDataset, + 'for_reference_genome_dataset_type_annotations', + return_value=[ReferenceDataset.clinvar], + ), mock_clinvar_urls(ReferenceGenome.GRCh38): + task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + ) + worker = 
luigi.worker.Worker() + worker.add(task) + worker.run() + self.assertTrue(GCSorLocalFolderTarget(task.output().path).exists()) + self.assertTrue(task.complete()) + + ht = hl.read_table(task.output().path) + self.assertCountEqual( + ht.globals.collect(), + [ + hl.Struct( + versions=hl.Struct(clinvar='2024-11-11'), + enums=hl.Struct( + clinvar=hl.Struct( + assertion=CLINVAR_ASSERTIONS, + pathogenicity=CLINVAR_PATHOGENICITIES, + ), + **BASE_ENUMS, ), + migrations=[], + updates=set(), ), - grpmax=hl.tstruct( - AF=hl.tfloat64, - AN=hl.tint32, - AC=hl.tint32, - homozygote_count=hl.tint32, - pop=hl.tstr, - ), - faf=hl.tarray(hl.tstruct(faf95=hl.tfloat64)), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - freq_index_dict={'adj': 0, 'XY_adj': 1}, - faf_index_dict={'adj': 0}, - ), - ), - }, - }, - 'mpc': { - '37': MOCK_MPC_CONFIG, - '38': MOCK_MPC_CONFIG, - }, - 'primate_ai': { - '37': MOCK_PRIMATE_AI_CONFIG, - '38': MOCK_PRIMATE_AI_CONFIG, - }, - 'splice_ai': { - '37': MOCK_SPLICE_AI_CONFIG, - '38': MOCK_SPLICE_AI_CONFIG, - }, - 'topmed': { - '37': MOCK_TOPMED_CONFIG, - '38': MOCK_TOPMED_CONFIG, - }, - 'hgmd': { - '37': { - **CONFIG['hgmd']['37'], - 'source_path': 'gs://seqr-reference-data-private/GRCh37/HGMD/HGMD_Pro_2023.1_hg19.vcf.gz', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - rsid=hl.tstr, - info=hl.tstruct( - CLASS=hl.tstr, - ), - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - '38': { - **CONFIG['hgmd']['38'], - 'source_path': 'gs://seqr-reference-data-private/GRCh38/HGMD/HGMD_Pro_2023.1_hg38.vcf.gz', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - rsid=hl.tstr, - info=hl.tstruct( - CLASS=hl.tstr, - ), - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - }, - 'gnomad_non_coding_constraint': { - '38': { - 'select': {'z_score': 'target'}, - 'source_path': 
'gs://seqr-reference-data/GRCh38/gnomad_nc_constraint/gnomad_non-coding_constraint_z_scores.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - target=hl.tfloat64, - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - }, - 'screen': { - '38': { - **CONFIG['screen']['38'], - 'source_path': 'gs://seqr-reference-data/GRCh38/ccREs/GRCh38-ccREs.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - target=hl.tarray(hl.tstr), - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - }, -} -MOCK_CONFIG_MITO = { - 'clinvar_mito': { - '38': { - **CONFIG['clinvar_mito']['38'], - 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct( - ALLELEID=hl.tint32, - CLNSIG=hl.tarray(hl.tstr), - CLNSIGCONF=hl.tarray(hl.tstr), - CLNREVSTAT=hl.tarray(hl.tstr), - ), - submitters=hl.tarray(hl.tstr), - conditions=hl.tarray(hl.tstr), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - version='2023-07-22', - ), - ), - }, - }, - 'dbnsfp_mito': { - '38': { - **CONFIG['dbnsfp_mito']['38'], - 'source_path': 'gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - SIFT_score=hl.tstr, - MutationTaster_pred=hl.tstr, - ), - key=['locus', 'alleles'], - globals=hl.Struct( - version='4.2', - ), - ), - }, - }, - 'gnomad_mito': { - '38': { - **CONFIG['gnomad_mito']['38'], - 'source_path': 'gs://gcp-public-data--gnomad/release/3.1/ht/genomes/gnomad.genomes.v3.1.sites.chrM.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - 
alleles=hl.tarray(hl.tstr), - AN=hl.tint64, - AC_hom=hl.tint64, - AC_het=hl.tint64, - AF_hom=hl.tfloat32, - AF_het=hl.tfloat32, - max_hl=hl.tfloat64, - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - }, - 'helix_mito': { - '38': { - **CONFIG['helix_mito']['38'], - 'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/Helix/HelixMTdb_20200327.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - counts_hom=hl.tint32, - counts_het=hl.tint32, - AF_hom=hl.tfloat64, - AF_het=hl.tfloat64, - AN=hl.tint32, - max_ARF=hl.tfloat64, - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - }, - 'hmtvar': { - '38': { - **CONFIG['hmtvar']['38'], - 'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - disease_score=hl.tfloat64, - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - }, - 'mitomap': { - '38': { - **CONFIG['mitomap']['38'], - 'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - pathogenic=hl.tbool, - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - }, - 'mitimpact': { - '38': { - **CONFIG['mitimpact']['38'], - 'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.1.3.ht', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - APOGEE2_score=hl.tfloat64, - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - }, - 'high_constraint_region_mito': { - '38': { - **CONFIG['high_constraint_region_mito']['38'], - 'source_path': 
'gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - interval=hl.tstr, - ), - key=['interval'], - globals=hl.Struct(), - ), - }, - }, - 'local_constraint_mito': { - '38': { - **CONFIG['local_constraint_mito']['38'], - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - MLC_score=hl.tfloat32, - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - }, -} - - -@mock.patch( - 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdatedReferenceDatasetCollectionTask', -) -@mock.patch( - 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdateCachedReferenceDatasetQueries', -) -@mock.patch( - 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.BaseUpdateVariantAnnotationsTableTask.initialize_table', -) -class UpdateVATWithUpdatedRDC(MockedDatarootTestCase): - def setUp(self) -> None: - super().setUp() - shutil.copytree( - TEST_COMBINED_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.COMBINED, - ), - ) - shutil.copytree( - TEST_HGMD_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.HGMD, - ), - ) - shutil.copytree( - TEST_INTERVAL_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.INTERVAL, - ), - ) - shutil.copytree( - TEST_COMBINED_MITO_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.MITO, - ReferenceDatasetCollection.COMBINED, - ), - ) - shutil.copytree( - TEST_INTERVAL_MITO_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.MITO, - ReferenceDatasetCollection.INTERVAL, - ), - ) - shutil.copytree( - TEST_COMBINED_37, - 
valid_reference_dataset_collection_path( - ReferenceGenome.GRCh37, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.COMBINED, - ), - ) - shutil.copytree( - TEST_HGMD_37, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh37, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.HGMD, - ), - ) + ], + ) - @mock.patch.dict( - 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', - MOCK_CONFIG, + @responses.activate + @patch( + 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.BaseUpdateVariantAnnotationsTableTask.initialize_table', ) - @mock.patch( - 'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal', - ) - def test_update_vat_with_updated_rdc_snv_indel_38( + def test_update_vat_snv_indel_38( self, - mock_clinvar_versions_equal, - mock_initialize_table, - mock_update_crdqs_task, - mock_update_rdc_task, + mock_initialize_annotations_ht, ): - mock_clinvar_versions_equal.return_value = True - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() - mock_initialize_table.return_value = hl.Table.parallelize( + mock_initialize_annotations_ht.return_value = hl.Table.parallelize( [ hl.Struct( locus=hl.Locus( @@ -752,212 +118,141 @@ def test_update_vat_with_updated_rdc_snv_indel_38( ), key=['locus', 'alleles'], globals=hl.Struct( - paths=hl.Struct(), versions=hl.Struct(), enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), migrations=hl.empty_array(hl.tstr), ), ) - task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=[], - project_remap_paths=[], - project_pedigree_paths=[], - skip_validation=True, - run_id='3', - ) - worker = luigi.worker.Worker() - worker.add(task) - worker.run() - 
self.assertTrue(GCSorLocalFolderTarget(task.output().path).exists()) - self.assertTrue(task.complete()) - ht = hl.read_table(task.output().path) - self.assertCountEqual( - ht.collect(), - [ - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - cadd=hl.Struct(PHRED=2), - clinvar=hl.Struct( - alleleId=None, - conflictingPathogenicities=None, - goldStars=None, - pathogenicity_id=None, - assertion_ids=None, - submitters=None, - conditions=None, - ), - dbnsfp=hl.Struct( - REVEL_score=0.0430000014603138, - SIFT_score=None, - Polyphen2_HVAR_score=None, - MutationTaster_pred_id=0, - VEST4_score=None, - MutPred_score=None, - fathmm_MKL_coding_score=None, - ), - eigen=hl.Struct(Eigen_phred=1.5880000591278076), - exac=hl.Struct( - AF_POPMAX=0.0004100881633348763, - AF=0.0004633000062312931, - AC_Adj=51, - AC_Het=51, - AC_Hom=0, - AC_Hemi=None, - AN_Adj=108288, - ), - gnomad_exomes=hl.Struct( - AF=0.00012876000255346298, - AN=240758, - AC=31, - Hom=0, - AF_POPMAX_OR_GLOBAL=0.0001119549197028391, - FAF_AF=9.315000352216884e-05, - Hemi=0, - ), - gnomad_genomes=None, - mpc=None, - primate_ai=None, - splice_ai=hl.Struct( - delta_score=0.029999999329447746, - splice_consequence_id=3, - ), - topmed=None, - gnomad_non_coding_constraint=hl.Struct(z_score=0.75), - screen=hl.Struct(region_type_ids=[1]), - hgmd=hl.Struct(accession='abcdefg', class_id=3), - ), - ], - ) - self.assertCountEqual( - ht.globals.collect(), - [ - hl.Struct( - paths=hl.Struct( - cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', - dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', - eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', - exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', - 
gnomad_exomes='gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht', - gnomad_genomes='gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht', - mpc='gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.ht', - primate_ai='gs://seqr-reference-data/GRCh37/primate_ai/PrimateAI_scores_v0.2.ht', - splice_ai='gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.ht', - topmed='gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.ht', - gnomad_non_coding_constraint='gs://seqr-reference-data/GRCh38/gnomad_nc_constraint/gnomad_non-coding_constraint_z_scores.ht', - screen='gs://seqr-reference-data/GRCh38/ccREs/GRCh38-ccREs.ht', - hgmd='gs://seqr-reference-data-private/GRCh38/HGMD/HGMD_Pro_2023.1_hg38.vcf.gz', - ), - versions=hl.Struct( - cadd='v1.6', - clinvar='2023-11-26', - dbnsfp='2.9.3', - eigen=None, - exac=None, - gnomad_exomes='4.1', - gnomad_genomes='4.1', - mpc=None, - primate_ai='v0.2', - splice_ai=None, - topmed=None, - gnomad_non_coding_constraint=None, - screen=None, - hgmd='HGMD_Pro_2023', - ), - enums=hl.Struct( - cadd=hl.Struct(), - clinvar=hl.Struct( - pathogenicity=CLINVAR_PATHOGENICITIES, - assertion=CLINVAR_ASSERTIONS, + with mock_clinvar_urls(): + task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + ) + worker = luigi.worker.Worker() + worker.add(task) + worker.run() + self.assertTrue(GCSorLocalFolderTarget(task.output().path).exists()) + self.assertTrue(task.complete()) + + ht = hl.read_table(task.output().path) + self.assertCountEqual( + ht.globals.collect(), + [ + hl.Struct( + versions=hl.Struct( + dbnsfp='1.0', + eigen='1.0', + clinvar='2024-11-11', + exac='1.0', + splice_ai='1.0', + topmed='1.0', + hgmd='1.0', + gnomad_exomes='1.0', + gnomad_genomes='1.0', + screen='1.0', + gnomad_non_coding_constraint='1.0', ), - dbnsfp=hl.Struct( - 
MutationTaster_pred=['D', 'A', 'N', 'P'], + enums=hl.Struct( + dbnsfp=ReferenceDataset.dbnsfp.enum_globals, + eigen=hl.Struct(), + clinvar=ReferenceDataset.clinvar.enum_globals, + exac=hl.Struct(), + splice_ai=ReferenceDataset.splice_ai.enum_globals, + topmed=hl.Struct(), + hgmd=ReferenceDataset.hgmd.enum_globals, + gnomad_exomes=hl.Struct(), + gnomad_genomes=hl.Struct(), + screen=ReferenceDataset.screen.enum_globals, + gnomad_non_coding_constraint=hl.Struct(), + **BASE_ENUMS, ), - eigen=hl.Struct(), - exac=hl.Struct(), - gnomad_exomes=hl.Struct(), - gnomad_genomes=hl.Struct(), - mpc=hl.Struct(), - primate_ai=hl.Struct(), - splice_ai=hl.Struct( - splice_consequence=[ - 'Acceptor gain', - 'Acceptor loss', - 'Donor gain', - 'Donor loss', - 'No consequence', - ], + migrations=[], + updates=set(), + ), + ], + ) + self.assertCountEqual( + ht.collect(), + [ + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=871269, + reference_genome='GRCh38', + ), + alleles=['A', 'C'], + dbnsfp=hl.Struct( + REVEL_score=0.0430000014603138, + SIFT_score=None, + Polyphen2_HVAR_score=None, + MutationTaster_pred_id=0, + VEST4_score=None, + MutPred_score=None, + fathmm_MKL_coding_score=None, + MPC_score=None, + CADD_phred=2, + PrimateAI_score=None, ), - topmed=hl.Struct(), - gnomad_non_coding_constraint=hl.Struct(), - screen=hl.Struct( - region_type=[ - 'CTCF-bound', - 'CTCF-only', - 'DNase-H3K4me3', - 'PLS', - 'dELS', - 'pELS', - 'DNase-only', - 'low-DNase', - ], + eigen=hl.Struct(Eigen_phred=1.5880000591278076), + clinvar=hl.Struct( + alleleId=None, + conflictingPathogenicities=None, + goldStars=None, + pathogenicity_id=None, + assertion_ids=None, + submitters=None, + conditions=None, ), - hgmd=hl.Struct( - **{'class': ['DM', 'DM?', 'DP', 'DFP', 'FP', 'R']}, + exac=hl.Struct( + AF_POPMAX=0.0004100881633348763, + AF=0.0004633000062312931, + AC_Adj=51, + AC_Het=51, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=108288, ), - sorted_motif_feature_consequences=hl.Struct( - 
consequence_term=MOTIF_CONSEQUENCE_TERMS, + splice_ai=hl.Struct( + delta_score=0.029999999329447746, + splice_consequence_id=3, ), - sorted_regulatory_feature_consequences=hl.Struct( - biotype=REGULATORY_BIOTYPES, - consequence_term=REGULATORY_CONSEQUENCE_TERMS, + topmed=hl.Struct(AC=None, AF=None, AN=None, Hom=None, Het=None), + hgmd=hl.Struct(accession='abcdefg', class_id=3), + gnomad_exomes=hl.Struct( + AF=0.00012876000255346298, + AN=240758, + AC=31, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0001119549197028391, + FAF_AF=9.315000352216884e-05, + Hemi=0, ), - sorted_transcript_consequences=hl.Struct( - biotype=BIOTYPES, - consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, - loftee=hl.Struct( - lof_filter=LOF_FILTERS, - ), - utrannotator=hl.Struct( - fiveutr_consequence=FIVEUTR_CONSEQUENCES, - ), + gnomad_genomes=hl.Struct( + AC=None, + AF=None, + AN=None, + Hom=None, + AF_POPMAX_OR_GLOBAL=None, + FAF_AF=None, + Hemi=None, ), + gnomad_non_coding_constraint=hl.Struct(z_score=0.75), + screen=hl.Struct(region_type_ids=[1]), ), - migrations=[], - updates=set(), - ), - ], - ) + ], + ) - @mock.patch.dict( - 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', - MOCK_CONFIG_MITO, - ) - @mock.patch( - 'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal', + @responses.activate + @patch( + 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.BaseUpdateVariantAnnotationsTableTask.initialize_table', ) - def test_update_vat_with_updated_rdc_mito_38( + def test_update_vat_mito_38( self, - mock_clinvar_versions_equal, - mock_initialize_table, - mock_update_crdqs_task, - mock_update_rdc_task, + mock_initialize_annotations_ht, ): - mock_clinvar_versions_equal.return_value = (True,) - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() - mock_initialize_table.return_value = hl.Table.parallelize( + mock_initialize_annotations_ht.return_value 
= hl.Table.parallelize( [ hl.Struct( locus=hl.Locus( @@ -974,152 +269,118 @@ def test_update_vat_with_updated_rdc_mito_38( ), key=['locus', 'alleles'], globals=hl.Struct( - paths=hl.Struct(), versions=hl.Struct(), enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), migrations=hl.empty_array(hl.tstr), ), ) - task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.MITO, - sample_type=SampleType.WGS, - callset_path=TEST_MITO_MT, - project_guids=[], - project_remap_paths=[], - project_pedigree_paths=[], - skip_validation=True, - run_id='1', - ) - worker = luigi.worker.Worker() - worker.add(task) - worker.run() - self.assertTrue(GCSorLocalFolderTarget(task.output().path).exists()) - self.assertTrue(task.complete()) - ht = hl.read_table(task.output().path) - self.assertCountEqual( - ht.globals.collect(), - [ - hl.Struct( - paths=hl.Struct( - gnomad_mito='gs://gcp-public-data--gnomad/release/3.1/ht/genomes/gnomad.genomes.v3.1.sites.chrM.ht', - helix_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix/HelixMTdb_20200327.ht', - hmtvar='gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht', - mitomap='gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht', - mitimpact='gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.1.3.ht', - clinvar_mito='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', - dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', - high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', - local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', - ), - versions=hl.Struct( - gnomad_mito='v3.1', - helix_mito='20200327', - hmtvar='Jan. 10 2022', - mitomap='Feb. 
04 2022', - mitimpact='3.1.3', - clinvar_mito='2023-07-22', - dbnsfp_mito='4.2', - high_constraint_region_mito='Feb-15-2022', - local_constraint_mito='2024-07-24', - ), - enums=hl.Struct( - gnomad_mito=hl.Struct(), - helix_mito=hl.Struct(), - hmtvar=hl.Struct(), - mitomap=hl.Struct(), - mitimpact=hl.Struct(), - clinvar_mito=hl.Struct( - pathogenicity=CLINVAR_PATHOGENICITIES, - assertion=CLINVAR_ASSERTIONS, + with mock_clinvar_urls(): + task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.MITO, + ) + worker = luigi.worker.Worker() + worker.add(task) + worker.run() + self.assertTrue(GCSorLocalFolderTarget(task.output().path).exists()) + self.assertTrue(task.complete()) + + ht = hl.read_table(task.output().path) + self.assertCountEqual( + ht.globals.collect(), + [ + hl.Struct( + versions=hl.Struct( + helix_mito='1.0', + hmtvar='1.0', + mitimpact='1.0', + mitomap='1.0', + gnomad_mito='1.0', + local_constraint_mito='1.0', + clinvar='2024-11-11', + dbnsfp='1.0', ), - dbnsfp_mito=hl.Struct( - MutationTaster_pred=['D', 'A', 'N', 'P'], + enums=hl.Struct( + helix_mito=hl.Struct(), + hmtvar=hl.Struct(), + mitimpact=hl.Struct(), + mitomap=hl.Struct(), + gnomad_mito=hl.Struct(), + local_constraint_mito=hl.Struct(), + clinvar=ReferenceDataset.clinvar.enum_globals, + dbnsfp=ReferenceDataset.dbnsfp.enum_globals, + sorted_transcript_consequences=hl.Struct( + biotype=BIOTYPES, + consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, + lof_filter=LOF_FILTERS, + ), + mitotip=hl.Struct( + trna_prediction=MITOTIP_PATHOGENICITIES, + ), ), - high_constraint_region_mito=hl.Struct(), - local_constraint_mito=hl.Struct(), - sorted_transcript_consequences=hl.Struct( - biotype=BIOTYPES, - consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, - lof_filter=LOF_FILTERS, + migrations=[], + updates=set(), + ), + ], + ) + self.assertCountEqual( + ht.collect(), + [ + hl.Struct( + locus=hl.Locus( + contig='chrM', + position=1, + 
reference_genome='GRCh38', ), - mitotip=hl.Struct( - trna_prediction=MITOTIP_PATHOGENICITIES, + alleles=['A', 'C'], + helix_mito=hl.Struct( + AC_het=0, + AF_het=0.0, + AN=195982, + max_hl=None, + AC_hom=0, + AF_hom=0, + ), + hmtvar=hl.Struct(score=0.6700000166893005), + mitimpact=hl.Struct(score=0.42500001192092896), + mitomap=hl.Struct(pathogenic=None), + gnomad_mito=hl.Struct( + AC_het=0, + AF_het=0.0, + AN=195982, + max_hl=None, + AC_hom=0, + AF_hom=0, + ), + local_constraint_mito=hl.Struct(score=0.5), + clinvar=hl.Struct( + alleleId=None, + conflictingPathogenicities=None, + goldStars=None, + pathogenicity_id=None, + assertion_ids=None, + submitters=None, + conditions=None, + ), + dbnsfp=hl.Struct( + SIFT_score=None, + MutationTaster_pred_id=2, ), ), - migrations=[], - updates=set(), - ), - ], - ) - self.assertCountEqual( - ht.collect(), - [ - hl.Struct( - locus=hl.Locus( - contig='chrM', - position=1, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - clinvar_mito=hl.Struct( - alleleId=None, - conflictingPathogenicities=None, - goldStars=None, - pathogenicity_id=None, - assertion_ids=None, - submitters=None, - conditions=None, - ), - dbnsfp_mito=hl.Struct( - SIFT_score=None, - MutationTaster_pred_id=2, - ), - gnomad_mito=hl.Struct( - AC_het=0, - AF_het=0.0, - AN=195982, - max_hl=None, - AC_hom=0, - AF_hom=0, - ), - helix_mito=hl.Struct( - AC_het=0, - AF_het=0.0, - AN=195982, - max_hl=None, - AC_hom=0, - AF_hom=0, - ), - hmtvar=hl.Struct(score=0.6700000166893005), - mitomap=None, - mitimpact=hl.Struct(score=0.42500001192092896), - high_constraint_region_mito=True, - local_constraint_mito=hl.Struct(score=0.5), - ), - ], - ) + ], + ) - @mock.patch.dict( - 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', - MOCK_CONFIG, + @responses.activate + @patch( + 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.BaseUpdateVariantAnnotationsTableTask.initialize_table', ) - @mock.patch( - 
'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal', - ) - def test_update_vat_with_updated_rdc_snv_indel_37( + def test_update_vat_snv_indel_37( self, - mock_clinvar_versions_equal, - mock_initialize_table, - mock_update_crdqs_task, - mock_update_rdc_task, + mock_initialize_annotations_ht, ): - mock_clinvar_versions_equal.return_value = True - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() - mock_initialize_table.return_value = hl.Table.parallelize( + mock_initialize_annotations_ht.return_value = hl.Table.parallelize( [ hl.Struct( locus=hl.Locus( @@ -1136,156 +397,123 @@ def test_update_vat_with_updated_rdc_snv_indel_37( ), key=['locus', 'alleles'], globals=hl.Struct( - paths=hl.Struct(), versions=hl.Struct(), enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), migrations=hl.empty_array(hl.tstr), ), ) - task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=[], - project_remap_paths=[], - project_pedigree_paths=[], - skip_validation=True, - run_id='2', - ) - worker = luigi.worker.Worker() - worker.add(task) - worker.run() - self.assertTrue(GCSorLocalFolderTarget(task.output().path).exists()) - self.assertTrue(task.complete()) - ht = hl.read_table(task.output().path) - self.assertCountEqual( - ht.globals.collect(), - [ - hl.Struct( - paths=hl.Struct( - cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', - dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', - eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', - 
exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', - gnomad_exomes='gs://gcp-public-data--gnomad/release/2.1.1/ht/exomes/gnomad.exomes.r2.1.1.sites.ht', - gnomad_genomes='gs://gcp-public-data--gnomad/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht', - mpc='gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.ht', - primate_ai='gs://seqr-reference-data/GRCh37/primate_ai/PrimateAI_scores_v0.2.ht', - splice_ai='gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.ht', - topmed='gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.ht', - hgmd='gs://seqr-reference-data-private/GRCh37/HGMD/HGMD_Pro_2023.1_hg19.vcf.gz', - ), - versions=hl.Struct( - cadd='v1.6', - clinvar='2023-11-26', - dbnsfp='2.9.3', - eigen=None, - exac=None, - gnomad_exomes='r2.1.1', - gnomad_genomes='r2.1.1', - mpc=None, - primate_ai='v0.2', - splice_ai=None, - topmed=None, - hgmd='HGMD_Pro_2023', - ), - enums=hl.Struct( - cadd=hl.Struct(), - clinvar=hl.Struct( - pathogenicity=CLINVAR_PATHOGENICITIES, - assertion=CLINVAR_ASSERTIONS, + with mock_clinvar_urls(ReferenceGenome.GRCh37): + task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( + reference_genome=ReferenceGenome.GRCh37, + dataset_type=DatasetType.SNV_INDEL, + ) + worker = luigi.worker.Worker() + worker.add(task) + worker.run() + self.assertTrue(GCSorLocalFolderTarget(task.output().path).exists()) + self.assertTrue(task.complete()) + + ht = hl.read_table(task.output().path) + self.assertCountEqual( + ht.globals.collect(), + [ + hl.Struct( + versions=hl.Struct( + dbnsfp='1.0', + eigen='1.0', + clinvar='2024-11-11', + exac='1.0', + splice_ai='1.0', + topmed='1.0', + hgmd='1.0', + gnomad_exomes='1.0', + gnomad_genomes='1.0', + ), + enums=hl.Struct( + dbnsfp=ReferenceDataset.dbnsfp.enum_globals, + eigen=hl.Struct(), + clinvar=ReferenceDataset.clinvar.enum_globals, + exac=hl.Struct(), + splice_ai=ReferenceDataset.splice_ai.enum_globals, + 
topmed=hl.Struct(), + hgmd=ReferenceDataset.hgmd.enum_globals, + gnomad_exomes=hl.Struct(), + gnomad_genomes=hl.Struct(), + sorted_transcript_consequences=hl.Struct( + biotype=BIOTYPES, + consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, + lof_filter=LOF_FILTERS, + ), ), + migrations=[], + updates=set(), + ), + ], + ) + self.assertCountEqual( + ht.collect(), + [ + hl.Struct( + locus=hl.Locus( + contig=1, + position=871269, + reference_genome='GRCh37', + ), + alleles=['A', 'C'], dbnsfp=hl.Struct( - MutationTaster_pred=['D', 'A', 'N', 'P'], + REVEL_score=0.0430000014603138, + SIFT_score=None, + Polyphen2_HVAR_score=None, + MutationTaster_pred_id=0, + CADD_phred=9.699999809265137, + MPC_score=None, + PrimateAI_score=None, + ), + eigen=hl.Struct(Eigen_phred=1.5880000591278076), + clinvar=hl.Struct( + alleleId=None, + conflictingPathogenicities=None, + goldStars=None, + pathogenicity_id=None, + assertion_ids=None, + submitters=None, + conditions=None, + ), + exac=hl.Struct( + AF_POPMAX=0.0004100881633348763, + AF=0.0004633000062312931, + AC_Adj=51, + AC_Het=51, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=108288, ), - eigen=hl.Struct(), - exac=hl.Struct(), - gnomad_exomes=hl.Struct(), - gnomad_genomes=hl.Struct(), - mpc=hl.Struct(), - primate_ai=hl.Struct(), splice_ai=hl.Struct( - splice_consequence=[ - 'Acceptor gain', - 'Acceptor loss', - 'Donor gain', - 'Donor loss', - 'No consequence', - ], + delta_score=0.029999999329447746, + splice_consequence_id=3, ), - topmed=hl.Struct(), - hgmd=hl.Struct( - **{'class': ['DM', 'DM?', 'DP', 'DFP', 'FP', 'R']}, + topmed=hl.Struct(AC=None, AF=None, AN=None, Hom=None, Het=None), + hgmd=None, + gnomad_exomes=hl.Struct( + AF=0.00012876000255346298, + AN=240758, + AC=31, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0001119549197028391, + FAF_AF=9.315000352216884e-05, + Hemi=0, ), - sorted_transcript_consequences=hl.Struct( - biotype=BIOTYPES, - consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, - lof_filter=LOF_FILTERS, + gnomad_genomes=hl.Struct( + AC=None, + 
AF=None, + AN=None, + Hom=None, + AF_POPMAX_OR_GLOBAL=None, + FAF_AF=None, + Hemi=None, ), ), - migrations=[], - updates=set(), - ), - ], - ) - self.assertCountEqual( - ht.collect(), - [ - hl.Struct( - locus=hl.Locus( - contig=1, - position=871269, - reference_genome='GRCh37', - ), - alleles=['A', 'C'], - cadd=hl.Struct(PHRED=9.699999809265137), - clinvar=hl.Struct( - alleleId=None, - conflictingPathogenicities=None, - goldStars=None, - pathogenicity_id=None, - assertion_ids=None, - submitters=None, - conditions=None, - ), - dbnsfp=hl.Struct( - REVEL_score=0.0430000014603138, - SIFT_score=None, - Polyphen2_HVAR_score=None, - MutationTaster_pred_id=0, - ), - eigen=hl.Struct(Eigen_phred=1.5880000591278076), - exac=hl.Struct( - AF_POPMAX=0.0004100881633348763, - AF=0.0004633000062312931, - AC_Adj=51, - AC_Het=51, - AC_Hom=0, - AC_Hemi=None, - AN_Adj=108288, - ), - gnomad_exomes=hl.Struct( - AF=0.00012876000255346298, - AN=240758, - AC=31, - Hom=0, - AF_POPMAX_OR_GLOBAL=0.0001119549197028391, - FAF_AF=9.315000352216884e-05, - Hemi=0, - ), - gnomad_genomes=None, - mpc=None, - primate_ai=None, - splice_ai=hl.Struct( - delta_score=0.029999999329447746, - splice_consequence_id=3, - ), - topmed=None, - hgmd=None, - ), - ], - ) + ], + ) diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py deleted file mode 100644 index 566337f2e..000000000 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py +++ /dev/null @@ -1,264 +0,0 @@ -import shutil -from typing import Any -from unittest import mock - -import hail as hl -import luigi - -import v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection -from v03_pipeline.lib.annotations.enums import CLINVAR_PATHOGENICITIES -from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, - DatasetType, - ReferenceDatasetCollection, - ReferenceGenome, - 
SampleType, -) -from v03_pipeline.lib.paths import ( - cached_reference_dataset_query_path, - valid_reference_dataset_collection_path, -) -from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS -from v03_pipeline.lib.reference_data.config import CONFIG -from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import ( - UpdatedCachedReferenceDatasetQuery, -) -from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask -from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase - -COMBINED_1_PATH = 'v03_pipeline/var/test/reference_data/test_combined_1.ht' -CLINVAR_CRDQ_PATH = ( - 'v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht' -) -TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' - -MOCK_CONFIG = { - 'gnomad_qc': { - '38': { - 'version': 'v3.1', - 'source_path': 'gs://gnomad/sample_qc/mt/genomes_v3.1/gnomad_v3.1_qc_mt_v2_sites_dense.mt', - 'custom_import': lambda *_: hl.Table.parallelize( - [ - { - 'locus': hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - }, - ], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - ), - key=['locus', 'alleles'], - globals=hl.Struct(), - ), - }, - }, - 'clinvar': { - '38': { - **CONFIG['clinvar']['38'], - 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct( - ALLELEID=hl.tint32, - CLNSIG=hl.tarray(hl.tstr), - CLNSIGCONF=hl.tarray(hl.tstr), - CLNREVSTAT=hl.tarray(hl.tstr), - ), - submitters=hl.tarray(hl.tstr), - conditions=hl.tarray(hl.tstr), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - version='2023-11-26', - ), - ), - }, - }, -} - - -class UpdatedCachedReferenceDatasetQueryTest(MockedDatarootTestCase): - @mock.patch.dict( - 
'v03_pipeline.lib.reference_data.compare_globals.CONFIG', - MOCK_CONFIG, - ) - @mock.patch( - 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.CONFIG', - MOCK_CONFIG, - ) - @mock.patch( - 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.HailTableTask', - ) - def test_gnomad_qc( - self, - mock_hailtabletask, - ) -> None: - """ - Given a crdq task for gnomad_qc, expect the crdq table to be created by querying the raw dataset. - """ - # raw dataset dependency exists - mock_hailtabletask.return_value = MockCompleteTask() - - worker = luigi.worker.Worker() - task = UpdatedCachedReferenceDatasetQuery( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.GNOMAD_QC, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=[], - project_remap_paths=[], - project_pedigree_paths=[], - skip_validation=True, - run_id='1', - ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) - - ht = hl.read_table(task.output().path) - self.assertCountEqual( - ht.collect(), - [ - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - ), - ], - ) - self.assertCountEqual( - ht.globals.collect(), - [ - hl.Struct( - paths=hl.Struct(gnomad_qc=CONFIG['gnomad_qc']['38']['source_path']), - versions=hl.Struct(gnomad_qc='v3.1'), - enums=hl.Struct(gnomad_qc=hl.Struct()), - ), - ], - ) - - @mock.patch.dict( - 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', - MOCK_CONFIG, - ) - @mock.patch.object( - v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection, - 'UpdatedReferenceDatasetCollectionTask', - ) - @mock.patch( - 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.CachedReferenceDatasetQuery.query', - ) - @mock.patch( - 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.clinvar_versions_equal', 
- ) - def test_clinvar( - self, - mock_clinvar_versions_equal, - mock_crdq_query, - mock_updated_rdc_task, - ) -> None: - """ - Given a crdq task where there exists a clinvar crdq table and a clinvar rdc table, - expect task to replace the clinvar crdq table with new version. - """ - mock_clinvar_versions_equal.return_value = True - - # rdc dependency exists - mock_updated_rdc_task.return_value = MockCompleteTask() - - # copy existing crdq to test path - # clinvar has version '2022-01-01' - shutil.copytree( - CLINVAR_CRDQ_PATH, - cached_reference_dataset_query_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - ), - ) - - # copy existing rdc to test path - # clinvar has version '2023-11-26' - shutil.copytree( - COMBINED_1_PATH, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.COMBINED, - ), - ) - - # mock the clinvar_path_variants query to something simpler for testing - def _clinvar_path_variants(table, **_: Any): - table = table.select_globals() - return table.select( - is_pathogenic=False, - is_likely_pathogenic=True, - ) - - mock_crdq_query.side_effect = _clinvar_path_variants - - worker = luigi.worker.Worker() - task = UpdatedCachedReferenceDatasetQuery( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=[], - project_remap_paths=[], - project_pedigree_paths=[], - skip_validation=True, - run_id='2', - ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) - - ht = hl.read_table(task.output().path) - self.assertCountEqual( - ht.collect(), - [ - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - is_pathogenic=False, - is_likely_pathogenic=True, - ), - ], - ) - 
self.assertCountEqual( - ht.globals.collect(), - [ - hl.Struct( - paths=hl.Struct( - clinvar=MOCK_CONFIG['clinvar']['38']['source_path'], - ), - enums=hl.Struct( - clinvar=hl.Struct( - pathogenicity=CLINVAR_PATHOGENICITIES, - assertion=CLINVAR_ASSERTIONS, - ), - ), - versions=hl.Struct( - clinvar='2023-11-26', # crdq table should have new clinvar version - ), - ), - ], - ) diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset.py new file mode 100644 index 000000000..e73f2db8b --- /dev/null +++ b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset.py @@ -0,0 +1,27 @@ +import luigi + +from luigi_pipeline.lib.hail_tasks import GCSorLocalTarget +from v03_pipeline.lib.paths import valid_reference_dataset_path +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset +from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( + BaseLoadingPipelineParams, +) +from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask + + +@luigi.util.inherits(BaseLoadingPipelineParams) +class UpdatedReferenceDatasetTask(BaseWriteTask): + reference_dataset: ReferenceDataset = luigi.EnumParameter( + enum=ReferenceDataset, + ) + + def output(self): + return GCSorLocalTarget( + valid_reference_dataset_path( + self.reference_genome, + self.reference_dataset, + ), + ) + + def create_table(self): + return self.reference_dataset.get_ht(self.reference_genome) diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py deleted file mode 100644 index bc19d39d5..000000000 --- a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py +++ /dev/null @@ -1,345 +0,0 @@ -import shutil -from unittest import mock -from unittest.mock import ANY - -import hail as hl -import luigi.worker - -from 
v03_pipeline.lib.annotations.enums import CLINVAR_PATHOGENICITIES -from v03_pipeline.lib.model import ( - DatasetType, - ReferenceDatasetCollection, - ReferenceGenome, - SampleType, -) -from v03_pipeline.lib.paths import valid_reference_dataset_collection_path -from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS -from v03_pipeline.lib.reference_data.config import CONFIG -from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import ( - UpdatedReferenceDatasetCollectionTask, -) -from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase - -COMBINED_2_PATH = 'v03_pipeline/var/test/reference_data/test_combined_2.ht' -TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' - -MOCK_PRIMATE_AI_DATASET_HT = hl.Table.parallelize( - [ - { - 'locus': hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - 'info': hl.Struct(score=0.25), - }, - ], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - info=hl.tstruct(score=hl.tfloat32), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - version='v0.3', - ), -) -MOCK_CADD_DATASET_HT = hl.Table.parallelize( - [ - { - 'locus': hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - 'PHRED': 1, - }, - ], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - PHRED=hl.tint32, - ), - key=['locus', 'alleles'], - globals=hl.Struct( - version='v1.6', - ), -) -MOCK_CONFIG = { - 'primate_ai': { - '38': { - 'version': 'v0.3', - 'source_path': 'gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.ht', - 'select': { - 'score': 'info.score', - }, - 'custom_import': lambda *_: MOCK_PRIMATE_AI_DATASET_HT, - }, - }, - 'cadd': { - '38': { - 'version': 'v1.6', - 'source_path': 'gs://seqr-reference-data/GRCh38/CADD/CADD_snvs_and_indels.v1.6.ht', - 'select': ['PHRED'], - 'custom_import': lambda 
*_: MOCK_CADD_DATASET_HT, - }, - }, - 'clinvar': { - '38': { - **CONFIG['clinvar']['38'], - 'custom_import': lambda *_: hl.Table.parallelize( - [ - { - 'locus': hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - 'alleles': ['A', 'C'], - 'rsid': '5', - 'info': hl.Struct( - ALLELEID=1, - CLNSIG=[ - 'Pathogenic/Likely_pathogenic/Pathogenic', - '_low_penetrance', - ], - CLNSIGCONF=[ - 'Pathogenic(8)|Likely_pathogenic(2)|Pathogenic', - '_low_penetrance(1)|Uncertain_significance(1)', - ], - CLNREVSTAT=['no_classifications_from_unflagged_records'], - ), - 'submitters': [ - 'OMIM', - 'Broad Institute Rare Disease Group, Broad Institute', - 'PreventionGenetics, part of Exact Sciences', - 'Invitae', - ], - 'conditions': [ - 'C3661900:not provided', - 'C0023264:Leigh syndrome', - 'na:FOXRED1-related condition', - 'C4748791:Mitochondrial complex 1 deficiency, nuclear type 19', - ], - }, - ], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - rsid=hl.tstr, - info=hl.tstruct( - ALLELEID=hl.tint32, - CLNSIG=hl.tarray(hl.tstr), - CLNSIGCONF=hl.tarray(hl.tstr), - CLNREVSTAT=hl.tarray(hl.tstr), - ), - submitters=hl.tarray(hl.tstr), - conditions=hl.tarray(hl.tstr), - ), - key=['locus', 'alleles'], - globals=hl.Struct( - version='2023-11-26', - ), - ), - }, - }, -} - - -class UpdatedReferenceDatasetCollectionTaskTest(MockedDatarootTestCase): - @mock.patch.dict( - 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', - MOCK_CONFIG, - ) - @mock.patch.dict( - 'v03_pipeline.lib.reference_data.dataset_table_operations.CONFIG', - MOCK_CONFIG, - ) - @mock.patch.object(ReferenceDatasetCollection, 'datasets') - @mock.patch( - 'v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection.clinvar_versions_equal', - ) - def test_update_task_with_empty_reference_data_table( - self, - mock_clinvar_versions_equal, - mock_rdc_datasets, - ) -> None: - """ - Given a new task with no existing reference dataset collection table, - 
expect the task to create a new reference dataset collection table for all datasets in the collection. - """ - mock_clinvar_versions_equal.return_value = True - mock_rdc_datasets.return_value = ['cadd', 'primate_ai', 'clinvar'] - worker = luigi.worker.Worker() - task = UpdatedReferenceDatasetCollectionTask( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - reference_dataset_collection=ReferenceDatasetCollection.COMBINED, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=[], - project_remap_paths=[], - project_pedigree_paths=[], - skip_validation=True, - run_id='2', - ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) - - ht = hl.read_table(task.output().path) - self.assertCountEqual( - ht.collect(), - [ - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - primate_ai=hl.Struct(score=0.25), - cadd=hl.Struct(PHRED=1), - clinvar=hl.Struct( - alleleId=1, - submitters=[ - 'OMIM', - 'Broad Institute Rare Disease Group, Broad Institute', - 'PreventionGenetics, part of Exact Sciences', - 'Invitae', - ], - conditions=[ - 'not provided', - 'Leigh syndrome', - 'FOXRED1-related condition', - 'Mitochondrial complex 1 deficiency, nuclear type 19', - ], - conflictingPathogenicities=[ - hl.Struct(pathogenicity_id=0, count=9), - hl.Struct(pathogenicity_id=5, count=2), - hl.Struct(pathogenicity_id=12, count=1), - ], - goldStars=0, - pathogenicity_id=1, - assertion_ids=[5], - ), - ), - ], - ) - self.assertEqual( - ht.globals.collect(), - [ - hl.Struct( - paths=hl.Struct( - primate_ai='gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.ht', - cadd='gs://seqr-reference-data/GRCh38/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', - ), - versions=hl.Struct( - primate_ai='v0.3', - cadd='v1.6', - clinvar='2023-11-26', - ), - enums=hl.Struct( 
- primate_ai=hl.Struct(), - cadd=hl.Struct(), - clinvar=hl.Struct( - pathogenicity=CLINVAR_PATHOGENICITIES, - assertion=CLINVAR_ASSERTIONS, - ), - ), - date=ANY, - ), - ], - ) - - @mock.patch.dict( - 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', - MOCK_CONFIG, - ) - @mock.patch.dict( - 'v03_pipeline.lib.reference_data.dataset_table_operations.CONFIG', - MOCK_CONFIG, - ) - @mock.patch.object(ReferenceDatasetCollection, 'datasets') - def test_update_task_with_existing_reference_dataset_collection_table( - self, - mock_rdc_datasets, - ) -> None: - """ - Given an existing reference dataset collection which contains only the primate_ai dataset and has globals: - Struct(paths=Struct(primate_ai='gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.ht'), - versions=Struct(primate_ai='v0.2'), - enums=Struct(primate_ai=Struct()), - date=ANY), - expect the task to update the existing reference dataset collection table with the new dataset (cadd), - new values for primate_ai, and update the globals with the new primate_ai dataset's globals and cadd's globals. 
- """ - # copy existing reference dataset collection (primate_ai only) in COMBINED_2_PATH to test path - shutil.copytree( - COMBINED_2_PATH, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.COMBINED, - ), - ) - - mock_rdc_datasets.return_value = ['cadd', 'primate_ai'] - worker = luigi.worker.Worker() - task = UpdatedReferenceDatasetCollectionTask( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - reference_dataset_collection=ReferenceDatasetCollection.COMBINED, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=[], - project_remap_paths=[], - project_pedigree_paths=[], - skip_validation=True, - run_id='2', - ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) - - ht = hl.read_table(task.output().path) - self.assertCountEqual( - ht.collect(), - [ - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - primate_ai=hl.Struct( - score=0.25, - ), # expect row in primate_ai to be updated from 0.5 to 0.25 - cadd=hl.Struct(PHRED=1), - ), - ], - ) - self.assertEqual( - ht.globals.collect(), - [ - hl.Struct( - paths=hl.Struct( - cadd='gs://seqr-reference-data/GRCh38/CADD/CADD_snvs_and_indels.v1.6.ht', - primate_ai='gs://seqr-reference-data/GRCh38/primate_ai/PrimateAI_scores_v0.2.liftover_grch38.ht', - ), - versions=hl.Struct( - cadd='v1.6', - primate_ai='v0.3', # expect primate_ai version to be updated - ), - enums=hl.Struct( - cadd=hl.Struct(), - primate_ai=hl.Struct(), - ), - date=ANY, - ), - ], - ) diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_query.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_query.py new file mode 100644 index 000000000..ff7db2db8 --- /dev/null +++ b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_query.py @@ -0,0 +1,54 @@ +import hail as hl +import luigi + +from 
luigi_pipeline.lib.hail_tasks import GCSorLocalTarget +from v03_pipeline.lib.paths import valid_reference_dataset_query_path +from v03_pipeline.lib.reference_datasets.reference_dataset import ( + ReferenceDatasetQuery, +) +from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( + BaseLoadingPipelineParams, +) +from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask +from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset import ( + UpdatedReferenceDatasetTask, +) + + +@luigi.util.inherits(BaseLoadingPipelineParams) +class UpdatedReferenceDatasetQueryTask(BaseWriteTask): + reference_dataset_query: ReferenceDatasetQuery = luigi.EnumParameter( + enum=ReferenceDatasetQuery, + ) + + # Reference Dataset Queries do not include version + # in the path to allow for simpler reading logic + # when they are used downstream by the hail search + # service. + def complete(self): + return super().complete() and hl.eval( + hl.read_table(self.output().path).version + == self.reference_dataset_query.version(self.reference_genome), + ) + + def requires(self): + return self.clone( + UpdatedReferenceDatasetTask, + reference_dataset=self.reference_dataset_query.requires, + ) + + def output(self): + return GCSorLocalTarget( + valid_reference_dataset_query_path( + self.reference_genome, + self.dataset_type, + self.reference_dataset_query, + ), + ) + + def create_table(self): + return self.reference_dataset_query.get_ht( + self.reference_genome, + self.dataset_type, + hl.read_table(self.input().path), + ) diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_query_test.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_query_test.py new file mode 100644 index 000000000..fae45c7ef --- /dev/null +++ b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_query_test.py @@ -0,0 +1,182 @@ +from unittest.mock import patch + +import hail as hl +import luigi +import responses + +from v03_pipeline.lib.misc.io 
import write +from v03_pipeline.lib.model.dataset_type import DatasetType +from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.paths import ( + valid_reference_dataset_path, + valid_reference_dataset_query_path, +) +from v03_pipeline.lib.reference_datasets.reference_dataset import ( + ReferenceDataset, + ReferenceDatasetQuery, +) +from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_query import ( + UpdatedReferenceDatasetQueryTask, +) +from v03_pipeline.lib.test.mock_clinvar_urls import mock_clinvar_urls +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + +GNOMAD_GENOMES_38_PATH = ( + 'v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht' +) + + +class UpdatedReferenceDatasetQueryTaskTest(MockedDatarootTestCase): + def setUp(self) -> None: + super().setUp() + # clinvar ReferenceDataset exists but is old + # clinvar_path ReferenceDatasetQuery dne + with patch.object( + ReferenceDataset, + 'version', + return_value='2021-01-01', + ): + write( + hl.Table.parallelize( + [ + { + 'locus': hl.Locus( + contig='chr1', + position=1, + reference_genome='GRCh38', + ), + 'alleles': ['A', 'C'], + }, + ], + hl.tstruct( + locus=hl.tlocus('GRCh38'), + alleles=hl.tarray(hl.tstr), + ), + key=['locus', 'alleles'], + globals=hl.Struct(version='2021-01-01'), + ), + valid_reference_dataset_path( + ReferenceGenome.GRCh38, + ReferenceDataset.clinvar, + ), + ) + + @responses.activate + def test_updated_query_and_dependency( + self, + ) -> None: + with mock_clinvar_urls(): + worker = luigi.worker.Worker() + task = UpdatedReferenceDatasetQueryTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + reference_dataset_query=ReferenceDatasetQuery.clinvar_path_variants, + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + clinvar_ht_path = valid_reference_dataset_path( + ReferenceGenome.GRCh38, + ReferenceDataset.clinvar, + ) + clinvar_ht = 
hl.read_table(clinvar_ht_path) + self.assertTrue('2024-11-11' in clinvar_ht_path) + self.assertEqual( + hl.eval(clinvar_ht.version), + '2024-11-11', + ) + self.assertTrue(hasattr(clinvar_ht, 'submitters')) + contigs = clinvar_ht.aggregate(hl.agg.collect_as_set(clinvar_ht.locus.contig)) + self.assertTrue( + 'chr1' in contigs, + ) + self.assertTrue( + 'chrM' in contigs, + ) + clinvar_path_ht_path = valid_reference_dataset_query_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + ReferenceDatasetQuery.clinvar_path_variants, + ) + clinvar_path_ht = hl.read_table(clinvar_path_ht_path) + self.assertEqual( + hl.eval(clinvar_path_ht.version), + '2024-11-11', + ) + self.assertTrue(hasattr(clinvar_path_ht, 'is_likely_pathogenic')) + contigs = clinvar_path_ht.aggregate( + hl.agg.collect_as_set(clinvar_path_ht.locus.contig), + ) + self.assertTrue( + 'chr1' in contigs, + ) + self.assertFalse( + 'chrM' in contigs, + ) + + @responses.activate + def test_updated_clinvar_query_and_dependency_mito( + self, + ) -> None: + with mock_clinvar_urls(): + worker = luigi.worker.Worker() + task = UpdatedReferenceDatasetQueryTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.MITO, + reference_dataset_query=ReferenceDatasetQuery.clinvar_path_variants, + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + clinvar_ht = hl.read_table( + valid_reference_dataset_path( + ReferenceGenome.GRCh38, + ReferenceDataset.clinvar, + ), + ) + self.assertEqual( + hl.eval(clinvar_ht.version), + '2024-11-11', + ) + clinvar_path_ht_path = valid_reference_dataset_query_path( + ReferenceGenome.GRCh38, + DatasetType.MITO, + ReferenceDatasetQuery.clinvar_path_variants, + ) + clinvar_path_ht = hl.read_table(clinvar_path_ht_path) + contigs = clinvar_path_ht.aggregate( + hl.agg.collect_as_set(clinvar_path_ht.locus.contig), + ) + self.assertFalse( + 'chr1' in contigs, + ) + self.assertTrue( + 'chrM' in contigs, + ) + + def test_updated_query_high_af_variants(self) -> 
None: + with patch.object( + ReferenceDataset, + 'path', + return_value=GNOMAD_GENOMES_38_PATH, + ): + worker = luigi.worker.Worker() + task = UpdatedReferenceDatasetQueryTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + reference_dataset_query=ReferenceDatasetQuery.high_af_variants, + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + high_af_variants_ht_path = valid_reference_dataset_query_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + ReferenceDatasetQuery.high_af_variants, + ) + high_af_variants_ht = hl.read_table(high_af_variants_ht_path) + self.assertEqual( + hl.eval(high_af_variants_ht.version), + '1.0', + ) + self.assertTrue(hasattr(high_af_variants_ht, 'is_gt_1_percent')) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 345966489..5ae628407 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -4,10 +4,10 @@ import hail as hl import luigi.worker +import responses from v03_pipeline.lib.annotations.enums import ( BIOTYPES, - CLINVAR_PATHOGENICITIES, FIVEUTR_CONSEQUENCES, LOF_FILTERS, MITOTIP_PATHOGENICITIES, @@ -22,17 +22,14 @@ from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.misc.validation import validate_expected_contig_frequency from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, DatasetType, - ReferenceDatasetCollection, ReferenceGenome, SampleType, ) from v03_pipeline.lib.paths import ( - cached_reference_dataset_query_path, - valid_reference_dataset_collection_path, + valid_reference_dataset_path, ) -from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset from 
v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( BaseUpdateVariantAnnotationsTableTask, ) @@ -40,8 +37,11 @@ from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import ( UpdateVariantAnnotationsTableWithNewSamplesTask, ) +from v03_pipeline.lib.test.mock_clinvar_urls import mock_clinvar_urls from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask -from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase +from v03_pipeline.lib.test.mocked_reference_datasets_testcase import ( + MockedReferenceDatasetsTestCase, +) from v03_pipeline.var.test.vep.mock_vep_data import MOCK_37_VEP_DATA, MOCK_38_VEP_DATA GRCH38_TO_GRCH37_LIFTOVER_REF_PATH = ( @@ -55,13 +55,6 @@ TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' TEST_PEDIGREE_4 = 'v03_pipeline/var/test/pedigrees/test_pedigree_4.tsv' TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv' -TEST_COMBINED_1 = 'v03_pipeline/var/test/reference_data/test_combined_1.ht' -TEST_COMBINED_37 = 'v03_pipeline/var/test/reference_data/test_combined_37.ht' -TEST_COMBINED_MITO_1 = 'v03_pipeline/var/test/reference_data/test_combined_mito_1.ht' -TEST_HGMD_1 = 'v03_pipeline/var/test/reference_data/test_hgmd_1.ht' -TEST_HGMD_37 = 'v03_pipeline/var/test/reference_data/test_hgmd_37.ht' -TEST_INTERVAL_1 = 'v03_pipeline/var/test/reference_data/test_interval_1.ht' -TEST_INTERVAL_MITO_1 = 'v03_pipeline/var/test/reference_data/test_interval_mito_1.ht' GENE_ID_MAPPING = { 'OR4F5': 'ENSG00000186092', @@ -85,140 +78,70 @@ TEST_RUN_ID = 'manual__2024-04-03' -@patch( - 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdatedReferenceDatasetCollectionTask', -) -@patch( - 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdateCachedReferenceDatasetQueries', -) -class UpdateVariantAnnotationsTableWithNewSamplesTaskTest(MockedDatarootTestCase): - def setUp(self) -> None: - super().setUp() - 
shutil.copytree( - TEST_COMBINED_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.COMBINED, - ), - ) - shutil.copytree( - TEST_COMBINED_37, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh37, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.COMBINED, - ), - ) - shutil.copytree( - TEST_HGMD_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.HGMD, - ), - ) - shutil.copytree( - TEST_HGMD_37, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh37, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.HGMD, - ), - ) - shutil.copytree( - TEST_INTERVAL_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.INTERVAL, - ), - ) - shutil.copytree( - TEST_COMBINED_MITO_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.MITO, - ReferenceDatasetCollection.COMBINED, - ), - ) - shutil.copytree( - TEST_INTERVAL_MITO_1, - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.MITO, - ReferenceDatasetCollection.INTERVAL, - ), - ) - +class UpdateVariantAnnotationsTableWithNewSamplesTaskTest( + MockedReferenceDatasetsTestCase, +): + @responses.activate @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) def test_missing_pedigree( self, mock_update_vat_with_rdc_task, - mock_update_crdqs_task, - mock_update_rdc_task, ) -> None: - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() - mock_update_vat_with_rdc_task.return_value = MockCompleteTask() - uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, - 
callset_path=TEST_SNV_INDEL_VCF, - project_guids=['R0113_test_project'], - project_remap_paths=[TEST_REMAP], - project_pedigree_paths=['bad_pedigree'], - skip_validation=True, - run_id=TEST_RUN_ID, - ) - worker = luigi.worker.Worker() - worker.add(uvatwns_task) - worker.run() - self.assertFalse(uvatwns_task.complete()) + with mock_clinvar_urls(): + mock_update_vat_with_rdc_task.return_value = MockCompleteTask() + uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=['bad_pedigree'], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + worker = luigi.worker.Worker() + worker.add(uvatwns_task) + worker.run() + self.assertFalse(uvatwns_task.complete()) + @responses.activate @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) - def test_missing_interval_reference( + def test_missing_interval_reference_dataset( self, - mock_update_vat_with_rdc_task, - mock_update_crdqs_task, - mock_update_rdc_task, + mock_update_vat_with_rd_task, ) -> None: - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() - mock_update_vat_with_rdc_task.return_value = MockCompleteTask() - shutil.rmtree( - valid_reference_dataset_collection_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.INTERVAL, - ), - ) - uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=['R0113_test_project'], - project_remap_paths=[TEST_REMAP], - project_pedigree_paths=[TEST_PEDIGREE_3], - skip_validation=True, - run_id=TEST_RUN_ID, - ) - 
worker = luigi.worker.Worker() - worker.add(uvatwns_task) - worker.run() - self.assertFalse(uvatwns_task.complete()) + with mock_clinvar_urls(): + mock_update_vat_with_rd_task.return_value = MockCompleteTask() + shutil.rmtree( + valid_reference_dataset_path( + ReferenceGenome.GRCh38, + ReferenceDataset.screen, + ), + ) + uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + worker = luigi.worker.Worker() + worker.add(uvatwns_task) + worker.run() + self.assertFalse(uvatwns_task.complete()) + @responses.activate @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch('v03_pipeline.lib.tasks.write_new_variants_table.Env') - @patch( - 'v03_pipeline.lib.tasks.validate_callset.UpdatedCachedReferenceDatasetQuery', - ) @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) @@ -236,17 +159,11 @@ def test_multiple_update_vat( mock_load_gencode_ensembl_to_refseq_id: Mock, mock_vep: Mock, mock_standard_contigs: Mock, - mock_update_vat_with_rdc_task: Mock, - mock_updated_cached_reference_dataset_query, + mock_update_vat_with_rd_task: Mock, mock_env: Mock, mock_register_alleles: Mock, - mock_update_crdqs_task, - mock_update_rdc_task: Mock, ) -> None: - mock_updated_cached_reference_dataset_query.return_value = MockCompleteTask() - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() - mock_update_vat_with_rdc_task.return_value = ( + mock_update_vat_with_rd_task.return_value = ( BaseUpdateVariantAnnotationsTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, @@ 
-348,337 +265,284 @@ def test_multiple_update_vat( ), key='locus', globals=hl.Struct( - paths=hl.Struct( - gnomad_genomes='gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht', - ), versions=hl.Struct( - gnomad_genomes='4.1', + gnomad_genomes='1.0', ), enums=hl.Struct( gnomad_genomes=hl.Struct(), ), ), ) - coding_and_noncoding_variants_ht.write( - cached_reference_dataset_query_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, - ), - ) - worker = luigi.worker.Worker() - uvatwns_task_3 = UpdateVariantAnnotationsTableWithNewSamplesTask( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=['R0113_test_project'], - project_remap_paths=[TEST_REMAP], - project_pedigree_paths=[TEST_PEDIGREE_3], - skip_validation=False, - run_id=TEST_RUN_ID, - ) - worker.add(uvatwns_task_3) - worker.run() - self.assertTrue(uvatwns_task_3.complete()) - ht = hl.read_table(uvatwns_task_3.output().path) - self.assertEqual(ht.count(), 30) - self.assertEqual( - [ - x - for x in ht.select( - 'gt_stats', - 'CAID', - ).collect() - if x.locus.position <= 871269 # noqa: PLR2004 - ], - [ - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - gt_stats=hl.Struct(AC=0, AN=6, AF=0.0, hom=0), - CAID='CA1', + + with mock_clinvar_urls(): + coding_and_noncoding_variants_ht.write( + valid_reference_dataset_path( + ReferenceGenome.GRCh38, + ReferenceDataset.gnomad_coding_and_noncoding, ), - ], - ) - self.assertEqual( - ht.globals.updates.collect(), - [ - { + overwrite=True, + ) + worker = luigi.worker.Worker() + + uvatwns_task_3 = UpdateVariantAnnotationsTableWithNewSamplesTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + 
project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + skip_validation=False, + run_id=TEST_RUN_ID, + ) + worker.add(uvatwns_task_3) + worker.run() + self.assertTrue(uvatwns_task_3.complete()) + ht = hl.read_table(uvatwns_task_3.output().path) + self.assertEqual(ht.count(), 30) + self.assertEqual( + [ + x + for x in ht.select( + 'gt_stats', + 'CAID', + ).collect() + if x.locus.position <= 871269 # noqa: PLR2004 + ], + [ hl.Struct( - callset=TEST_SNV_INDEL_VCF, - project_guid='R0113_test_project', - remap_pedigree_hash=hl.eval( - remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3), + locus=hl.Locus( + contig='chr1', + position=871269, + reference_genome='GRCh38', ), + alleles=['A', 'C'], + gt_stats=hl.Struct(AC=0, AN=6, AF=0.0, hom=0), + CAID='CA1', ), - }, - ], - ) - - # Ensure that new variants are added correctly to the table. - uvatwns_task_4 = UpdateVariantAnnotationsTableWithNewSamplesTask( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=['R0114_project4'], - project_remap_paths=[TEST_REMAP], - project_pedigree_paths=[TEST_PEDIGREE_4], - skip_validation=False, - run_id=TEST_RUN_ID + '-another-run', - ) - worker.add(uvatwns_task_4) - worker.run() - self.assertTrue(uvatwns_task_4.complete()) - ht = hl.read_table(uvatwns_task_4.output().path) - self.assertCountEqual( - [ - x - for x in ht.select( - 'cadd', - 'clinvar', - 'hgmd', - 'variant_id', - 'xpos', - 'gt_stats', - 'screen', - 'CAID', - ).collect() - if x.locus.position <= 878809 # noqa: PLR2004 - ], - [ - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=871269, - reference_genome='GRCh38', - ), - alleles=['A', 'C'], - cadd=hl.Struct(PHRED=2), - clinvar=hl.Struct( - alleleId=None, - conflictingPathogenicities=None, - goldStars=None, - pathogenicity_id=None, - assertion_ids=None, - submitters=None, - conditions=None, - ), - 
hgmd=hl.Struct( - accession='abcdefg', - class_id=3, - ), - variant_id='1-871269-A-C', - xpos=1000871269, - gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), - screen=hl.Struct(region_type_ids=[1]), - CAID='CA1', - ), - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=874734, - reference_genome='GRCh38', - ), - alleles=['C', 'T'], - cadd=None, - clinvar=None, - hgmd=None, - variant_id='1-874734-C-T', - xpos=1000874734, - gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), - screen=hl.Struct(region_type_ids=[]), - CAID='CA2', - ), - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=876499, - reference_genome='GRCh38', - ), - alleles=['A', 'G'], - cadd=None, - clinvar=None, - hgmd=None, - variant_id='1-876499-A-G', - xpos=1000876499, - gt_stats=hl.Struct(AC=31, AN=32, AF=0.96875, hom=15), - screen=hl.Struct(region_type_ids=[]), - CAID='CA3', - ), - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=878314, - reference_genome='GRCh38', - ), - alleles=['G', 'C'], - cadd=None, - clinvar=None, - hgmd=None, - variant_id='1-878314-G-C', - xpos=1000878314, - gt_stats=hl.Struct(AC=3, AN=32, AF=0.09375, hom=0), - screen=hl.Struct(region_type_ids=[]), - CAID='CA4', - ), - hl.Struct( - locus=hl.Locus( - contig='chr1', - position=878809, - reference_genome='GRCh38', - ), - alleles=['C', 'T'], - cadd=None, - clinvar=None, - hgmd=None, - variant_id='1-878809-C-T', - xpos=1000878809, - gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), - screen=hl.Struct(region_type_ids=[]), - CAID=None, - ), - ], - ) - self.assertCountEqual( - ht.filter( - ht.locus.position <= 878809, # noqa: PLR2004 - ).sorted_transcript_consequences.consequence_term_ids.collect(), - [ - [[9], [23, 26], [23, 13, 26]], - [[9], [23, 26], [23, 13, 26]], - [[9], [23, 26], [23, 13, 26]], - [[9], [23, 26], [23, 13, 26]], - [[9], [23, 26], [23, 13, 26]], - ], - ) - self.assertCountEqual( - ht.globals.collect(), - [ - hl.Struct( - updates={ + ], + ) + self.assertEqual( + 
ht.globals.updates.collect(), + [ + { hl.Struct( - callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + callset=TEST_SNV_INDEL_VCF, project_guid='R0113_test_project', remap_pedigree_hash=hl.eval( - remap_pedigree_hash( - TEST_REMAP, - TEST_PEDIGREE_3, - ), - ), - ), - hl.Struct( - callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', - project_guid='R0114_project4', - remap_pedigree_hash=hl.eval( - remap_pedigree_hash( - TEST_REMAP, - TEST_PEDIGREE_4, - ), + remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3), ), ), }, - paths=hl.Struct( - cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', - dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', - eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', - exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', - gnomad_exomes='gs://gcp-public-data--gnomad/release/4.1/ht/exomes/gnomad.exomes.v4.1.sites.ht', - gnomad_genomes='gs://gcp-public-data--gnomad/release/4.1/ht/genomes/gnomad.genomes.v4.1.sites.ht', - mpc='gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.ht', - primate_ai='gs://seqr-reference-data/GRCh37/primate_ai/PrimateAI_scores_v0.2.ht', - splice_ai='gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.ht', - topmed='gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.ht', - gnomad_non_coding_constraint='gs://seqr-reference-data/GRCh38/gnomad_nc_constraint/gnomad_non-coding_constraint_z_scores.ht', - screen='gs://seqr-reference-data/GRCh38/ccREs/GRCh38-ccREs.ht', - hgmd='gs://seqr-reference-data-private/GRCh38/HGMD/HGMD_Pro_2023.1_hg38.vcf.gz', - ), - versions=hl.Struct( - cadd='v1.6', - clinvar='2023-11-26', - dbnsfp='2.9.3', - eigen=None, - exac=None, - gnomad_exomes='4.1', - gnomad_genomes='4.1', - mpc=None, - primate_ai='v0.2', - splice_ai=None, - topmed=None, 
- gnomad_non_coding_constraint=None, - screen=None, - hgmd='HGMD_Pro_2023', - ), - migrations=[], - enums=hl.Struct( - cadd=hl.Struct(), + ], + ) + + # Ensure that new variants are added correctly to the table. + uvatwns_task_4 = UpdateVariantAnnotationsTableWithNewSamplesTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=['R0114_project4'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_4], + skip_validation=False, + run_id=TEST_RUN_ID + '-another-run', + ) + worker.add(uvatwns_task_4) + worker.run() + self.assertTrue(uvatwns_task_4.complete()) + ht = hl.read_table(uvatwns_task_4.output().path) + self.assertCountEqual( + [ + x + for x in ht.select( + 'clinvar', + 'hgmd', + 'variant_id', + 'xpos', + 'gt_stats', + 'screen', + 'CAID', + ).collect() + if x.locus.position <= 878809 # noqa: PLR2004 + ], + [ + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=871269, + reference_genome='GRCh38', + ), + alleles=['A', 'C'], clinvar=hl.Struct( - assertion=CLINVAR_ASSERTIONS, - pathogenicity=CLINVAR_PATHOGENICITIES, + alleleId=None, + conflictingPathogenicities=None, + goldStars=None, + pathogenicity_id=None, + assertion_ids=None, + submitters=None, + conditions=None, ), - dbnsfp=hl.Struct( - MutationTaster_pred=['D', 'A', 'N', 'P'], + hgmd=hl.Struct( + accession='abcdefg', + class_id=3, ), - eigen=hl.Struct(), - exac=hl.Struct(), - gnomad_exomes=hl.Struct(), - gnomad_genomes=hl.Struct(), - mpc=hl.Struct(), - primate_ai=hl.Struct(), - splice_ai=hl.Struct( - splice_consequence=[ - 'Acceptor gain', - 'Acceptor loss', - 'Donor gain', - 'Donor loss', - 'No consequence', - ], + variant_id='1-871269-A-C', + xpos=1000871269, + gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), + screen=hl.Struct(region_type_ids=[1]), + CAID='CA1', + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=874734, + reference_genome='GRCh38', 
), - topmed=hl.Struct(), - hgmd=hl.Struct( - **{'class': ['DM', 'DM?', 'DP', 'DFP', 'FP', 'R']}, + alleles=['C', 'T'], + clinvar=None, + hgmd=None, + variant_id='1-874734-C-T', + xpos=1000874734, + gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), + screen=hl.Struct(region_type_ids=[]), + CAID='CA2', + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=876499, + reference_genome='GRCh38', ), - gnomad_non_coding_constraint=hl.Struct(), - screen=hl.Struct( - region_type=[ - 'CTCF-bound', - 'CTCF-only', - 'DNase-H3K4me3', - 'PLS', - 'dELS', - 'pELS', - 'DNase-only', - 'low-DNase', - ], + alleles=['A', 'G'], + clinvar=None, + hgmd=None, + variant_id='1-876499-A-G', + xpos=1000876499, + gt_stats=hl.Struct(AC=31, AN=32, AF=0.96875, hom=15), + screen=hl.Struct(region_type_ids=[]), + CAID='CA3', + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=878314, + reference_genome='GRCh38', ), - sorted_motif_feature_consequences=hl.Struct( - consequence_term=MOTIF_CONSEQUENCE_TERMS, + alleles=['G', 'C'], + clinvar=None, + hgmd=None, + variant_id='1-878314-G-C', + xpos=1000878314, + gt_stats=hl.Struct(AC=3, AN=32, AF=0.09375, hom=0), + screen=hl.Struct(region_type_ids=[]), + CAID='CA4', + ), + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=878809, + reference_genome='GRCh38', ), - sorted_regulatory_feature_consequences=hl.Struct( - biotype=REGULATORY_BIOTYPES, - consequence_term=REGULATORY_CONSEQUENCE_TERMS, + alleles=['C', 'T'], + clinvar=None, + hgmd=None, + variant_id='1-878809-C-T', + xpos=1000878809, + gt_stats=hl.Struct(AC=1, AN=32, AF=0.03125, hom=0), + screen=hl.Struct(region_type_ids=[]), + CAID=None, + ), + ], + ) + self.assertCountEqual( + ht.filter( + ht.locus.position <= 878809, # noqa: PLR2004 + ).sorted_transcript_consequences.consequence_term_ids.collect(), + [ + [[9], [23, 26], [23, 13, 26]], + [[9], [23, 26], [23, 13, 26]], + [[9], [23, 26], [23, 13, 26]], + [[9], [23, 26], [23, 13, 26]], + [[9], [23, 26], [23, 13, 26]], + ], + ) 
+ self.assertCountEqual( + ht.globals.collect(), + [ + hl.Struct( + updates={ + hl.Struct( + callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + project_guid='R0113_test_project', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), + ), + hl.Struct( + callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + project_guid='R0114_project4', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_4, + ), + ), + ), + }, + versions=hl.Struct( + clinvar='2024-11-11', + dbnsfp='1.0', + eigen='1.0', + exac='1.0', + gnomad_exomes='1.0', + gnomad_genomes='1.0', + splice_ai='1.0', + topmed='1.0', + gnomad_non_coding_constraint='1.0', + screen='1.0', + hgmd='1.0', ), - sorted_transcript_consequences=hl.Struct( - biotype=BIOTYPES, - consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, - loftee=hl.Struct( - lof_filter=LOF_FILTERS, + migrations=[], + enums=hl.Struct( + clinvar=ReferenceDataset.clinvar.enum_globals, + dbnsfp=ReferenceDataset.dbnsfp.enum_globals, + eigen=hl.Struct(), + exac=hl.Struct(), + gnomad_exomes=hl.Struct(), + gnomad_genomes=hl.Struct(), + splice_ai=ReferenceDataset.splice_ai.enum_globals, + topmed=hl.Struct(), + hgmd=ReferenceDataset.hgmd.enum_globals, + gnomad_non_coding_constraint=hl.Struct(), + screen=ReferenceDataset.screen.enum_globals, + sorted_motif_feature_consequences=hl.Struct( + consequence_term=MOTIF_CONSEQUENCE_TERMS, ), - utrannotator=hl.Struct( - fiveutr_consequence=FIVEUTR_CONSEQUENCES, + sorted_regulatory_feature_consequences=hl.Struct( + biotype=REGULATORY_BIOTYPES, + consequence_term=REGULATORY_CONSEQUENCE_TERMS, + ), + sorted_transcript_consequences=hl.Struct( + biotype=BIOTYPES, + consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, + loftee=hl.Struct( + lof_filter=LOF_FILTERS, + ), + utrannotator=hl.Struct( + fiveutr_consequence=FIVEUTR_CONSEQUENCES, + ), ), ), ), - ), - ], - ) + ], + ) + @responses.activate 
@patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', @@ -687,14 +551,10 @@ def test_multiple_update_vat( def test_update_vat_grch37( self, mock_vep: Mock, - mock_update_vat_with_rdc_task: Mock, + mock_update_vat_with_rd_task: Mock, mock_register_alleles: Mock, - mock_update_crdqs_task, - mock_update_rdc_task: Mock, ) -> None: - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() - mock_update_vat_with_rdc_task.return_value = ( + mock_update_vat_with_rd_task.return_value = ( BaseUpdateVariantAnnotationsTableTask( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, @@ -702,151 +562,147 @@ def test_update_vat_grch37( ) mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_37_VEP_DATA) mock_register_alleles.side_effect = None - worker = luigi.worker.Worker() - uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=['R0113_test_project'], - project_remap_paths=[TEST_REMAP], - project_pedigree_paths=[TEST_PEDIGREE_3], - skip_validation=True, - run_id=TEST_RUN_ID, - ) - worker.add(uvatwns_task) - worker.run() - self.assertTrue(uvatwns_task.complete()) - ht = hl.read_table(uvatwns_task.output().path) - self.assertEqual(ht.count(), 30) - self.assertCountEqual( - ht.globals.paths.collect(), - [ + + with mock_clinvar_urls(ReferenceGenome.GRCh37): + worker = luigi.worker.Worker() + uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( + reference_genome=ReferenceGenome.GRCh37, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + 
project_pedigree_paths=[TEST_PEDIGREE_3], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + worker.add(uvatwns_task) + worker.run() + self.assertTrue(uvatwns_task.complete()) + ht = hl.read_table(uvatwns_task.output().path) + self.assertEqual(ht.count(), 30) + self.assertFalse(hasattr(ht, 'rg37_locus')) + self.assertEqual( + ht.collect()[0], hl.Struct( - cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', - dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', - eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', - exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', - hgmd='gs://seqr-reference-data-private/GRCh37/HGMD/HGMD_Pro_2023.1_hg19.vcf.gz', - gnomad_exomes='gs://gcp-public-data--gnomad/release/2.1.1/ht/exomes/gnomad.exomes.r2.1.1.sites.ht', - gnomad_genomes='gs://gcp-public-data--gnomad/release/2.1.1/ht/genomes/gnomad.genomes.r2.1.1.sites.ht', - mpc='gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.ht', - primate_ai='gs://seqr-reference-data/GRCh37/primate_ai/PrimateAI_scores_v0.2.ht', - splice_ai='gs://seqr-reference-data/GRCh37/spliceai/spliceai_scores.ht', - topmed='gs://seqr-reference-data/GRCh37/TopMed/bravo-dbsnp-all.removed_chr_prefix.liftunder_GRCh37.ht', - ), - ], - ) - self.assertFalse(hasattr(ht, 'rg37_locus')) - self.assertEqual( - ht.collect()[0], - hl.Struct( - locus=hl.Locus(contig=1, position=871269, reference_genome='GRCh37'), - alleles=['A', 'C'], - rsid=None, - variant_id='1-871269-A-C', - xpos=1000871269, - sorted_transcript_consequences=[ - hl.Struct( - amino_acids='S/L', - canonical=1, - codons='tCg/tTg', - gene_id='ENSG00000188976', - hgvsc='ENST00000327044.6:c.1667C>T', - hgvsp='ENSP00000317992.6:p.Ser556Leu', - transcript_id='ENST00000327044', - biotype_id=39, - consequence_term_ids=[9], - is_lof_nagnag=None, - lof_filter_ids=[0, 1], + 
locus=hl.Locus( + contig=1, + position=871269, + reference_genome='GRCh37', ), - hl.Struct( - amino_acids=None, - canonical=None, - codons=None, - gene_id='ENSG00000188976', - hgvsc='ENST00000477976.1:n.3114C>T', - hgvsp=None, - transcript_id='ENST00000477976', - biotype_id=38, - consequence_term_ids=[23, 26], - is_lof_nagnag=None, - lof_filter_ids=None, + alleles=['A', 'C'], + rsid=None, + variant_id='1-871269-A-C', + xpos=1000871269, + sorted_transcript_consequences=[ + hl.Struct( + amino_acids='S/L', + canonical=1, + codons='tCg/tTg', + gene_id='ENSG00000188976', + hgvsc='ENST00000327044.6:c.1667C>T', + hgvsp='ENSP00000317992.6:p.Ser556Leu', + transcript_id='ENST00000327044', + biotype_id=39, + consequence_term_ids=[9], + is_lof_nagnag=None, + lof_filter_ids=[0, 1], + ), + hl.Struct( + amino_acids=None, + canonical=None, + codons=None, + gene_id='ENSG00000188976', + hgvsc='ENST00000477976.1:n.3114C>T', + hgvsp=None, + transcript_id='ENST00000477976', + biotype_id=38, + consequence_term_ids=[23, 26], + is_lof_nagnag=None, + lof_filter_ids=None, + ), + hl.Struct( + amino_acids=None, + canonical=None, + codons=None, + gene_id='ENSG00000188976', + hgvsc='ENST00000483767.1:n.523C>T', + hgvsp=None, + transcript_id='ENST00000483767', + biotype_id=38, + consequence_term_ids=[23, 26], + is_lof_nagnag=None, + lof_filter_ids=None, + ), + ], + rg38_locus=hl.Locus( + contig='chr1', + position=935889, + reference_genome='GRCh38', ), - hl.Struct( - amino_acids=None, - canonical=None, - codons=None, - gene_id='ENSG00000188976', - hgvsc='ENST00000483767.1:n.523C>T', - hgvsp=None, - transcript_id='ENST00000483767', - biotype_id=38, - consequence_term_ids=[23, 26], - is_lof_nagnag=None, - lof_filter_ids=None, + clinvar=hl.Struct( + alleleId=None, + conflictingPathogenicities=None, + goldStars=None, + pathogenicity_id=None, + assertion_ids=None, + submitters=None, + conditions=None, ), - ], - rg38_locus=hl.Locus( - contig='chr1', - position=935889, - reference_genome='GRCh38', - ), 
- cadd=hl.Struct(PHRED=9.699999809265137), - clinvar=hl.Struct( - alleleId=None, - conflictingPathogenicities=None, - goldStars=None, - pathogenicity_id=None, - assertion_ids=None, - submitters=None, - conditions=None, - ), - eigen=hl.Struct(Eigen_phred=1.5880000591278076), - exac=hl.Struct( - AF_POPMAX=0.0004100881633348763, - AF=0.0004633000062312931, - AC_Adj=51, - AC_Het=51, - AC_Hom=0, - AC_Hemi=None, - AN_Adj=108288, - ), - gnomad_exomes=hl.Struct( - AF=0.00012876000255346298, - AN=240758, - AC=31, - Hom=0, - AF_POPMAX_OR_GLOBAL=0.0001119549197028391, - FAF_AF=9.315000352216884e-05, - Hemi=0, - ), - gnomad_genomes=None, - mpc=None, - primate_ai=None, - splice_ai=hl.Struct( - delta_score=0.029999999329447746, - splice_consequence_id=3, - ), - topmed=None, - dbnsfp=hl.Struct( - REVEL_score=0.0430000014603138, - SIFT_score=None, - Polyphen2_HVAR_score=None, - MutationTaster_pred_id=0, + eigen=hl.Struct(Eigen_phred=1.5880000591278076), + exac=hl.Struct( + AF_POPMAX=0.0004100881633348763, + AF=0.0004633000062312931, + AC_Adj=51, + AC_Het=51, + AC_Hom=0, + AC_Hemi=None, + AN_Adj=108288, + ), + gnomad_exomes=hl.Struct( + AF=0.00012876000255346298, + AN=240758, + AC=31, + Hom=0, + AF_POPMAX_OR_GLOBAL=0.0001119549197028391, + FAF_AF=9.315000352216884e-05, + Hemi=0, + ), + gnomad_genomes=hl.Struct( + AF=None, + AN=None, + AC=None, + Hom=None, + AF_POPMAX_OR_GLOBAL=None, + FAF_AF=None, + Hemi=None, + ), + splice_ai=hl.Struct( + delta_score=0.029999999329447746, + splice_consequence_id=3, + ), + topmed=hl.Struct(AC=None, AF=None, AN=None, Hom=None, Het=None), + dbnsfp=hl.Struct( + REVEL_score=0.0430000014603138, + SIFT_score=None, + Polyphen2_HVAR_score=None, + MutationTaster_pred_id=0, + CADD_phred=9.699999809265137, + MPC_score=None, + PrimateAI_score=None, + ), + hgmd=None, + gt_stats=hl.Struct(AC=0, AN=6, AF=0.0, hom=0), + CAID=None, ), - hgmd=None, - gt_stats=hl.Struct(AC=0, AN=6, AF=0.0, hom=0), - CAID=None, - ), - ) + ) + @responses.activate 
@patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) - @patch('v03_pipeline.lib.model.reference_dataset_collection.Env') + @patch('v03_pipeline.lib.reference_datasets.reference_dataset.Env') @patch('v03_pipeline.lib.vep.hl.vep') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id', @@ -855,177 +711,151 @@ def test_update_vat_without_accessing_private_datasets( self, mock_load_gencode_ensembl_to_refseq_id: Mock, mock_vep: Mock, - mock_rdc_env: Mock, - mock_update_vat_with_rdc_task: Mock, + mock_rd_env: Mock, + mock_update_vat_with_rd_task: Mock, mock_register_alleles: Mock, - mock_update_crdqs_task, - mock_update_rdc_task: Mock, ) -> None: mock_load_gencode_ensembl_to_refseq_id.return_value = hl.dict( {'ENST00000327044': 'NM_015658.4'}, ) - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() - mock_update_vat_with_rdc_task.return_value = ( + mock_update_vat_with_rd_task.return_value = ( BaseUpdateVariantAnnotationsTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, ) ) shutil.rmtree( - valid_reference_dataset_collection_path( + valid_reference_dataset_path( ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - ReferenceDatasetCollection.HGMD, + ReferenceDataset.hgmd, ), ) - mock_rdc_env.ACCESS_PRIVATE_REFERENCE_DATASETS = False + mock_rd_env.ACCESS_PRIVATE_REFERENCE_DATASETS = False mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA) mock_register_alleles.side_effect = None - worker = luigi.worker.Worker() - uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - sample_type=SampleType.WGS, - callset_path=TEST_SNV_INDEL_VCF, - project_guids=['R0113_test_project'], - 
project_remap_paths=[TEST_REMAP], - project_pedigree_paths=[TEST_PEDIGREE_3], - skip_validation=True, - run_id=TEST_RUN_ID, - ) - worker.add(uvatwns_task) - worker.run() - self.assertTrue(uvatwns_task.complete()) - ht = hl.read_table(uvatwns_task.output().path) - self.assertEqual(ht.count(), 30) - self.assertCountEqual( - ht.globals.versions.collect(), - [ - hl.Struct( - cadd='v1.6', - clinvar='2023-11-26', - dbnsfp='2.9.3', - eigen=None, - exac=None, - gnomad_exomes='4.1', - gnomad_genomes='4.1', - mpc=None, - primate_ai='v0.2', - splice_ai=None, - topmed=None, - gnomad_non_coding_constraint=None, - screen=None, - ), - ], - ) + with mock_clinvar_urls(): + worker = luigi.worker.Worker() + uvatwns_task = UpdateVariantAnnotationsTableWithNewSamplesTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + skip_validation=True, + run_id=TEST_RUN_ID, + ) + worker.add(uvatwns_task) + worker.run() + self.assertTrue(uvatwns_task.complete()) + ht = hl.read_table(uvatwns_task.output().path) + self.assertEqual(ht.count(), 30) + self.assertCountEqual( + ht.globals.versions.collect(), + [ + hl.Struct( + clinvar='2024-11-11', + dbnsfp='1.0', + eigen='1.0', + exac='1.0', + gnomad_exomes='1.0', + gnomad_genomes='1.0', + splice_ai='1.0', + topmed='1.0', + gnomad_non_coding_constraint='1.0', + screen='1.0', + ), + ], + ) + + @responses.activate @patch('v03_pipeline.lib.tasks.write_new_variants_table.register_alleles_in_chunks') @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.UpdateVariantAnnotationsTableWithUpdatedReferenceDataset', ) def test_mito_update_vat( self, - mock_update_vat_with_rdc_task: Mock, + mock_update_vat_with_rd_task: Mock, mock_register_alleles: Mock, - mock_update_crdqs_task, - mock_update_rdc_task: Mock, ) -> None: - 
mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() - mock_update_vat_with_rdc_task.return_value = ( + mock_update_vat_with_rd_task.return_value = ( BaseUpdateVariantAnnotationsTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, ) ) mock_register_alleles.side_effect = None - worker = luigi.worker.Worker() - update_variant_annotations_task = ( - UpdateVariantAnnotationsTableWithNewSamplesTask( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.MITO, - sample_type=SampleType.WGS, - callset_path=TEST_MITO_MT, - project_guids=['R0115_test_project2'], - project_remap_paths=['not_a_real_file'], - project_pedigree_paths=[TEST_PEDIGREE_5], - skip_validation=True, - run_id=TEST_RUN_ID, + + with mock_clinvar_urls(): + worker = luigi.worker.Worker() + update_variant_annotations_task = ( + UpdateVariantAnnotationsTableWithNewSamplesTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.MITO, + sample_type=SampleType.WGS, + callset_path=TEST_MITO_MT, + project_guids=['R0115_test_project2'], + project_remap_paths=['not_a_real_file'], + project_pedigree_paths=[TEST_PEDIGREE_5], + skip_validation=True, + run_id=TEST_RUN_ID, + ) ) - ) - worker.add(update_variant_annotations_task) - worker.run() - self.assertTrue(update_variant_annotations_task.complete()) - ht = hl.read_table(update_variant_annotations_task.output().path) - self.assertEqual(ht.count(), 5) - self.assertCountEqual( - ht.globals.collect(), - [ - hl.Struct( - paths=hl.Struct( - high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', - clinvar_mito='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', - dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', - gnomad_mito='gs://gcp-public-data--gnomad/release/3.1/ht/genomes/gnomad.genomes.v3.1.sites.chrM.ht', - 
helix_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix/HelixMTdb_20200327.ht', - hmtvar='gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht', - mitomap='gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht', - mitimpact='gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.1.3.ht', - local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', - ), - versions=hl.Struct( - high_constraint_region_mito='Feb-15-2022', - clinvar_mito='2023-07-22', - dbnsfp_mito='4.2', - gnomad_mito='v3.1', - helix_mito='20200327', - hmtvar='Jan. 10 2022', - mitomap='Feb. 04 2022', - mitimpact='3.1.3', - local_constraint_mito='2024-07-24', - ), - enums=hl.Struct( - high_constraint_region_mito=hl.Struct(), - local_constraint_mito=hl.Struct(), - clinvar_mito=hl.Struct( - assertion=CLINVAR_ASSERTIONS, - pathogenicity=CLINVAR_PATHOGENICITIES, - ), - dbnsfp_mito=hl.Struct( - MutationTaster_pred=['D', 'A', 'N', 'P'], + worker.add(update_variant_annotations_task) + worker.run() + self.assertTrue(update_variant_annotations_task.complete()) + ht = hl.read_table(update_variant_annotations_task.output().path) + self.assertEqual(ht.count(), 5) + self.assertCountEqual( + ht.globals.collect(), + [ + hl.Struct( + versions=hl.Struct( + clinvar='2024-11-11', + dbnsfp='1.0', + gnomad_mito='1.0', + helix_mito='1.0', + hmtvar='1.0', + mitomap='1.0', + mitimpact='1.0', + local_constraint_mito='1.0', ), - gnomad_mito=hl.Struct(), - helix_mito=hl.Struct(), - hmtvar=hl.Struct(), - mitomap=hl.Struct(), - mitimpact=hl.Struct(), - sorted_transcript_consequences=hl.Struct( - biotype=BIOTYPES, - consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, - lof_filter=LOF_FILTERS, + enums=hl.Struct( + local_constraint_mito=hl.Struct(), + clinvar=ReferenceDataset.clinvar.enum_globals, + dbnsfp=ReferenceDataset.dbnsfp.enum_globals, + gnomad_mito=hl.Struct(), + helix_mito=hl.Struct(), + 
hmtvar=hl.Struct(), + mitomap=hl.Struct(), + mitimpact=hl.Struct(), + sorted_transcript_consequences=hl.Struct( + biotype=BIOTYPES, + consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, + lof_filter=LOF_FILTERS, + ), + mitotip=hl.Struct(trna_prediction=MITOTIP_PATHOGENICITIES), ), - mitotip=hl.Struct(trna_prediction=MITOTIP_PATHOGENICITIES), - ), - migrations=[], - updates={ - hl.Struct( - callset='v03_pipeline/var/test/callsets/mito_1.mt', - project_guid='R0115_test_project2', - remap_pedigree_hash=hl.eval( - remap_pedigree_hash( - 'not_a_real_file', - TEST_PEDIGREE_5, + migrations=[], + updates={ + hl.Struct( + callset='v03_pipeline/var/test/callsets/mito_1.mt', + project_guid='R0115_test_project2', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + 'not_a_real_file', + TEST_PEDIGREE_5, + ), ), ), - ), - }, - ), - ], - ) - self.assertCountEqual( - ht.collect(), - [ + }, + ), + ], + ) + self.assertCountEqual( + ht.collect()[0], hl.Struct( locus=hl.Locus( contig='chrM', @@ -1035,7 +865,6 @@ def test_mito_update_vat( alleles=['T', 'C'], common_low_heteroplasmy=False, haplogroup=hl.Struct(is_defining=False), - high_constraint_region_mito=True, mitotip=hl.Struct(trna_prediction_id=None), rg37_locus=hl.Locus( contig='MT', @@ -1046,152 +875,8 @@ def test_mito_update_vat( sorted_transcript_consequences=None, variant_id='M-3-T-C', xpos=25000000003, - clinvar_mito=None, - dbnsfp_mito=None, - gnomad_mito=None, - helix_mito=None, - hmtvar=None, - mitomap=None, - mitimpact=None, - gt_stats=hl.Struct( - AC_het=1, - AF_het=0.25, - AC_hom=0, - AF_hom=0.0, - AN=4, - ), - local_constraint_mito=None, - ), - hl.Struct( - locus=hl.Locus( - contig='chrM', - position=8, - reference_genome='GRCh38', - ), - alleles=['G', 'T'], - common_low_heteroplasmy=False, - haplogroup=hl.Struct(is_defining=False), - high_constraint_region_mito=True, - mitotip=hl.Struct(trna_prediction_id=None), - rg37_locus=hl.Locus( - contig='MT', - position=8, - reference_genome='GRCh37', - ), - rsid=None, - 
sorted_transcript_consequences=None, - variant_id='M-8-G-T', - xpos=25000000008, - clinvar_mito=None, - dbnsfp_mito=None, - gnomad_mito=None, - helix_mito=None, - hmtvar=None, - mitomap=None, - mitimpact=None, - gt_stats=hl.Struct( - AC_het=1, - AF_het=0.25, - AC_hom=0, - AF_hom=0.0, - AN=4, - ), - local_constraint_mito=None, - ), - hl.Struct( - locus=hl.Locus( - contig='chrM', - position=12, - reference_genome='GRCh38', - ), - alleles=['T', 'C'], - common_low_heteroplasmy=False, - haplogroup=hl.Struct(is_defining=False), - high_constraint_region_mito=False, - mitotip=hl.Struct(trna_prediction_id=None), - rg37_locus=hl.Locus( - contig='MT', - position=12, - reference_genome='GRCh37', - ), - rsid=None, - sorted_transcript_consequences=None, - variant_id='M-12-T-C', - xpos=25000000012, - clinvar_mito=None, - dbnsfp_mito=None, - gnomad_mito=None, - helix_mito=None, - hmtvar=None, - mitomap=None, - mitimpact=None, - gt_stats=hl.Struct( - AC_het=1, - AF_het=0.25, - AC_hom=0, - AF_hom=0.0, - AN=4, - ), - local_constraint_mito=None, - ), - hl.Struct( - locus=hl.Locus( - contig='chrM', - position=16, - reference_genome='GRCh38', - ), - alleles=['A', 'T'], - common_low_heteroplasmy=False, - haplogroup=hl.Struct(is_defining=True), - high_constraint_region_mito=False, - mitotip=hl.Struct(trna_prediction_id=None), - rg37_locus=hl.Locus( - contig='MT', - position=16, - reference_genome='GRCh37', - ), - rsid='rs1556422363', - sorted_transcript_consequences=None, - variant_id='M-16-A-T', - xpos=25000000016, - clinvar_mito=None, - dbnsfp_mito=None, - gnomad_mito=None, - helix_mito=None, - hmtvar=None, - mitomap=None, - mitimpact=None, - gt_stats=hl.Struct( - AC_het=1, - AF_het=0.25, - AC_hom=0, - AF_hom=0.0, - AN=4, - ), - local_constraint_mito=None, - ), - hl.Struct( - locus=hl.Locus( - contig='chrM', - position=18, - reference_genome='GRCh38', - ), - alleles=['C', 'T'], - common_low_heteroplasmy=False, - haplogroup=hl.Struct(is_defining=False), - 
high_constraint_region_mito=False, - mitotip=hl.Struct(trna_prediction_id=None), - rg37_locus=hl.Locus( - contig='MT', - position=18, - reference_genome='GRCh37', - ), - rsid=None, - sorted_transcript_consequences=None, - variant_id='M-18-C-T', - xpos=25000000018, - clinvar_mito=None, - dbnsfp_mito=None, + clinvar=None, + dbnsfp=None, gnomad_mito=None, helix_mito=None, hmtvar=None, @@ -1206,8 +891,7 @@ def test_mito_update_vat( ), local_constraint_mito=None, ), - ], - ) + ) @patch( 'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_gene_symbol_to_gene_id', @@ -1215,11 +899,7 @@ def test_mito_update_vat( def test_sv_update_vat( self, mock_load_gencode: Mock, - mock_update_crdqs_task, - mock_update_rdc_task: Mock, ) -> None: - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() mock_load_gencode.return_value = GENE_ID_MAPPING worker = luigi.worker.Worker() update_variant_annotations_task = ( @@ -1249,7 +929,6 @@ def test_sv_update_vat( ht.globals.collect(), [ hl.Struct( - paths=hl.Struct(), versions=hl.Struct(), enums=hl.Struct( sv_type=SV_TYPES, @@ -1797,11 +1476,7 @@ def test_sv_update_vat( def test_gcnv_update_vat( self, - mock_update_crdqs_task, - mock_update_rdc_task, ) -> None: - mock_update_rdc_task.return_value = MockCompleteTask() - mock_update_crdqs_task.return_value = MockCompleteTask() worker = luigi.worker.Worker() update_variant_annotations_task = ( UpdateVariantAnnotationsTableWithNewSamplesTask( @@ -1830,7 +1505,6 @@ def test_gcnv_update_vat( ht.globals.collect(), [ hl.Struct( - paths=hl.Struct(), versions=hl.Struct(), enums=hl.Struct( sv_type=SV_TYPES, diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index 3b2077446..e5601875b 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -4,23 +4,24 @@ from v03_pipeline.lib.misc.validation import ( SeqrValidationError, - 
get_validation_dependencies, validate_allele_type, validate_expected_contig_frequency, validate_imputed_sex_ploidy, validate_no_duplicate_variants, validate_sample_type, ) -from v03_pipeline.lib.model import CachedReferenceDatasetQuery from v03_pipeline.lib.model.environment import Env from v03_pipeline.lib.paths import ( imported_callset_path, + sex_check_table_path, + valid_reference_dataset_path, ) +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget -from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import ( - UpdatedCachedReferenceDatasetQuery, +from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset import ( + UpdatedReferenceDatasetTask, ) from v03_pipeline.lib.tasks.write_imported_callset import WriteImportedCallsetTask from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask @@ -31,6 +32,28 @@ @luigi.util.inherits(BaseLoadingRunParams) class ValidateCallsetTask(BaseUpdateTask): + def get_validation_dependencies(self) -> dict[str, hl.Table]: + deps = {} + deps['coding_and_noncoding_variants_ht'] = hl.read_table( + valid_reference_dataset_path( + self.reference_genome, + ReferenceDataset.gnomad_coding_and_noncoding, + ), + ) + if ( + Env.CHECK_SEX_AND_RELATEDNESS + and self.dataset_type.check_sex_and_relatedness + and not self.skip_check_sex_and_relatedness + ): + deps['sex_check_ht'] = hl.read_table( + sex_check_table_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), + ) + return deps + def complete(self) -> luigi.Target: if super().complete(): mt = hl.read_matrix_table(self.output().path) @@ -57,8 +80,8 @@ def requires(self) -> list[luigi.Task]: *requirements, ( self.clone( - UpdatedCachedReferenceDatasetQuery, - 
crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, + UpdatedReferenceDatasetTask, + reference_dataset=ReferenceDataset.gnomad_coding_and_noncoding, ) ), ] @@ -98,9 +121,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: callset_path=self.callset_path, validated_sample_type=self.sample_type.value, ) - validation_dependencies = get_validation_dependencies( - **self.param_kwargs, - ) + validation_dependencies = self.get_validation_dependencies() for validation_f in [ validate_allele_type, validate_imputed_sex_ploidy, diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py index 991412824..8f3638376 100644 --- a/v03_pipeline/lib/tasks/validate_callset_test.py +++ b/v03_pipeline/lib/tasks/validate_callset_test.py @@ -1,29 +1,27 @@ import json import shutil -from unittest.mock import Mock, patch import luigi.worker from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, DatasetType, ReferenceGenome, SampleType, ) from v03_pipeline.lib.paths import ( - cached_reference_dataset_query_path, + valid_reference_dataset_path, ) +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset from v03_pipeline.lib.tasks.validate_callset import ( ValidateCallsetTask, ) from v03_pipeline.lib.tasks.write_validation_errors_for_run import ( WriteValidationErrorsForRunTask, ) -from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase -TEST_CODING_NONCODING_CRDQ_1 = ( - 'v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht' +TEST_CODING_AND_NONCODING_HT = ( + 'v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht' ) MULTIPLE_VALIDATION_EXCEPTIONS_VCF = ( 'v03_pipeline/var/test/callsets/multiple_validation_exceptions.vcf' @@ -36,22 +34,16 @@ class ValidateCallsetTest(MockedDatarootTestCase): def setUp(self) -> None: 
super().setUp() shutil.copytree( - TEST_CODING_NONCODING_CRDQ_1, - cached_reference_dataset_query_path( + TEST_CODING_AND_NONCODING_HT, + valid_reference_dataset_path( ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, + ReferenceDataset.gnomad_coding_and_noncoding, ), ) - @patch( - 'v03_pipeline.lib.tasks.validate_callset.UpdatedCachedReferenceDatasetQuery', - ) def test_validate_callset_multiple_exceptions( self, - mock_updated_cached_reference_dataset_query: Mock, ) -> None: - mock_updated_cached_reference_dataset_query.return_value = MockCompleteTask() worker = luigi.worker.Worker() validate_callset_task = ValidateCallsetTask( reference_genome=ReferenceGenome.GRCh38, diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index 344d003ae..441ff919e 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -5,25 +5,23 @@ import luigi.util from v03_pipeline.lib.annotations.fields import get_fields -from v03_pipeline.lib.annotations.rdc_dependencies import ( - get_rdc_annotation_dependencies, -) from v03_pipeline.lib.misc.allele_registry import register_alleles_in_chunks from v03_pipeline.lib.misc.callsets import get_callset_ht from v03_pipeline.lib.misc.io import checkpoint, remap_pedigree_hash from v03_pipeline.lib.misc.math import constrain from v03_pipeline.lib.model import ( Env, - ReferenceDatasetCollection, ) from v03_pipeline.lib.paths import ( new_variants_table_path, + valid_reference_dataset_path, variant_annotations_table_path, ) -from v03_pipeline.lib.reference_data.gencode.mapping_gene_ids import ( +from v03_pipeline.lib.reference_datasets.gencode.mapping_gene_ids import ( load_gencode_ensembl_to_refseq_id, load_gencode_gene_symbol_to_gene_id, ) +from v03_pipeline.lib.reference_datasets.reference_dataset import BaseReferenceDataset from 
v03_pipeline.lib.tasks.base.base_loading_run_params import ( BaseLoadingRunParams, ) @@ -49,7 +47,17 @@ class WriteNewVariantsTableTask(BaseWriteTask): @property def annotation_dependencies(self) -> dict[str, hl.Table]: - deps = get_rdc_annotation_dependencies(self.dataset_type, self.reference_genome) + deps = {} + for ( + reference_dataset + ) in BaseReferenceDataset.for_reference_genome_dataset_type_annotations( + self.reference_genome, + self.dataset_type, + ): + deps[f'{reference_dataset.value}_ht'] = hl.read_table( + valid_reference_dataset_path(self.reference_genome, reference_dataset), + ) + if self.dataset_type.has_gencode_ensembl_to_refseq_id_mapping( self.reference_genome, ): @@ -163,15 +171,26 @@ def create_table(self) -> hl.Table: ), ) - # Join new variants against the reference dataset collections that are not "annotated". - for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type( + # Join new variants against the reference datasets that are not "annotated". + for ( + reference_dataset + ) in BaseReferenceDataset.for_reference_genome_dataset_type_annotations( self.reference_genome, self.dataset_type, ): - if rdc.requires_annotation: + if reference_dataset.is_keyed_by_interval: continue - rdc_ht = self.annotation_dependencies[f'{rdc.value}_ht'] - new_variants_ht = new_variants_ht.join(rdc_ht, 'left') + reference_dataset_ht = self.annotation_dependencies[ + f'{reference_dataset.value}_ht' + ] + reference_dataset_ht = reference_dataset_ht.select( + **{ + f'{reference_dataset.name}': hl.Struct( + **reference_dataset_ht.row_value, + ), + }, + ) + new_variants_ht = new_variants_ht.join(reference_dataset_ht, 'left') # Register the new variant alleles to the Clingen Allele Registry # and annotate new_variants table with CAID. 
@@ -198,6 +217,7 @@ def create_table(self) -> hl.Table: new_variants_ht = new_variants_ht.join(ar_ht, 'left') elif self.dataset_type.should_send_to_allele_registry: new_variants_ht = new_variants_ht.annotate(CAID=hl.missing(hl.tstr)) + return new_variants_ht.select_globals( updates={ hl.Struct( diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table.py b/v03_pipeline/lib/tasks/write_relatedness_check_table.py index edfe0d716..a7056d7b2 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table.py @@ -3,15 +3,15 @@ import luigi.util from v03_pipeline.lib.methods.relatedness import call_relatedness -from v03_pipeline.lib.model import CachedReferenceDatasetQuery from v03_pipeline.lib.paths import ( relatedness_check_table_path, ) +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget -from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import ( - UpdatedCachedReferenceDatasetQuery, +from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset import ( + UpdatedReferenceDatasetTask, ) from v03_pipeline.lib.tasks.validate_callset import ValidateCallsetTask @@ -31,8 +31,8 @@ def requires(self): return [ self.clone(ValidateCallsetTask), self.clone( - UpdatedCachedReferenceDatasetQuery, - crdq=CachedReferenceDatasetQuery.GNOMAD_QC, + UpdatedReferenceDatasetTask, + reference_dataset=ReferenceDataset.gnomad_qc, ), ] diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py index c96ba9ecb..135710545 100644 --- a/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py +++ b/v03_pipeline/lib/tasks/write_relatedness_check_table_test.py @@ -1,60 
+1,40 @@ import shutil -from unittest import mock +from unittest.mock import patch import hail as hl import luigi.worker from v03_pipeline.lib.misc.io import import_vcf from v03_pipeline.lib.model import ( - CachedReferenceDatasetQuery, DatasetType, ReferenceGenome, SampleType, ) from v03_pipeline.lib.paths import ( - cached_reference_dataset_query_path, imported_callset_path, relatedness_check_table_path, + valid_reference_dataset_path, ) +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset from v03_pipeline.lib.tasks.write_relatedness_check_table import ( WriteRelatednessCheckTableTask, ) from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase -TEST_GNOMAD_QC_HT = 'v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht' +TEST_GNOMAD_QC_HT = 'v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht' TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' - TEST_RUN_ID = 'manual__2024-04-03' -MOCK_CONFIG = { - 'gnomad_qc': { - '38': { - 'version': '4.0', - 'source_path': TEST_GNOMAD_QC_HT, - 'custom_import': lambda *_: hl.Table.parallelize( - [], - hl.tstruct( - locus=hl.tlocus('GRCh38'), - alleles=hl.tarray(hl.tstr), - ), - key=['locus', 'alleles'], - ), - }, - }, -} - class WriteRelatednessCheckTableTaskTest(MockedDatarootTestCase): def setUp(self) -> None: super().setUp() - self.gnomad_qc_path = cached_reference_dataset_query_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - CachedReferenceDatasetQuery.GNOMAD_QC, - ) shutil.copytree( TEST_GNOMAD_QC_HT, - self.gnomad_qc_path, + valid_reference_dataset_path( + ReferenceGenome.GRCh38, + ReferenceDataset.gnomad_qc, + ), ) # Force imported callset to be complete @@ -69,48 +49,67 @@ def setUp(self) -> None: ), ) - @mock.patch.dict( - 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', - MOCK_CONFIG, - ) - @mock.patch.dict( - 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.CONFIG', - MOCK_CONFIG, - ) def 
test_relatedness_check_table_task_gnomad_qc_updated( self, ) -> None: - ht = hl.read_table( - self.gnomad_qc_path, - ) - self.assertEqual( - hl.eval(ht.versions.gnomad_qc), - 'v3.1', - ) - worker = luigi.worker.Worker() - task = WriteRelatednessCheckTableTask( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - run_id=TEST_RUN_ID, - sample_type=SampleType.WGS, - callset_path=TEST_VCF, - ) - worker.add(task) - worker.run() - self.assertTrue(task.complete()) - ht = hl.read_table(self.gnomad_qc_path) self.assertEqual( - hl.eval(ht.versions.gnomad_qc), - '4.0', - ) - ht = hl.read_table( - relatedness_check_table_path( - ReferenceGenome.GRCh38, - DatasetType.SNV_INDEL, - TEST_VCF, + hl.eval( + hl.read_table( + valid_reference_dataset_path( + ReferenceGenome.GRCh38, + ReferenceDataset.gnomad_qc, + ), + ).version, ), + '1.0', ) - self.assertEqual( - ht.collect(), - [], - ) + with patch.object( + ReferenceDataset, + 'version', + return_value='2.0', + ), patch.object( + ReferenceDataset, + 'get_ht', + lambda *_: hl.Table.parallelize( + [], + hl.tstruct( + locus=hl.tlocus('GRCh38'), + alleles=hl.tarray(hl.tstr), + ), + key=['locus', 'alleles'], + globals=hl.Struct(version='2.0'), + ), + ): + worker = luigi.worker.Worker() + task = WriteRelatednessCheckTableTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + run_id=TEST_RUN_ID, + sample_type=SampleType.WGS, + callset_path=TEST_VCF, + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + self.assertEqual( + hl.eval( + hl.read_table( + valid_reference_dataset_path( + ReferenceGenome.GRCh38, + ReferenceDataset.gnomad_qc, + ), + ).version, + ), + '2.0', + ) + ht = hl.read_table( + relatedness_check_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_VCF, + ), + ) + self.assertEqual( + ht.collect(), + [], + ) diff --git a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py 
b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py index 4a6b4baec..0ca2ee0ef 100644 --- a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py +++ b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py @@ -41,15 +41,15 @@ class WriteVariantAnnotationsVCFTest(MockedDatarootTestCase): 'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_gene_symbol_to_gene_id', ) @patch( - 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdateCachedReferenceDatasetQueries', + 'v03_pipeline.lib.tasks.base.base_update_variant_annotations_table.UpdatedReferenceDatasetQueryTask', ) def test_sv_export_vcf( self, - mock_update_crdqs_task: Mock, + mock_rd_query_task: Mock, mock_load_gencode: Mock, ) -> None: mock_load_gencode.return_value = GENE_ID_MAPPING - mock_update_crdqs_task.return_value = MockCompleteTask() + mock_rd_query_task.return_value = MockCompleteTask() worker = luigi.worker.Worker() update_variant_annotations_task = ( UpdateVariantAnnotationsTableWithNewSamplesTask( diff --git a/v03_pipeline/lib/test/mock_clinvar_urls.py b/v03_pipeline/lib/test/mock_clinvar_urls.py new file mode 100644 index 000000000..767fe6cf0 --- /dev/null +++ b/v03_pipeline/lib/test/mock_clinvar_urls.py @@ -0,0 +1,37 @@ +import gzip +import tempfile +from contextlib import contextmanager + +import pysam +import responses + +from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.reference_datasets.clinvar import CLINVAR_SUBMISSION_SUMMARY_URL +from v03_pipeline.lib.reference_datasets.reference_dataset import ( + ReferenceDataset, +) + +CLINVAR_VCF = 'v03_pipeline/var/test/reference_datasets/raw/clinvar.vcf' +CLINVAR_SUBMISSION_SUMMARY = ( + 'v03_pipeline/var/test/reference_datasets/raw/submission_summary.txt' +) + + +@contextmanager +def mock_clinvar_urls(reference_genome=ReferenceGenome.GRCh38): + with tempfile.NamedTemporaryFile( + suffix='.vcf.bgz', + ) as f1, open(CLINVAR_SUBMISSION_SUMMARY, 'rb') as f2: + 
responses.add_passthru('http://localhost') + # pysam is being used as it was the cleanest way to + # get a bgzip formatted file :/ + pysam.tabix_compress(CLINVAR_VCF, f1.name, force=True) + responses.get( + ReferenceDataset.clinvar.path(reference_genome), + body=f1.read(), + ) + responses.get( + CLINVAR_SUBMISSION_SUMMARY_URL, + body=gzip.compress(f2.read()), + ) + yield diff --git a/v03_pipeline/lib/test/mocked_reference_datasets_testcase.py b/v03_pipeline/lib/test/mocked_reference_datasets_testcase.py new file mode 100644 index 000000000..569996158 --- /dev/null +++ b/v03_pipeline/lib/test/mocked_reference_datasets_testcase.py @@ -0,0 +1,40 @@ +import os +import shutil + +import responses + +from v03_pipeline.lib.model.definitions import ReferenceGenome +from v03_pipeline.lib.paths import valid_reference_dataset_path +from v03_pipeline.lib.reference_datasets.reference_dataset import ReferenceDataset +from v03_pipeline.lib.test.mock_clinvar_urls import mock_clinvar_urls +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + +REFERENCE_DATASETS_PATH = 'v03_pipeline/var/test/reference_datasets' + + +class MockedReferenceDatasetsTestCase(MockedDatarootTestCase): + @responses.activate + def setUp(self) -> None: + super().setUp() + for reference_genome in ReferenceGenome: + with mock_clinvar_urls(reference_genome): + path = os.path.join( + REFERENCE_DATASETS_PATH, + reference_genome.value, + ) + # Use listdir, allowing for missing datasets + # in the tests. + for dataset_name in os.listdir( + path, + ): + # Copy the entire directory tree under + # the dataset name. 
+ shutil.copytree( + os.path.join(path, dataset_name), + os.path.dirname( + valid_reference_dataset_path( + reference_genome, + ReferenceDataset(dataset_name), + ), + ), + ) diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/.README.txt.crc deleted file mode 100644 index 0cd2ba4fd..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/.metadata.json.gz.crc deleted file mode 100644 index ae7d0b4c6..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/README.txt b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/README.txt deleted file mode 100644 index a72ec5dc8..000000000 --- a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
- Written with version 0.2.128-eead8100a1c1 - Created at 2024/05/10 15:28:58 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index d972cdba4..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/metadata.json.gz deleted file mode 100644 index 9a1b8b45d..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/parts/.part-0.crc deleted file mode 100644 index 5b13c230e..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/parts/part-0 deleted file mode 100644 index 874be1467..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/index/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.idx/.index.crc b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/index/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.idx/.index.crc deleted file mode 100644 index 92c93dceb..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/index/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.idx/.index.crc and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/index/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.idx/index b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/index/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.idx/index deleted file mode 100644 index 72b3e57dc..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/index/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.idx/index and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/metadata.json.gz deleted file mode 100644 index fcf1ca8c3..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index a5f8539c8..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/metadata.json.gz deleted file mode 100644 index 483a5fe14..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/parts/.part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.crc b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/parts/.part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.crc deleted file mode 100644 index 3003ab7a8..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/parts/.part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.crc and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/parts/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75 b/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/parts/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75 deleted file mode 100644 index b28df25c3..000000000 Binary files a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/rows/parts/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.README.txt.crc deleted file mode 100644 index add5a1942..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.metadata.json.gz.crc deleted file mode 100644 index 3a7d8101c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index a3c8757f4..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/metadata.json.gz deleted file mode 100644 index b6d39681d..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/metadata.json.gz and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/.part-0.crc deleted file mode 100644 index c96ad70c9..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/part-0 deleted file mode 100644 index bb1d53943..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/metadata.json.gz deleted file mode 100644 index 5aed747bc..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index 682fea6e7..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/metadata.json.gz deleted file mode 100644 index d37774da9..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/metadata.json.gz and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-9e75273d-7113-40e4-a327-453f3451dc8c.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-9e75273d-7113-40e4-a327-453f3451dc8c.crc deleted file mode 100644 index df78faed7..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-9e75273d-7113-40e4-a327-453f3451dc8c.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-9e75273d-7113-40e4-a327-453f3451dc8c b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-9e75273d-7113-40e4-a327-453f3451dc8c deleted file mode 100644 index 4bc78c56b..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-9e75273d-7113-40e4-a327-453f3451dc8c and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.README.txt.crc deleted file mode 100644 index e175e8da4..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.metadata.json.gz.crc deleted file mode 100644 index 5def68f7f..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index 92c2ee4f3..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/.metadata.json.gz.crc and 
/dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/metadata.json.gz deleted file mode 100644 index 26e678a01..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/.part-0.crc deleted file mode 100644 index 66c495184..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/part-0 deleted file mode 100644 index 31232639d..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/metadata.json.gz deleted file mode 100644 index 351b9c8a1..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index edeb97082..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/metadata.json.gz deleted file mode 100644 index 8ab2a9563..000000000 
Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/.part-0-3569201c-d630-43c4-9056-cbace806fe8d.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/.part-0-3569201c-d630-43c4-9056-cbace806fe8d.crc deleted file mode 100644 index dd555f553..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/.part-0-3569201c-d630-43c4-9056-cbace806fe8d.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/part-0-3569201c-d630-43c4-9056-cbace806fe8d b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/part-0-3569201c-d630-43c4-9056-cbace806fe8d deleted file mode 100644 index 446fb5491..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/part-0-3569201c-d630-43c4-9056-cbace806fe8d and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc deleted file mode 100644 index 2796480e9..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc deleted file mode 100644 index 5def68f7f..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt deleted file mode 100644 index 9b284affa..000000000 --- a/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -This folder 
comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.133-4c60fddb171a - Created at 2024/11/02 15:22:20 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index 92c2ee4f3..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/metadata.json.gz deleted file mode 100644 index 26e678a01..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/.part-0.crc deleted file mode 100644 index 66c495184..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/part-0 deleted file mode 100644 index 31232639d..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz deleted file mode 100644 index 351b9c8a1..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc 
b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index e7c96acca..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz deleted file mode 100644 index d2c7ccb1c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.crc deleted file mode 100644 index dd555f553..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2 b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2 deleted file mode 100644 index 446fb5491..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_2.ht/.README.txt.crc deleted file mode 100644 index 974f9ff55..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_2.ht/.metadata.json.gz.crc deleted file mode 100644 
index 522a8fe04..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_2.ht/README.txt deleted file mode 100644 index 743f636a9..000000000 --- a/v03_pipeline/var/test/reference_data/test_combined_2.ht/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.122-be9d88a80695 - Created at 2024/01/08 10:16:02 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index 5098753b6..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/metadata.json.gz deleted file mode 100644 index b6841a0e8..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/parts/.part-0.crc deleted file mode 100644 index fd11dbde5..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/parts/part-0 deleted file mode 100644 index 152082d8c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/globals/parts/part-0 and 
/dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_2.ht/metadata.json.gz deleted file mode 100644 index 8d61acb47..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index be3f140cc..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/metadata.json.gz deleted file mode 100644 index e79c13ca4..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/.part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.crc b/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/.part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.crc deleted file mode 100644 index e372e0ff1..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/.part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/.part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.crc b/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/.part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.crc deleted file mode 100644 index 9fe01ae3c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/.part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.crc and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408 b/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408 deleted file mode 100644 index 3a360f426..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c b/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c deleted file mode 100644 index dec0aea3c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_2.ht/rows/parts/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc deleted file mode 100644 index 394adb99d..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc deleted file mode 100644 index 6b72fb1f0..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index 58d41345e..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/metadata.json.gz 
b/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/metadata.json.gz deleted file mode 100644 index 5f82fdafe..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/.part-0.crc deleted file mode 100644 index 3181e5991..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/part-0 deleted file mode 100644 index 92eda86fb..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/.index.crc deleted file mode 100644 index 26d303267..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/.index.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/.metadata.json.gz.crc deleted file mode 100644 index 0e401dc36..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/.metadata.json.gz.crc and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/index b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/index deleted file mode 100644 index df93a68fe..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/index and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/metadata.json.gz deleted file mode 100644 index 9152e863f..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz deleted file mode 100644 index 91f89d511..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index 580630336..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz deleted file mode 100644 index c22d07b9f..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.crc deleted file mode 100644 index b4cce3d8b..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2 b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2 deleted file mode 100644 index 07ea95686..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc deleted file mode 100644 index 50813df17..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc deleted file mode 100644 index 2c25d63fb..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index 45f3552fc..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz deleted file mode 100644 index 9ecc64e4e..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc deleted file mode 100644 index ebb29bf6d..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 deleted file mode 100644 index 20df26721..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/.index.crc deleted file mode 100644 index e1e6a76c1..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/.index.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/.metadata.json.gz.crc deleted file mode 100644 index 592562995..000000000 Binary files 
a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/index b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/index deleted file mode 100644 index 3ba37303e..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/index and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/metadata.json.gz deleted file mode 100644 index 77fcac8ff..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.idx/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz deleted file mode 100644 index d280eb0bc..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index 30be10170..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz 
b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz deleted file mode 100644 index 502e21b2a..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.crc deleted file mode 100644 index b6a81226c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-f96f626e-c873-4613-a02b-88ee1e3f2923.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923 deleted file mode 100644 index 022cb6a7f..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-f96f626e-c873-4613-a02b-88ee1e3f2923 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/.README.txt.crc deleted file mode 100644 index ab521f2a7..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/.metadata.json.gz.crc deleted file mode 100644 index 4aaf4823c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/.metadata.json.gz.crc and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index 6cfdce10c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/metadata.json.gz deleted file mode 100644 index c4fac28d7..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/parts/.part-0.crc deleted file mode 100644 index 9f38e338a..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/parts/part-0 deleted file mode 100644 index a1d4bbde5..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/metadata.json.gz deleted file mode 100644 index a1a22ff87..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/metadata.json.gz and /dev/null 
differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index 60944c316..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/metadata.json.gz deleted file mode 100644 index 6e2cc7493..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/.README.txt.crc deleted file mode 100644 index 10a80e078..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/.metadata.json.gz.crc deleted file mode 100644 index 07a31a448..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/README.txt deleted file mode 100644 index 17101d717..000000000 --- a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
- Written with version 0.2.120-f00f916faf78 - Created at 2024/03/28 16:27:47 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index 96183ebc1..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/metadata.json.gz deleted file mode 100644 index 63a2ed3a6..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/parts/.part-0.crc deleted file mode 100644 index 001836877..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/parts/part-0 deleted file mode 100644 index 596900d49..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/metadata.json.gz deleted file mode 100644 index e1a91c3ff..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index 951c4fab8..000000000 Binary files 
a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/rows/metadata.json.gz deleted file mode 100644 index 8df73fea7..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/rows/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/.README.txt.crc deleted file mode 100644 index 0cf6165af..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/.metadata.json.gz.crc deleted file mode 100644 index 4d8d0ca9f..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/README.txt b/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/README.txt deleted file mode 100644 index 0da7c8796..000000000 --- a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
- Written with version 0.2.120-f00f916faf78 - Created at 2024/03/28 16:28:17 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index 0f75b1ff0..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/metadata.json.gz deleted file mode 100644 index a5bde9f31..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/parts/.part-0.crc deleted file mode 100644 index c16ad7d55..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/parts/part-0 deleted file mode 100644 index 9c7401d0d..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/metadata.json.gz deleted file mode 100644 index 83bfe37a0..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index db072ce74..000000000 
Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/rows/metadata.json.gz deleted file mode 100644 index 33aecb87c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/rows/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/.README.txt.crc deleted file mode 100644 index 7132fdfe3..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/.metadata.json.gz.crc deleted file mode 100644 index 2cacfc95c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index d68bbbd08..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/metadata.json.gz deleted file mode 100644 index c16ad768c..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/.part-0.crc deleted file 
mode 100644 index c56f0f37b..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/part-0 deleted file mode 100644 index 7fc519095..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_1.ht/metadata.json.gz deleted file mode 100644 index 2654291a9..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index 017eef36d..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/metadata.json.gz deleted file mode 100644 index 668d507fd..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.crc b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.crc deleted file mode 100644 index a4b13f78f..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/.part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.crc and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683 b/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683 deleted file mode 100644 index 1d5c39801..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_1.ht/rows/parts/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/.README.txt.crc deleted file mode 100644 index 6f6856dcc..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/.README.txt.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/.metadata.json.gz.crc deleted file mode 100644 index 0cfa73ca9..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/README.txt deleted file mode 100644 index 6e5794f88..000000000 --- a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/README.txt +++ /dev/null @@ -1,3 +0,0 @@ -This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
- Written with version 0.2.114-cc8d36408b36 - Created at 2023/07/25 00:52:16 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/.metadata.json.gz.crc deleted file mode 100644 index 69cec0890..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/metadata.json.gz deleted file mode 100644 index 737528952..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/parts/.part-0.crc deleted file mode 100644 index 58a55f6f4..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/parts/.part-0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/parts/part-0 deleted file mode 100644 index 785c16ba7..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/globals/parts/part-0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/.index.crc deleted file mode 100644 index 24103e0df..000000000 Binary files 
a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/.index.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/.metadata.json.gz.crc deleted file mode 100644 index 4af1b141a..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/index b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/index deleted file mode 100644 index 9755f9219..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/index and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/metadata.json.gz deleted file mode 100644 index 81af190be..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/index/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.idx/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/metadata.json.gz deleted file mode 100644 index cd0503811..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/metadata.json.gz and /dev/null differ diff --git 
a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/.metadata.json.gz.crc deleted file mode 100644 index 3b2baab10..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/.metadata.json.gz.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/metadata.json.gz deleted file mode 100644 index e131355c9..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/metadata.json.gz and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/parts/.part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.crc b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/parts/.part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.crc deleted file mode 100644 index 55bcdaf13..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/parts/.part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0.crc and /dev/null differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/parts/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0 b/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/parts/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0 deleted file mode 100644 index 3bb0e2240..000000000 Binary files a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/rows/parts/part-0-271a7dfb-7fc1-4e43-ac16-af1cf05d0ae0 and /dev/null differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/.README.txt.crc new file mode 100644 index 000000000..0e7ebec47 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/.README.txt.crc differ diff --git 
a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..a91293ea4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/README.txt similarity index 78% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt rename to v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/README.txt index ae1371f9b..a31bb6640 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
Written with version 0.2.132-678e1f52b999 - Created at 2024/11/07 10:28:31 \ No newline at end of file + Created at 2024/11/21 18:34:48 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..28013b1bc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/metadata.json.gz new file mode 100644 index 000000000..1a57f3c95 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..4a2b0ad21 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/parts/part-0 new file mode 100644 index 000000000..1abb6a203 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/.index.crc new file mode 100644 index 000000000..3dfccd27d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..f5eb925f2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/index new file mode 100644 index 000000000..e543726c1 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/metadata.json.gz new file mode 100644 index 000000000..2d8bd5ec5 Binary files /dev/null 
and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/index/part-0-16d3574b-02c6-4ade-8054-836f2bbce002.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/metadata.json.gz new file mode 100644 index 000000000..3e5b386da Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..05c8d99a0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/metadata.json.gz new file mode 100644 index 000000000..091147b8f Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/parts/.part-0-16d3574b-02c6-4ade-8054-836f2bbce002.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/parts/.part-0-16d3574b-02c6-4ade-8054-836f2bbce002.crc new file mode 100644 index 000000000..c06893585 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/parts/.part-0-16d3574b-02c6-4ade-8054-836f2bbce002.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/parts/part-0-16d3574b-02c6-4ade-8054-836f2bbce002 
b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/parts/part-0-16d3574b-02c6-4ade-8054-836f2bbce002 new file mode 100644 index 000000000..a0f3274ab Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/clinvar/2024-11-11.ht/rows/parts/part-0-16d3574b-02c6-4ade-8054-836f2bbce002 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..b308c3ea6 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..4df064edb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/README.txt new file mode 100644 index 000000000..706fac138 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 18:28:11 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..9fda0e926 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..ff78cc258 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..097026b06 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..0e97c1b5e Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/globals/parts/part-0 differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/.index.crc new file mode 100644 index 000000000..1f307251a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..f5eb925f2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/index new file mode 100644 index 000000000..346dfe243 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/metadata.json.gz new file mode 100644 index 000000000..2d8bd5ec5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/index/part-0-67410585-d883-48cc-8d33-933fff287418.idx/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..841e8d0f9 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..d3fa58d4f Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..073f05922 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/parts/.part-0-67410585-d883-48cc-8d33-933fff287418.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/parts/.part-0-67410585-d883-48cc-8d33-933fff287418.crc new file mode 100644 index 000000000..6ad8acdbf Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/parts/.part-0-67410585-d883-48cc-8d33-933fff287418.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/parts/part-0-67410585-d883-48cc-8d33-933fff287418 b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/parts/part-0-67410585-d883-48cc-8d33-933fff287418 new file mode 100644 index 000000000..95bff341d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/dbnsfp/1.0.ht/rows/parts/part-0-67410585-d883-48cc-8d33-933fff287418 differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..2f0fb4487 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..5a70299a2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/README.txt new file mode 100644 index 000000000..2fe7da443 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 18:02:58 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/globals/parts/part-0 differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/.index.crc new file mode 100644 index 000000000..0822dac8b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..f5eb925f2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/index new file mode 100644 index 000000000..73b40e8a5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/metadata.json.gz new file mode 100644 index 000000000..2d8bd5ec5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/index/part-0-04c0af8a-a562-4e97-a303-1047deca5f45.idx/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..c71597e74 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..3c794c785 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..6e0dec6fe Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/parts/.part-0-04c0af8a-a562-4e97-a303-1047deca5f45.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/parts/.part-0-04c0af8a-a562-4e97-a303-1047deca5f45.crc new file mode 100644 index 000000000..80f391d3e Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/parts/.part-0-04c0af8a-a562-4e97-a303-1047deca5f45.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/parts/part-0-04c0af8a-a562-4e97-a303-1047deca5f45 b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/parts/part-0-04c0af8a-a562-4e97-a303-1047deca5f45 new file mode 100644 index 000000000..0e47e7016 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/eigen/1.0.ht/rows/parts/part-0-04c0af8a-a562-4e97-a303-1047deca5f45 differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..f971c0853 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..0d0c855af Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/README.txt new file mode 100644 index 000000000..3d248eeaf --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 18:12:31 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/.index.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/.index.crc new file mode 100644 index 000000000..231013f32 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..f5eb925f2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/index new file mode 100644 index 000000000..4d7f6aa63 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/metadata.json.gz new file mode 100644 index 000000000..2d8bd5ec5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/index/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/metadata.json.gz new file mode 100644 index 
000000000..db192de45 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..a71b9e605 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..5cd86ef8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/parts/.part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/parts/.part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.crc new file mode 100644 index 000000000..0e6bf3ecb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/parts/.part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/parts/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7 b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/parts/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7 new file mode 100644 index 000000000..ccdb1b148 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/exac/1.0.ht/rows/parts/part-0-dc3793f5-157b-42ff-8a87-4e367441c4b7 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/.README.txt.crc new file mode 100644 index 
000000000..210b5569a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_2.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..43f814d05 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/README.txt similarity index 78% rename from v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/README.txt index f5927612a..cf6e4ffb5 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
Written with version 0.2.133-4c60fddb171a - Created at 2024/11/02 13:18:45 \ No newline at end of file + Created at 2024/11/23 20:34:42 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_2.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..036f69746 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..40495e5c5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..5d34707bb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/parts/part-0 
b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..8318eee0b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/index/part-0-595a2be1-bb68-41eb-8367-dc7333299edc.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/index/part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_37.ht/index/part-0-595a2be1-bb68-41eb-8367-dc7333299edc.idx/.index.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/index/part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/index/part-0-595a2be1-bb68-41eb-8367-dc7333299edc.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/index/part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_37.ht/index/part-0-595a2be1-bb68-41eb-8367-dc7333299edc.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/index/part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/index/part-0-595a2be1-bb68-41eb-8367-dc7333299edc.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/index/part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_37.ht/index/part-0-595a2be1-bb68-41eb-8367-dc7333299edc.idx/index rename to 
v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/index/part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/index/part-0-595a2be1-bb68-41eb-8367-dc7333299edc.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/index/part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_37.ht/index/part-0-595a2be1-bb68-41eb-8367-dc7333299edc.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/index/part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..44bf87473 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..f14ed6e1a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..4745a0f5d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/rows/metadata.json.gz differ 
diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/rows/parts/.part-0-595a2be1-bb68-41eb-8367-dc7333299edc.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/rows/parts/.part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_37.ht/rows/parts/.part-0-595a2be1-bb68-41eb-8367-dc7333299edc.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/rows/parts/.part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7.crc diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/rows/parts/part-0-595a2be1-bb68-41eb-8367-dc7333299edc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/rows/parts/part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7 similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_37.ht/rows/parts/part-0-595a2be1-bb68-41eb-8367-dc7333299edc rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_coding_and_noncoding/1.0.ht/rows/parts/part-0-690f60f1-5897-4a95-9d74-fce92d3e5de7 diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..7161b45fd Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_37.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/.metadata.json.gz.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..87b299b99 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/README.txt new file mode 100644 index 000000000..8ac063914 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 18:22:28 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_37.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/.index.crc new file mode 100644 index 000000000..1d4cef495 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..f5eb925f2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/index 
b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/index new file mode 100644 index 000000000..17abb0bb1 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/metadata.json.gz new file mode 100644 index 000000000..2d8bd5ec5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/index/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..ef51904e0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..96657c79c Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..782f0d8ff Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/parts/.part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/parts/.part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.crc new file mode 100644 index 000000000..d293389ed Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/parts/.part-0-5419bf36-548c-4524-b44c-cd77ed3f191e.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/parts/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/parts/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e new file mode 100644 index 000000000..14036f84e Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_exomes/1.0.ht/rows/parts/part-0-5419bf36-548c-4524-b44c-cd77ed3f191e differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..41c7119f5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..87b299b99 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/README.txt new file mode 100644 index 000000000..c67c41892 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 18:24:04 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 
000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/.index.crc new file mode 100644 index 000000000..3dfccd27d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..f5eb925f2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/index new file mode 100644 index 000000000..e543726c1 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/metadata.json.gz new file mode 100644 index 000000000..2d8bd5ec5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/index/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..ef51904e0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..73b9be35b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..d6d0a81ca Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/parts/.part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/parts/.part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.crc new file mode 100644 index 000000000..c06893585 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/parts/.part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/parts/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8 b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/parts/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8 new file mode 100644 index 000000000..a0f3274ab Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_genomes/1.0.ht/rows/parts/part-0-ef7f1a2e-5a3b-443d-992c-32cbd5d9ceb8 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..04a6ed182 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..3619124af Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/.metadata.json.gz.crc differ diff --git 
a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/README.txt similarity index 78% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/README.txt index 9aea8fa4b..97552d24a 100644 --- a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.133-4c60fddb171a - Created at 2024/11/02 13:12:12 \ No newline at end of file + Created at 2024/11/23 12:20:19 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..ed78b2892 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..10338f5f2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/.index.crc new file mode 100644 index 000000000..748a41603 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..7b9ae4ad7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/index new 
file mode 100644 index 000000000..73ad7946b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/metadata.json.gz new file mode 100644 index 000000000..5f7a34128 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/index/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..74c5ccad0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..294202669 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..8fbf7562a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/parts/.part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/parts/.part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.crc new file mode 100644 index 000000000..487e14e81 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/parts/.part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/parts/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/parts/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b new file mode 100644 index 000000000..6b0630803 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/gnomad_qc/1.0.ht/rows/parts/part-0-60dc0150-c0ed-4ee2-aa12-a4459d0ae33b differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..b0c586c92 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_1.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..a90712850 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/README.txt new file mode 100644 index 
000000000..1cbf42ffc --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 18:19:21 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_1.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..83b98791e Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..f4a48b8e4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..9c24a2765 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..50fd51ef8 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/.index.crc new file mode 100644 index 000000000..13f00a251 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..d05ce69d6 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/index new file mode 100644 index 000000000..0a9ded614 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/metadata.json.gz new file mode 100644 index 000000000..f4cbe64c4 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/index/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..a9d578a80 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..4ed1c5fa3 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..f90d95be0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/parts/.part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/parts/.part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.crc new file mode 100644 index 000000000..c2966988d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/parts/.part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/parts/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893 b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/parts/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893 new file mode 100644 index 000000000..7e0d00872 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh37/hgmd/1.0.ht/rows/parts/part-0-182502ba-0456-4d1b-a8ac-1cdd20cfa893 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..cb2b959a1 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_37.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..3f7cea528 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/README.txt new file mode 100644 index 000000000..72aa75089 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 18:12:44 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_37.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_37.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..054449578 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..98990f90a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..131000afe Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..8585d729b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/globals/parts/part-0 differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/.index.crc new file mode 100644 index 000000000..7d6b96374 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..f5eb925f2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/index new file mode 100644 index 000000000..c05ecffd0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/metadata.json.gz new file mode 100644 index 000000000..2d8bd5ec5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/index/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.idx/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..71fc4e765 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..ccd127531 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..1a0ea5cb8 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/parts/.part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/parts/.part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.crc new file mode 100644 index 000000000..412869f0e Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/parts/.part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/parts/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/parts/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e new file mode 100644 index 000000000..6dc8f7d47 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/splice_ai/1.0.ht/rows/parts/part-0-592864e3-2b8f-4984-b6ac-79d57ab6be5e 
differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..3a1d6c82b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_interval_1.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..8c7544dcb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/README.txt new file mode 100644 index 000000000..664e0a800 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 18:15:58 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_interval_1.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/globals/parts/part-0 differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/.index.crc new file mode 100644 index 000000000..93f7867ef Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..86fea9936 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/index new file mode 100644 index 000000000..eed268fe3 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/metadata.json.gz new file mode 100644 index 000000000..aaacf6d64 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/index/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.idx/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..9b30f2bd7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..8f4c9c0ad Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..215767246 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/parts/.part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.crc b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/parts/.part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.crc new file mode 100644 index 000000000..2b9965745 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/parts/.part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/parts/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1 b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/parts/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1 new file mode 100644 index 000000000..cbff0bee7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh37/topmed/1.0.ht/rows/parts/part-0-c09ec7db-1671-4dc3-95d4-6426532e00f1 differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/.README.txt.crc new file mode 100644 index 000000000..0d08749ac Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/._SUCCESS.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/._SUCCESS.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/._SUCCESS.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..101771435 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/README.txt new file mode 100644 index 000000000..4341aff5f --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 17:40:02 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/_SUCCESS similarity index 100% rename from v03_pipeline/var/test/reference_data/test_interval_mito_1.ht/_SUCCESS rename to v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/_SUCCESS diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..28013b1bc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/metadata.json.gz new file mode 100644 index 000000000..1a57f3c95 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..4a2b0ad21 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/parts/part-0 new file mode 100644 index 000000000..1abb6a203 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/globals/parts/part-0 
differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.idx/.index.crc new file mode 100644 index 000000000..88704d58d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/index/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/index/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.idx/index new file mode 100644 index 000000000..2cbc16fbb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.idx/index differ diff --git a/v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/index/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.idx/metadata.json.gz similarity index 100% rename from 
v03_pipeline/var/test/reference_data/gnomad_qc_crdq.ht/index/part-0-fc4518f0-e0cb-4157-b60d-b6ab4c5f4a75.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/.index.crc new file mode 100644 index 000000000..d981b2bc0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..cc8b04991 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/index new file mode 100644 index 000000000..a167f2c9d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/metadata.json.gz 
b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/metadata.json.gz new file mode 100644 index 000000000..4714eaea2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/index/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/metadata.json.gz new file mode 100644 index 000000000..60c00d49c Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..d0ca49c72 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/metadata.json.gz new file mode 100644 index 000000000..448191879 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/.part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/.part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.crc new file mode 100644 index 000000000..457206bd3 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/.part-0-a71ea1dc-61b1-4cba-985b-155a977bebff.crc differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/.part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/.part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.crc new file mode 100644 index 000000000..1050557d2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/.part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff new file mode 100644 index 000000000..cca9ee963 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/part-0-a71ea1dc-61b1-4cba-985b-155a977bebff differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168 b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168 new file mode 100644 index 000000000..290090b73 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/clinvar/2024-11-11.ht/rows/parts/part-1-eeb8cbde-9d95-4ba8-bf3c-e7682fbf3168 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..3a594799a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null 
and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..00afa98b6 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/README.txt new file mode 100644 index 000000000..ba5be899a --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 17:44:14 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..9fda0e926 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..ff78cc258 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/parts/.part-0.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..097026b06 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..0e97c1b5e Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/.index.crc new file mode 100644 index 000000000..88704d58d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..553241221 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/index new file mode 100644 index 000000000..2cbc16fbb Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/metadata.json.gz new file mode 100644 index 000000000..847060b9a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-0-113d0935-f89b-4d20-9f25-225c16c2f941.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/.index.crc new file mode 100644 index 000000000..df5d7a51d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..cc8b04991 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/index new file mode 100644 index 000000000..d566b1259 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/metadata.json.gz new file mode 100644 index 000000000..4714eaea2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/index/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..be3ed2290 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..8b18cd631 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..514346568 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/.part-0-113d0935-f89b-4d20-9f25-225c16c2f941.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/.part-0-113d0935-f89b-4d20-9f25-225c16c2f941.crc new file mode 100644 index 000000000..76c67a13e Binary 
files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/.part-0-113d0935-f89b-4d20-9f25-225c16c2f941.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/.part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/.part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.crc new file mode 100644 index 000000000..3a792717d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/.part-1-a918a0a7-ef41-490f-9d13-73a3e17beead.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/part-0-113d0935-f89b-4d20-9f25-225c16c2f941 b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/part-0-113d0935-f89b-4d20-9f25-225c16c2f941 new file mode 100644 index 000000000..c1df695a4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/part-0-113d0935-f89b-4d20-9f25-225c16c2f941 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead new file mode 100644 index 000000000..85a1e8b94 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/dbnsfp/1.0.ht/rows/parts/part-1-a918a0a7-ef41-490f-9d13-73a3e17beead differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..da33c11b7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/._SUCCESS.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..d48f4ee9d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/README.txt new file mode 100644 index 000000000..ad58e0ac8 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 11:56:06 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/index/part-0-24084335-917b-4b51-8a30-4fe509d64745.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.index.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/index/part-0-24084335-917b-4b51-8a30-4fe509d64745.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/index/part-0-24084335-917b-4b51-8a30-4fe509d64745.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.metadata.json.gz.crc rename to 
v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/index/part-0-24084335-917b-4b51-8a30-4fe509d64745.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/index/part-0-24084335-917b-4b51-8a30-4fe509d64745.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/index rename to v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/index/part-0-24084335-917b-4b51-8a30-4fe509d64745.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/index/part-0-24084335-917b-4b51-8a30-4fe509d64745.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/index/part-0-24084335-917b-4b51-8a30-4fe509d64745.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..fe5b02595 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..34ddebd36 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/.metadata.json.gz.crc 
differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..1776a2d13 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/parts/.part-0-24084335-917b-4b51-8a30-4fe509d64745.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/parts/.part-0-24084335-917b-4b51-8a30-4fe509d64745.crc new file mode 100644 index 000000000..541eeefc3 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/parts/.part-0-24084335-917b-4b51-8a30-4fe509d64745.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/parts/part-0-24084335-917b-4b51-8a30-4fe509d64745 b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/parts/part-0-24084335-917b-4b51-8a30-4fe509d64745 new file mode 100644 index 000000000..0ab20ea11 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/eigen/1.0.ht/rows/parts/part-0-24084335-917b-4b51-8a30-4fe509d64745 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..e5fef16a2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/._SUCCESS.crc differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..5c09dc9af Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/README.txt new file mode 100644 index 000000000..b973cc8e0 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 12:17:23 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files 
/dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/index/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/.index.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/index/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/index/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/index/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/index/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.idx/index similarity index 100% rename from 
v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/index rename to v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/index/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/index/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/index/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..a4b986266 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..97942d244 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..175ed7640 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/parts/.part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/parts/.part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.crc new file mode 100644 index 000000000..44e31965d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/parts/.part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/parts/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486 b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/parts/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486 new file mode 100644 index 000000000..72ff7f655 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/exac/1.0.ht/rows/parts/part-0-018c9528-a303-4d50-8cf8-eb42ad4d7486 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..d27973430 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..3b3bb60e5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/.metadata.json.gz.crc differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/README.txt new file mode 100644 index 000000000..44c8ac219 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/25 12:24:35 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..ed78b2892 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..10338f5f2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-345f1488-be53-4c4b-8207-b052e86084d6.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded.idx/.index.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-345f1488-be53-4c4b-8207-b052e86084d6.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-345f1488-be53-4c4b-8207-b052e86084d6.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-345f1488-be53-4c4b-8207-b052e86084d6.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded.idx/index 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-345f1488-be53-4c4b-8207-b052e86084d6.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded.idx/index rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-345f1488-be53-4c4b-8207-b052e86084d6.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-345f1488-be53-4c4b-8207-b052e86084d6.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-345f1488-be53-4c4b-8207-b052e86084d6.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/.index.crc new file mode 100644 index 000000000..13f00a251 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/.metadata.json.gz.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..9c040e102 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/index new file mode 100644 index 000000000..0a9ded614 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/metadata.json.gz new file mode 100644 index 000000000..56e0d13b4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff.idx/.index.crc rename to 
v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff.idx/index rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/index/part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff.idx/metadata.json.gz rename to 
v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/index/part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..5d1f2440f Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..08f95ba00 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..a8dd96cce Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/parts/.part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/.part-0-345f1488-be53-4c4b-8207-b052e86084d6.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/parts/.part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded.crc rename to 
v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/.part-0-345f1488-be53-4c4b-8207-b052e86084d6.crc diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/.part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/.part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.crc new file mode 100644 index 000000000..c2966988d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/.part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/parts/.part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/.part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/parts/.part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/.part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07.crc diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/parts/part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/part-0-345f1488-be53-4c4b-8207-b052e86084d6 similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/parts/part-0-86ec8a00-137f-41a6-a098-8ef6bea1cded rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/part-0-345f1488-be53-4c4b-8207-b052e86084d6 diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c new file mode 100644 index 000000000..7e0d00872 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/part-0-90a40f33-45f1-4319-b895-a6f9f6f3364c differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/parts/part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07 similarity index 100% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/rows/parts/part-1-76400422-6fd3-4b0f-9c37-42546b3e19ff rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_coding_and_noncoding/1.0.ht/rows/parts/part-1-f6cdce1a-0e07-4a8e-80de-c0b568f5fa07 diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..f4a83594a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/.metadata.json.gz.crc 
new file mode 100644 index 000000000..b60810963 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/README.txt new file mode 100644 index 000000000..7f128301c --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 15:48:08 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/index/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.index.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/index/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/index/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/index/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/index/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.idx/index similarity index 100% rename from 
v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/index rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/index/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/index/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/index/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..c11d3ce6f Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..ed4ea68e7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..f4200fcbb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/parts/.part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/parts/.part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.crc new file mode 100644 index 000000000..1452464ce Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/parts/.part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/parts/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/parts/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d new file mode 100644 index 000000000..c3a7b51ea Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_exomes/1.0.ht/rows/parts/part-0-3ff9afe8-37ef-4f6d-a894-cfc7eb27f97d differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..f51ca0eba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..b60810963 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/README.txt new file mode 100644 index 000000000..92e64ba24 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 15:52:38 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/index/part-0-7791073a-d4da-48f7-903f-59f1ac95d459.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.idx/.index.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/index/part-0-7791073a-d4da-48f7-903f-59f1ac95d459.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/index/part-0-7791073a-d4da-48f7-903f-59f1ac95d459.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/index/part-0-7791073a-d4da-48f7-903f-59f1ac95d459.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/index/part-0-7791073a-d4da-48f7-903f-59f1ac95d459.idx/index similarity index 100% rename from 
v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.idx/index rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/index/part-0-7791073a-d4da-48f7-903f-59f1ac95d459.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/index/part-0-7791073a-d4da-48f7-903f-59f1ac95d459.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-20336911-c437-4deb-9fa4-7c7fe61f0408.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/index/part-0-7791073a-d4da-48f7-903f-59f1ac95d459.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..c11d3ce6f Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..e66080c25 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..1d1bed106 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/parts/.part-0-7791073a-d4da-48f7-903f-59f1ac95d459.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/parts/.part-0-7791073a-d4da-48f7-903f-59f1ac95d459.crc new file mode 100644 index 000000000..b3dd3cfa7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/parts/.part-0-7791073a-d4da-48f7-903f-59f1ac95d459.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/parts/part-0-7791073a-d4da-48f7-903f-59f1ac95d459 b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/parts/part-0-7791073a-d4da-48f7-903f-59f1ac95d459 new file mode 100644 index 000000000..9b26dd8f4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_genomes/1.0.ht/rows/parts/part-0-7791073a-d4da-48f7-903f-59f1ac95d459 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..615160675 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..9deb96132 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/README.txt new file mode 100644 index 000000000..000c2be8a --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 17:27:14 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/parts/.part-0.crc differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/.index.crc new file mode 100644 index 000000000..9b404cc06 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..46cb77698 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/index new file mode 100644 index 000000000..9188ce12d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/index differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/metadata.json.gz new file mode 100644 index 000000000..ebb08f484 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/index/part-0-bccae774-994f-469e-9b30-01becb2109a0.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..d2a7ca8ed Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..7823d5e07 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..b47131dc1 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/parts/.part-0-bccae774-994f-469e-9b30-01becb2109a0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/parts/.part-0-bccae774-994f-469e-9b30-01becb2109a0.crc new file mode 100644 index 000000000..0e6384955 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/parts/.part-0-bccae774-994f-469e-9b30-01becb2109a0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/parts/part-0-bccae774-994f-469e-9b30-01becb2109a0 b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/parts/part-0-bccae774-994f-469e-9b30-01becb2109a0 new file mode 100644 index 000000000..ea9c8eeed Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_mito/1.0.ht/rows/parts/part-0-bccae774-994f-469e-9b30-01becb2109a0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..26c137306 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..08d9b4aea Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/README.txt 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/README.txt new file mode 100644 index 000000000..147b7af6a --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 10:26:06 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/parts/.part-0.crc differ 
diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/index/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.index.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/index/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/index/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/index/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/index/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.idx/index similarity index 100% rename from 
v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/index rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/index/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/index/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_interval_1.ht/index/part-0-1224c3b3-ab5b-49d7-8d6d-6084ccbbc683.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/index/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..27165cc50 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..95e91fc0b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/metadata.json.gz new file mode 100644 index 
000000000..62f2c57fd Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/parts/.part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/parts/.part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.crc new file mode 100644 index 000000000..f48202559 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/parts/.part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/parts/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/parts/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba new file mode 100644 index 000000000..1a1c97ec8 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_non_coding_constraint/1.0.ht/rows/parts/part-0-17cf743a-b6dc-4c51-ae0c-c4ffa69513ba differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..1601350d2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/.metadata.json.gz.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..be4fcf6c1 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/README.txt similarity index 78% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/README.txt rename to v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/README.txt index 1b764aef2..ca607da80 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/README.txt +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.133-4c60fddb171a - Created at 2024/11/02 13:13:26 \ No newline at end of file + Created at 2024/11/23 12:20:50 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..ed78b2892 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..10338f5f2 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/.index.crc new file mode 100644 index 000000000..d5055a353 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..553241221 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/index 
b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/index new file mode 100644 index 000000000..256ce27b1 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/metadata.json.gz new file mode 100644 index 000000000..847060b9a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/index/part-0-46f30121-756f-4290-b7f1-e0f9993c9593.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..368a7d98d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..972f882ed Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..b15436a3a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/parts/.part-0-46f30121-756f-4290-b7f1-e0f9993c9593.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/parts/.part-0-46f30121-756f-4290-b7f1-e0f9993c9593.crc new file mode 100644 index 000000000..6d005723c Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/parts/.part-0-46f30121-756f-4290-b7f1-e0f9993c9593.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/parts/part-0-46f30121-756f-4290-b7f1-e0f9993c9593 b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/parts/part-0-46f30121-756f-4290-b7f1-e0f9993c9593 new file mode 100644 index 000000000..3d67801f4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/gnomad_qc/1.0.ht/rows/parts/part-0-46f30121-756f-4290-b7f1-e0f9993c9593 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..fd620dff0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..fadfe0089 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/.metadata.json.gz.crc differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/README.txt new file mode 100644 index 000000000..0f2fbedac --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 17:14:02 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/parts/part-0 
b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/.index.crc new file mode 100644 index 000000000..9b404cc06 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..46cb77698 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/index new file mode 100644 index 000000000..9188ce12d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/metadata.json.gz 
b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/metadata.json.gz new file mode 100644 index 000000000..ebb08f484 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/index/part-0-eceecf38-7b1a-46ab-98c2-147256aff633.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..cabc0d8d8 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..c18159180 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..de1bce944 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/parts/.part-0-eceecf38-7b1a-46ab-98c2-147256aff633.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/parts/.part-0-eceecf38-7b1a-46ab-98c2-147256aff633.crc new file mode 100644 index 000000000..0e6384955 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/parts/.part-0-eceecf38-7b1a-46ab-98c2-147256aff633.crc differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/parts/part-0-eceecf38-7b1a-46ab-98c2-147256aff633 b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/parts/part-0-eceecf38-7b1a-46ab-98c2-147256aff633 new file mode 100644 index 000000000..ea9c8eeed Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/helix_mito/1.0.ht/rows/parts/part-0-eceecf38-7b1a-46ab-98c2-147256aff633 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..5e2eae6e7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..7d6d258bd Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/README.txt new file mode 100644 index 000000000..959764f4d --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 12:37:09 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..9c29118d9 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..d0a1a6b6f Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..78890c721 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..55e945577 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/index/part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a.idx/.index.crc similarity 
index 100% rename from v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.idx/.index.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/index/part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/index/part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/index/part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/index/part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.idx/index rename to v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/index/part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/index/part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_2.ht/index/part-0-7d0599cd-6874-47f8-b6de-a7db0b41817c.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/index/part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a.idx/metadata.json.gz diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..c50eefd45 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..739b1660d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..43d1cc0ef Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/rows/parts/.part-0-902230a8-2a45-4126-89b0-fdd919610d79.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/rows/parts/.part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_1.ht/rows/parts/.part-0-902230a8-2a45-4126-89b0-fdd919610d79.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/rows/parts/.part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a.crc diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/rows/parts/part-0-902230a8-2a45-4126-89b0-fdd919610d79 b/v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/rows/parts/part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_1.ht/rows/parts/part-0-902230a8-2a45-4126-89b0-fdd919610d79 rename to 
v03_pipeline/var/test/reference_datasets/GRCh38/hgmd/1.0.ht/rows/parts/part-0-2accd7be-40d6-42bd-abc3-f6dc7b382f0a diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..cfac2a5bb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..8c4aa7a94 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/README.txt new file mode 100644 index 000000000..2a9f48152 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 17:22:01 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/.index.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/.index.crc new file mode 100644 index 000000000..ffe028174 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..cc8b04991 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/index new file mode 100644 index 000000000..8de554cd5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/metadata.json.gz new file mode 100644 index 000000000..4714eaea2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/index/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/metadata.json.gz new file 
mode 100644 index 000000000..44c67a61d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..ab9871ed8 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..5f7ba7ec7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/parts/.part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/parts/.part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.crc new file mode 100644 index 000000000..db8f9208a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/parts/.part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/parts/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5 b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/parts/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5 new file mode 100644 index 000000000..d28026cfb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/hmtvar/1.0.ht/rows/parts/part-0-c858683f-c7bf-4a88-baab-d7bdeb020fa5 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/.README.txt.crc new file 
mode 100644 index 000000000..117a62e4d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..8c4aa7a94 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/README.txt new file mode 100644 index 000000000..157ff427c --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 17:37:22 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/globals/parts/part-0 differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/.index.crc new file mode 100644 index 000000000..8c41c40c2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..cc8b04991 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/index new file mode 100644 index 000000000..49a1d6782 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/metadata.json.gz new file mode 100644 index 000000000..4714eaea2 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/index/part-0-b707f718-6196-4c02-9d68-148cf0c9438e.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..44c67a61d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..4efc7b89f Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..c7664a7d4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/parts/.part-0-b707f718-6196-4c02-9d68-148cf0c9438e.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/parts/.part-0-b707f718-6196-4c02-9d68-148cf0c9438e.crc new file mode 100644 index 000000000..00b978607 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/parts/.part-0-b707f718-6196-4c02-9d68-148cf0c9438e.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/parts/part-0-b707f718-6196-4c02-9d68-148cf0c9438e 
b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/parts/part-0-b707f718-6196-4c02-9d68-148cf0c9438e new file mode 100644 index 000000000..bbad79fbe Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/local_constraint_mito/1.0.ht/rows/parts/part-0-b707f718-6196-4c02-9d68-148cf0c9438e differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..2fd591e84 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..8c4aa7a94 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/README.txt new file mode 100644 index 000000000..7db91b4b6 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 17:23:59 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/.index.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/.index.crc new file mode 100644 index 000000000..8c41c40c2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..cc8b04991 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/index new file mode 100644 index 000000000..49a1d6782 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/metadata.json.gz new file mode 100644 index 000000000..4714eaea2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/index/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/metadata.json.gz 
b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..44c67a61d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..7cf5298eb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..99e90e7fd Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/parts/.part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/parts/.part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.crc new file mode 100644 index 000000000..bef05bb2d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/parts/.part-0-e16f2759-68b2-4794-978c-4bfcd2f29974.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/parts/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974 b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/parts/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974 new file mode 100644 index 000000000..3777937af Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitimpact/1.0.ht/rows/parts/part-0-e16f2759-68b2-4794-978c-4bfcd2f29974 differ diff --git 
a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..643f540f0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..8be0418bb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/README.txt new file mode 100644 index 000000000..baa3779b3 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 17:25:22 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/.index.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/.index.crc new file mode 100644 index 000000000..4fce8e00d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..46cb77698 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/index new file mode 100644 index 000000000..d1574170b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/metadata.json.gz new file mode 100644 index 000000000..ebb08f484 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/index/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/metadata.json.gz 
b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..5161834bf Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..2ccc6f456 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..db48fe98a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/parts/.part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/parts/.part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.crc new file mode 100644 index 000000000..d7ddc9158 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/parts/.part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/parts/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/parts/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b new file mode 100644 index 000000000..c01f44610 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/mitomap/1.0.ht/rows/parts/part-0-430d2a33-3c80-49e7-91ec-31484d8fc41b differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/.README.txt.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..26c137306 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..a04a8a964 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/README.txt new file mode 100644 index 000000000..147b7af6a --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 10:26:06 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..361b6d3fd Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..11ede322e Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..2c5dc31e6 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..4353c3208 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/.index.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/.index.crc new file mode 100644 index 000000000..23324f542 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..576d4ffd2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/index new file mode 100644 index 000000000..952d782a3 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/metadata.json.gz new file mode 100644 index 000000000..9b05326c4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/index/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/metadata.json.gz new file 
mode 100644 index 000000000..1d8e6f33b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..a3b21f0b7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..2b5ad46aa Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/parts/.part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/parts/.part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.crc new file mode 100644 index 000000000..2e0ae9c56 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/parts/.part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/parts/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1 b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/parts/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1 new file mode 100644 index 000000000..93fd423c8 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/screen/1.0.ht/rows/parts/part-0-6ac8d5d4-cb28-4030-9208-f0d0e0f595e1 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/.README.txt.crc new file mode 100644 index 
000000000..f24ab2757 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..703610216 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/README.txt new file mode 100644 index 000000000..c2b876255 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 12:31:55 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..054449578 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..98990f90a Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..131000afe Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..8585d729b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/index/part-0-902230a8-2a45-4126-89b0-fdd919610d79.idx/.index.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/index/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_1.ht/index/part-0-902230a8-2a45-4126-89b0-fdd919610d79.idx/.index.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/index/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/index/part-0-902230a8-2a45-4126-89b0-fdd919610d79.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/index/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_1.ht/index/part-0-902230a8-2a45-4126-89b0-fdd919610d79.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/index/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/index/part-0-902230a8-2a45-4126-89b0-fdd919610d79.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/index/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_1.ht/index/part-0-902230a8-2a45-4126-89b0-fdd919610d79.idx/index rename to v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/index/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_hgmd_1.ht/index/part-0-902230a8-2a45-4126-89b0-fdd919610d79.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/index/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd_1.ht/index/part-0-902230a8-2a45-4126-89b0-fdd919610d79.idx/metadata.json.gz rename to 
v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/index/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/metadata.json.gz new file mode 100644 index 000000000..3b20644da Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..815f5ba2f Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..fdc97550e Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/parts/.part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/parts/.part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.crc new file mode 100644 index 000000000..0dcc92f01 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/parts/.part-0-6272a9a2-b08b-4926-9552-feb84ffa2308.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/parts/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308 b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/parts/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308 new file mode 100644 index 000000000..4304a92fc Binary 
files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/splice_ai/1.0.ht/rows/parts/part-0-6272a9a2-b08b-4926-9552-feb84ffa2308 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/.README.txt.crc new file mode 100644 index 000000000..19491808d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..db3eac1a7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/README.txt b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/README.txt new file mode 100644 index 000000000..f37ec7de7 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.132-678e1f52b999 + Created at 2024/11/21 12:31:19 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..fb5ed3f93 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/metadata.json.gz new file mode 100644 index 000000000..c945547ba Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..1fdaa2e8d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/parts/part-0 new file mode 100644 index 000000000..b88778abc Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/.index.crc 
b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/.index.crc new file mode 100644 index 000000000..7cb9c5aaf Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..9af5fa925 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/index b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/index new file mode 100644 index 000000000..a979d82bf Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/metadata.json.gz new file mode 100644 index 000000000..051d3e03d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/index/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/metadata.json.gz new file 
mode 100644 index 000000000..7889b2551 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..d838c358d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/metadata.json.gz new file mode 100644 index 000000000..de84d3705 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/parts/.part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.crc b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/parts/.part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.crc new file mode 100644 index 000000000..a8496fdf0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/parts/.part-0-795ab066-10c9-4aac-ad59-f29794a4b01f.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/parts/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/parts/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f new file mode 100644 index 000000000..0a289db86 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/GRCh38/topmed/1.0.ht/rows/parts/part-0-795ab066-10c9-4aac-ad59-f29794a4b01f differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/clinvar.vcf b/v03_pipeline/var/test/reference_datasets/raw/clinvar.vcf new file mode 100644 index 000000000..180bed6c1 --- /dev/null +++ 
b/v03_pipeline/var/test/reference_datasets/raw/clinvar.vcf @@ -0,0 +1,55 @@ +##fileformat=VCFv4.1 +##fileDate=2024-11-11 +##source=ClinVar +##reference=GRCh38 +##ID= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 69134 1 A G . . ALLELEID=2193183;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.69134A>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Pathogenic;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1 +1 69314 2 T G . . ALLELEID=3374047;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.69314T>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1 +1 69423 3 G A . . ALLELEID=3374048;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.69423G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1 +1 69581 4 C G . . ALLELEID=2238986;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.69581C>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1 +1 69682 5 G A . . 
ALLELEID=2386655;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.69682G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1 +1 69731 6 T C . . ALLELEID=3374049;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.69731T>C;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1 +1 69769 7 T C . . ALLELEID=2278803;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.69769T>C;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1 +1 69995 8 G C . . ALLELEID=2333177;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.69995G>C;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1 +1 925946 9 C G . . ALLELEID=1983057;CLNDISDB=MedGen:C3661900;CLNDN=not_provided;CLNHGVS=NC_000001.11:g.925946C>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1 +1 925952 10 G A . . ALLELEID=1003021;CLNDISDB=MedGen:C3661900;CLNDN=not_provided;CLNHGVS=NC_000001.11:g.925952G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001583|missense_variant;ORIGIN=1;RS=1640863258 +1 925956 11 C T . . 
ALLELEID=1632777;CLNDISDB=MedGen:C3661900;CLNDN=not_provided;CLNHGVS=NC_000001.11:g.925956C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001819|synonymous_variant;ORIGIN=1;RS=1342334044 +1 925956 11 C T . . ALLELEID=1632777;CLNDISDB=MedGen:C3661900;CLNDN=not_provided;CLNHGVS=NC_000001.11:g.925956C>T;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=SAMD11:148398;MC=SO:0001819|synonymous_variant;ORIGIN=1;RS=1342334044 +MT 13112 693521 T C . . ALLELEID=680411;CLNDISDB=MONDO:MONDO:0009723,MedGen:C0023264,OMIM:256000,Orphanet:506;CLNDN=Leigh_syndrome;CLNHGVS=NC_012920.1:m.13112T>C;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Pathogenic;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=MT-ND5:4540;ORIGIN=1;RS=1603224043 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/raw/exac.vcf b/v03_pipeline/var/test/reference_datasets/raw/exac.vcf new file mode 100644 index 000000000..3392033b9 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/raw/exac.vcf @@ -0,0 +1,202 @@ +##fileformat=VCFv4.2 +##ALT= +##FILTER== 20 and DP >= 10)"> +##FILTER= +##FILTER= +##FILTER= -2.632 && InbreedingCoeff >-0.8"> +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##GATKCommandLine= +##GATKCommandLine= +##GATKCommandLine= +##GATKCommandLine= -2.632 && InbreedingCoeff >-0.8, InbreedingCoeff <= -0.8] filterName=[NewCut_Filter, InbreedingCoeff_Filter] genotypeFilterExpression=[] genotypeFilterName=[] clusterSize=3 clusterWindowSize=0 maskExtension=0 maskName=Mask filterNotInMask=false missingValuesInExpressionsShouldEvaluateAsFailing=false invalidatePreviousFilters=false 
filter_reads_with_N_cigar=false filter_mismatching_base_and_quals=false filter_bases_not_stored=false"> +##GVCFBlock=minGQ=0(inclusive),maxGQ=5(exclusive) +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= 0.05"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= 0.05"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= 0.05"> +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##LoF=Loss-of-function annotation (HC = High Confidence; LC = Low Confidence) +##LoF_filter=Reason for LoF not being HC +##LoF_flags=Possible warning flags for LoF +##LoF_info=Info used for LoF annotation +##VEP=v85 cache=/humgen/atgu1/fs03/konradk/vep//homo_sapiens/85_GRCh37 db=. 
sift=sift5.2.2 polyphen=2.2.2 COSMIC=71 ESP=20141103 gencode=GENCODE 19 HGMD-PUBLIC=20152 genebuild=2011-04 regbuild=13 ClinVar=201507 dbSNP=144 assembly=GRCh37.p13 +##ancestral=ancestral allele +##context=1 base context around the variant +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1046973 . 
G A,T 526.79 PASS AC=2,2;AC_AFR=0,0;AC_AMR=0,0;AC_Adj=0,1;AC_CONSANGUINEOUS=.,0;AC_EAS=0,0;AC_FEMALE=.,1;AC_FIN=0,0;AC_Het=0,1,0;AC_Hom=0,0;AC_MALE=.,0;AC_NFE=0,1;AC_OTH=0,0;AC_POPMAX=NA,1;AC_SAS=0,0;AF=1.702e-05,1.702e-05;AGE_HISTOGRAM_HET=.,0|0|0|0|0|0|0|0|0|0|0|0;AGE_HISTOGRAM_HOM=.,0|0|0|0|0|0|0|0|0|0|0|0;AN=117528;AN_AFR=3096;AN_AMR=860;AN_Adj=27700;AN_CONSANGUINEOUS=1488;AN_EAS=1442;AN_FEMALE=12128;AN_FIN=302;AN_MALE=15572;AN_NFE=13416;AN_OTH=228;AN_POPMAX=NA,13416;AN_SAS=8356;BaseQRankSum=0.439;CSQ=A|intron_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000379370|protein_coding||19/35|ENST00000379370.2:c.3388+16G>A|||||||rs751608650|1||1||SNV||HGNC|329|YES|||CCDS30551.1|ENSP00000368678|O00468|Q5XG79|UPI00001D7C8B|1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,T|intron_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000379370|protein_coding||19/35|ENST00000379370.2:c.3388+16G>T|||||||rs751608650|2||1||SNV||HGNC|329|YES|||CCDS30551.1|ENSP00000368678|O00468|Q5XG79|UPI00001D7C8B|1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,A|upstream_gene_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000419249|protein_coding||||||||||rs751608650|1|3583|1|cds_start_NF&cds_end_NF|SNV||HGNC|329|||||ENSP00000400771|||UPI000059CF46|1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,T|upstream_gene_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000419249|protein_coding||||||||||rs751608650|2|3583|1|cds_start_NF&cds_end_NF|SNV||HGNC|329|||||ENSP00000400771|||UPI000059CF46|1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,A|upstream_gene_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000466223|retained_intron||||||||||rs751608650|1|228|1||
SNV||HGNC|329|||||||||1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,T|upstream_gene_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000466223|retained_intron||||||||||rs751608650|2|228|1||SNV||HGNC|329|||||||||1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,A|upstream_gene_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000478677|retained_intron||||||||||rs751608650|1|502|1||SNV||HGNC|329|||||||||1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,T|upstream_gene_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000478677|retained_intron||||||||||rs751608650|2|502|1||SNV||HGNC|329|||||||||1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,A|downstream_gene_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000479707|retained_intron||||||||||rs751608650|1|624|1||SNV||HGNC|329|||||||||1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,T|downstream_gene_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000479707|retained_intron||||||||||rs751608650|2|624|1||SNV||HGNC|329|||||||||1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,A|upstream_gene_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000492947|retained_intron||||||||||rs751608650|1|1556|1||SNV||HGNC|329|||||||||1||||||||||||||A:0&T:0|A:1.702e-05&T:1.702e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|,T|upstream_gene_variant|MODIFIER|AGRN|ENSG00000188157|Transcript|ENST00000492947|retained_intron||||||||||rs751608650|2|1556|1||SNV||HGNC|329|||||||||1||||||||||||||A:0&T:0|A:1.702e-05&T:1.7
02e-05|A:0&T:0|A:0&T:3.61e-05|A:0&T:0|A:0&T:0|A:0&T:7.454e-05|A:0&T:0|||||||||||||GGG|;ClippingRankSum=-3.580e-01;DOUBLETON_DIST=.,.;DP=500454;DP_HIST=21516|23185|5132|497|5360|2303|632|78|31|12|8|5|1|2|1|0|1|0|0|0,1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|1|0|0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0;ESP_AC=0,0;ESP_AF_GLOBAL=0,0;ESP_AF_POPMAX=0,0;FS=9.313;GQ_HIST=1031|21164|2478|2925|20769|980|656|232|77|69|65|68|6174|1026|336|365|177|61|47|64,0|1|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0|0,0|0|0|0|0|0|0|0|1|0|0|0|0|0|0|0|0|0|0|1;GQ_MEAN=20.85;GQ_STDDEV=18.47;Het_AFR=0,0,0;Het_AMR=0,0,0;Het_EAS=0,0,0;Het_FIN=0,0,0;Het_NFE=0,1,0;Het_OTH=0,0,0;Het_SAS=0,0,0;Hom_AFR=0,0;Hom_AMR=0,0;Hom_CONSANGUINEOUS=.,0;Hom_EAS=0,0;Hom_FIN=0,0;Hom_NFE=0,0;Hom_OTH=0,0;Hom_SAS=0,0;InbreedingCoeff=0.0337;K1_RUN=G:1,G:1;K2_RUN=GG:0,GG:0;K3_RUN=GGC:0,GGC:0;KG_AC=0,0;KG_AF_GLOBAL=0,0;KG_AF_POPMAX=0,0;MQ=60.00;MQ0=0;MQRankSum=0.556;NCC=4154;POPMAX=NA,NFE;QD=11.45;ReadPosRankSum=0.968;VQSLOD=-7.116e-01;culprit=MQ \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/.README.txt.crc new file mode 100644 index 000000000..202d5a4b2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..4090709b1 Binary files /dev/null and 
b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/README.txt b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/README.txt similarity index 78% rename from v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/README.txt rename to v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/README.txt index 95969a3d8..1359f75de 100644 --- a/v03_pipeline/var/test/reference_data/test_gnomad_coding_noncoding_crdq_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.130-bea04d9c79b5 - Created at 2024/09/23 00:15:49 \ No newline at end of file + Created at 2024/11/13 15:32:27 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..036f69746 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/metadata.json.gz new file mode 100644 index 000000000..40495e5c5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/metadata.json.gz differ diff --git 
a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..e1fd3bdfa Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/parts/part-0 new file mode 100644 index 000000000..ce85de233 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/.index.crc new file mode 100644 index 000000000..210493e1b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..ca274b338 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/index b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/index new 
file mode 100644 index 000000000..ffb95440b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/metadata.json.gz new file mode 100644 index 000000000..14e2c0d67 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/index/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/metadata.json.gz new file mode 100644 index 000000000..b2a26b212 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..3ba9eb28c Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/metadata.json.gz new file mode 100644 index 000000000..07dc2ffd3 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/parts/.part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.crc 
b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/parts/.part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.crc new file mode 100644 index 000000000..3917586c6 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/parts/.part-0-ac88ea82-778e-4722-b4a5-67b02b78322d.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/parts/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/parts/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d new file mode 100644 index 000000000..b9257c703 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_37.ht/rows/parts/part-0-ac88ea82-778e-4722-b4a5-67b02b78322d differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/.README.txt.crc new file mode 100644 index 000000000..8c34199dd Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..04e94db9f Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/README.txt 
similarity index 78% rename from v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt rename to v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/README.txt index 3d9a5ac98..d66e21027 100644 --- a/v03_pipeline/var/test/reference_data/test_interval_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. Written with version 0.2.130-bea04d9c79b5 - Created at 2024/05/20 13:22:32 \ No newline at end of file + Created at 2024/11/13 15:27:31 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..b8ae778a4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/metadata.json.gz new file mode 100644 index 000000000..fd09f8a9e Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..5fd2a8f0f Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/parts/.part-0.crc differ diff --git 
a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/parts/part-0 new file mode 100644 index 000000000..daba6ef6d Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/.index.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/.index.crc new file mode 100644 index 000000000..9933a2de5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..1ebb40475 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/index b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/index new file mode 100644 index 000000000..b77f409e3 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/metadata.json.gz 
b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/metadata.json.gz new file mode 100644 index 000000000..172607a22 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/index/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/metadata.json.gz new file mode 100644 index 000000000..f8b50dec6 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..f0334b79b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/metadata.json.gz new file mode 100644 index 000000000..39e342b07 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/parts/.part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/parts/.part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.crc new file mode 100644 index 000000000..d7c9e4de7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/parts/.part-0-90a97bd8-3648-4074-89bd-3a64d58266e2.crc differ diff --git 
a/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/parts/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2 b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/parts/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2 new file mode 100644 index 000000000..1164defaa Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_exomes_38.ht/rows/parts/part-0-90a97bd8-3648-4074-89bd-3a64d58266e2 differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/.README.txt.crc new file mode 100644 index 000000000..92a8b63a4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..79fda1962 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/README.txt b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/README.txt new file mode 100644 index 000000000..59c689541 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.130-bea04d9c79b5 + Created at 2024/11/13 15:44:06 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..036f69746 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/metadata.json.gz new file mode 100644 index 000000000..40495e5c5 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..5d34707bb Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/parts/part-0 new file mode 100644 index 000000000..8318eee0b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/.index.crc 
b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/.index.crc new file mode 100644 index 000000000..00c4847e3 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..7b9ae4ad7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/index b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/index new file mode 100644 index 000000000..a64a36d41 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/metadata.json.gz new file mode 100644 index 000000000..5f7a34128 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/index/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/metadata.json.gz 
b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/metadata.json.gz new file mode 100644 index 000000000..fad17957b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..66088fe5b Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/metadata.json.gz new file mode 100644 index 000000000..edc848305 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/parts/.part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/parts/.part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.crc new file mode 100644 index 000000000..d2e431ff7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/parts/.part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/parts/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/parts/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a new file mode 100644 index 000000000..38cae4201 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_37.ht/rows/parts/part-0-deb2219c-0ebe-4343-ba3c-143f95c4b24a differ diff --git 
a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/.README.txt.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/.README.txt.crc new file mode 100644 index 000000000..8d27c2f97 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..7d7e0a3a0 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/README.txt b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/README.txt new file mode 100644 index 000000000..36ee163df --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/README.txt @@ -0,0 +1,3 @@ +This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
+ Written with version 0.2.130-bea04d9c79b5 + Created at 2024/11/13 15:48:23 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/_SUCCESS b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..ac1c84b66 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/metadata.json.gz new file mode 100644 index 000000000..02b977753 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..66bd45ca4 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/parts/part-0 new file mode 100644 index 000000000..e79fae86e Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/.index.crc 
b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/.index.crc new file mode 100644 index 000000000..12a140491 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..1ebb40475 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/index b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/index new file mode 100644 index 000000000..1e7534f23 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/index differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/metadata.json.gz new file mode 100644 index 000000000..172607a22 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/index/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/metadata.json.gz 
b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/metadata.json.gz new file mode 100644 index 000000000..8e094fef2 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..d7f4688c7 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/metadata.json.gz new file mode 100644 index 000000000..7cd0fb4d3 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/parts/.part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.crc b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/parts/.part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.crc new file mode 100644 index 000000000..e32ff1ad1 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/parts/.part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729.crc differ diff --git a/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/parts/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729 b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/parts/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729 new file mode 100644 index 000000000..5b6ef8b91 Binary files /dev/null and b/v03_pipeline/var/test/reference_datasets/raw/gnomad_genomes_38.ht/rows/parts/part-0-a3c7b21c-f8dd-4d21-948b-3746f5229729 differ diff --git 
a/v03_pipeline/var/test/reference_datasets/raw/submission_summary.txt b/v03_pipeline/var/test/reference_datasets/raw/submission_summary.txt new file mode 100644 index 000000000..425f99c23 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/raw/submission_summary.txt @@ -0,0 +1,100 @@ +##Overview of interpretation, phenotypes, observations, and methods reported in each current submission +##Explanation of the columns in this report +#VariationID: the identifier assigned by ClinVar and used to build the URL, namely https://ncbi.nlm.nih.gov/clinvar/VariationID +#ClinicalSignificance: the germline classification on this submitted record +#DateLastEvaluated: the last date the classification on this record was evaluated by the submitter +#Description: an optional free text description comment describing the rationale for the classification +#SubmittedPhenotypeInfo: the name(s) or identifier(s) submitted as the condition for the classification +#ReportedPhenotypeInfo: the MedGen identifier/name combinations that the submitted condition for the classification maps to. 'na' means there is no public identifer in MedGen for the condition. +#ReviewStatus: the level of review for this submitted record; see http//www.ncbi.nlm.nih.gov/clinvar/docs/variation_report/#review_status +#CollectionMethod: the method by which the submitter collected the data for the classification; see https://www.ncbi.nlm.nih.gov/clinvar/docs/spreadsheet/#collection +#OriginCounts: the allele origin reported by the submitter and the number of observations for each origin. ‘na’ means that the number of observations was not provided by the submitter. 
+#Submitter: the submitter of this record +#SCV: the accession and current version assigned by ClinVar to this submitted record +#SubmittedGeneSymbol: the gene symbol reported in this submitted record +#ExplanationOfInterpretation: more details if the germline classification (ClinicalSignificance) is 'other' or 'drug response' +#SomaticClinicalImpact: the somatic classification of clinical impact on this submitted record +#Oncogenicity: the somatic classification of oncogenicity on this submitted record +#VariationID ClinicalSignificance DateLastEvaluated Description SubmittedPhenotypeInfo ReportedPhenotypeInfo ReviewStatus CollectionMethod OriginCounts Submitter SCV SubmittedGeneSymbol ExplanationOfInterpretation SomaticClinicalImpact Oncogenicity +2 Pathogenic - - OMIM:613647 C3150901:Hereditary spastic paraplegia 48 criteria provided, single submitter clinical testing unknown:2 Paris Brain Institute, Inserm - ICM SCV001451119.1 - - - - +2 Pathogenic Jun 29, 2010 - SPASTIC PARAPLEGIA 48, AUTOSOMAL RECESSIVE C3150901:Hereditary spastic paraplegia 48 no assertion criteria provided literature only germline:na OMIM SCV000020155.3 AP5Z1 - - - +3 Pathogenic Jun 29, 2010 - SPASTIC PARAPLEGIA 48 C3150901:Hereditary spastic paraplegia 48 no assertion criteria provided literature only germline:na OMIM SCV000020156.5 AP5Z1 - - - +4 Uncertain significance Jun 29, 2015 - RECLASSIFIED - VARIANT OF UNKNOWN SIGNIFICANCE C4551772:Galloway-Mowat syndrome 1 no assertion criteria provided literature only germline:na OMIM SCV000020157.2 ZNF592 - - - +5 Pathogenic Dec 30, 2019 Variant summary: FOXRED1 c.694C>T (p.Gln232X) results in a premature termination codon, predicted to cause a truncation of the encoded protein or absence of the protein due to nonsense mediated decay, which are commonly known mechanisms for disease. 
At least one publication reports experimental evidence that this variant affects mRNA splicing as evidenced by analysis of patient cDNA showing occasional skipping of exon 6, resulting in a transcript predicted to lack 40 internal residues (Calvo_2010). The variant allele was found at a frequency of 1.2e-05 in 251184 control chromosomes. c.694C>T has been reported in the literature in at-least one individual affected with Leigh syndrome (example, Calvo_2010). At least one publication reports experimental evidence evaluating an impact on protein function. The most pronounced variant effect results in defects in human mitochondrial complex I biogenesis (Formosa_2015). One clinical diagnostic laboratory has submitted clinical-significance assessments for this variant to ClinVar after 2014 without evidence for independent evaluation and classified the variant as pathogenic. Based on the evidence outlined above, the variant was classified as pathogenic. MedGen:C0023264 C0023264:Leigh syndrome criteria provided, single submitter clinical testing germline:na Women's Health and Genetics/Laboratory Corporation of America, LabCorp SCV001363290.1 FOXRED1 - - - +5 Pathogenic Dec 07, 2017 The Q232X variant in the FOXRED1 gene has been reported previously in Leigh syndrome, in an affected individual who was compound heterozygous for the Q232X variant and another FOXRED1 variant (Calvo et al., 2010). This variant is predicted to cause loss of normal protein function either through protein truncation or nonsense-mediated mRNA decay. The Q232X variant is not observed at a significant frequency in large population cohorts (Lek et al., 2016). We interpret Q232X as a pathogenic variant. Not Provided C3661900:not provided criteria provided, single submitter clinical testing germline:na GeneDx SCV000680696.2 FOXRED1 - - - +5 Pathogenic Oct 31, 2022 The FOXRED1 c.694C>T variant is predicted to result in premature protein termination (p.Gln232*). 
This variant was reported in individuals with mitochondrial complex I deficiency (Calvo et al. 2010. PubMed ID: 20818383, supplementary data; Formosa et al. 2015. PubMed ID: 25678554; Apatean et al. 2019. PubMed ID: 30723688). This variant is reported in 0.0040% of alleles in individuals of African descent in gnomAD (http://gnomad.broadinstitute.org/variant/11-126145284-C-T). Nonsense variants in FOXRED1 are expected to be pathogenic. This variant is interpreted as pathogenic. FOXRED1-related condition na:FOXRED1-related disorder criteria provided, single submitter clinical testing germline:na PreventionGenetics, part of Exact Sciences SCV004119439.1 FOXRED1 - - - +5 Pathogenic Oct 01, 2010 - MITOCHONDRIAL COMPLEX I DEFICIENCY, NUCLEAR TYPE 19 C4748791:Mitochondrial complex 1 deficiency, nuclear type 19 no assertion criteria provided literature only germline:na OMIM SCV000020158.5 FOXRED1 - - - +5 Pathogenic Dec 01, 2023 This sequence change creates a premature translational stop signal (p.Gln232*) in the FOXRED1 gene. It is expected to result in an absent or disrupted protein product. Loss-of-function variants in FOXRED1 are known to be pathogenic (PMID: 20818383, 20858599). This variant is present in population databases (rs267606829, gnomAD 0.003%). This premature translational stop signal has been observed in individual(s) with Leigh syndrome (PMID: 20818383). ClinVar contains an entry for this variant (Variation ID: 5). For these reasons, this variant has been classified as Pathogenic. 
MedGen:CN517202 C3661900:not provided criteria provided, single submitter clinical testing germline:na Labcorp Genetics (formerly Invitae), Labcorp SCV002982300.2 FOXRED1 - - - +5 Pathogenic Mar 29, 2022 - OMIM:618241 C4748791:Mitochondrial complex 1 deficiency, nuclear type 19 criteria provided, single submitter clinical testing unknown:na Fulgent Genetics, Fulgent Genetics SCV002793147.1 - - - - +6 Pathogenic Oct 01, 2010 - MITOCHONDRIAL COMPLEX I DEFICIENCY, NUCLEAR TYPE 19 C4748791:Mitochondrial complex 1 deficiency, nuclear type 19 no assertion criteria provided literature only germline:na OMIM SCV000020159.5 FOXRED1 - - - +7 Pathogenic Sep 01, 2017 This variant has been previously reported as disease-causing and was found once in our laboratory in trans with a missense variant in an 18-year-old male with mitochondrial disease OMIM:252010 C1838979:Mitochondrial complex I deficiency criteria provided, single submitter clinical testing germline:na Baylor Genetics SCV000245520.3 NUBPL - - - +7 Pathogenic Apr 23, 2013 - MITOCHONDRIAL COMPLEX I DEFICIENCY, NUCLEAR TYPE 21 C4748792:Mitochondrial complex 1 deficiency, nuclear type 21 no assertion criteria provided literature only germline:na OMIM SCV000020160.6 NUBPL - - - +9 Pathogenic Dec 09, 2019 NM_000410.3(HFE):c.845G>A(C282Y) is classified as pathogenic in the context of HFE-associated hereditary hemochromatosis. Please note that clinical symptoms are uncommon in C282Y homozygotes. Sources cited for classification include the following: PMID 9162021, 9356458, 8931958, 9341868, 9462220 and 11812557. Classification of NM_000410.3(HFE):c.845G>A(C282Y) is based on the following criteria: This is a well-established pathogenic variant in the literature that has been observed more frequently in patients with clinical diagnoses than in healthy populations. Please note: this variant was assessed in the context of healthy population screening. 
OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing unknown:na Myriad Genetics, Inc. SCV001194044.2 HFE - - - +9 Pathogenic Dec 01, 2015 - Hereditary cancer-predisposing syndrome C0027672:Hereditary cancer-predisposing syndrome criteria provided, single submitter clinical testing germline:na Vantari Genetics SCV000267038.1 HFE - - - +9 Pathogenic Sep 03, 2024 The HFE c.845G>A variant is predicted to result in the amino acid substitution p.Cys282Tyr. In patients with transferrin-iron saturation higher than 45%, presence of the c.845G>A (p.Cys282Tyr) variant is useful in confirmation of hereditary hemochromatosis diagnosis as individuals homozygous for the variant represent 80% of cases (Bacon et al. 2011. PubMed ID: 21452290; Alexander and Kowdley. 2009. PubMed ID: 19444013; Kowdley et al. 2012. PubMed ID: 22395570). The c.845G>A (p.Cys282Tyr) variant is incompletely penetrant with ~35% of individuals homozygous for the variant having normal ferritin levels (Bacon et al. 2011. PubMed ID: 21452290). This variant is interpreted as pathogenic. HFE-related condition na:HFE-related disorder no assertion criteria provided clinical testing germline:na PreventionGenetics, part of Exact Sciences SCV004120883.3 HFE - - - +9 Pathogenic Jul 21, 2020 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Genomic Medicine Lab, University of California San Francisco SCV002576300.1 HFE - - - +9 Pathogenic May 15, 2023 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:2 New York Genome Center SCV003925227.2 - - - - +9 not provided - Variant identified in multiple participants and classified as Pathogenic. GenomeConnect assertions are reported exactly as they appear on the patient-provided report from the testing laboratory. GenomeConnect staff make no attempt to reinterpret the clinical significance of the variant. 
MedGen:C0018995 C0018995:Bronze diabetes no classification provided phenotyping only unknown:24 GenomeConnect, ClinGen SCV000607202.5 HFE - - - +9 Pathogenic Sep 12, 2023 The HFE c.845G>A (p.Cys282Tyr) variant has been reported in the homozygous or compound heterozygous state in many individuals affected with hereditary hemochromatosis and is considered the most common cause of hereditary hemochromatosis (Barton JC and Edwards CQ, PMID: 20301613). Studies show penetrance rates of severe iron overload to be as high as 35% and severe liver disease in 9–24% among male p.Cys282Tyr homozygotes (Grosse SD et al., PMID: 28771247). This variant has been reported in the ClinVar database as a germline pathogenic variant by many submitters. Computational predictors indicate that the variant is damaging, evidence that correlates with impact to HFE function. In support of these predictions, a homozygous mouse model showed postnatal iron loading and in vitro functional studies have shown that the variant causes reduced function (Ali-Rahmani F et al., PMID: 21243428; Boucherma R et al., PMID: 22531912; Levy JE et al., PMID: 10381492). Based on available information and the ACMG/AMP guidelines for variant interpretation (Richards S et al., PMID: 25741868), this variant is classified as pathogenic with reduced penetrance. OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Clinical Genomics Laboratory, Washington University in St. Louis SCV004177020.1 - - - - +9 Pathogenic Jun 07, 2022 PS3, PM3_Very Strong, PP3 HFE-related disorder na:HFE-related disorder criteria provided, single submitter clinical testing germline:na Greenwood Genetic Center Diagnostic Laboratories, Greenwood Genetic Center SCV002568182.1 HFE - - - +9 Pathogenic Jul 21, 2023 The variant NM_000410.4:c.845G>A (chr6:26092913) in HFE was detected in 7331 heterozygotes and 264 homozygotes out of 58K WGS Icelanders (MAF= 6,775%). 
Following imputation in a set of 166K Icelanders (710 imputed homozygotes) we observed an association with hemochromatosis under a recessive model using 2403 cases and 240747 controls (OR= 50.27, P= 2.69e-212). This variant has been reported multiple times in ClinVar as pathogenic. Based on ACMG criteria (PS3, PS4, PP1, PP4, PP5) this variant classifies as pathogenic. OMIM:235200 C3469186:Hemochromatosis type 1 no assertion criteria provided research germline:710 deCODE genetics, Amgen SCV004022244.1 HFE - - - +9 Pathogenic May 11, 2018 - Human Phenotype Ontology:HP:0000707;Human Phenotype Ontology:HP:0000708;Human Phenotype Ontology:HP:0000759;Human Phenotype Ontology:HP:0002027;Human Phenotype Ontology:HP:0009830;Human Phenotype Ontology:HP:0010461;Human Phenotype Ontology:HP:0012531 C0000737:Abdominal pain;C0004941:Atypical behavior;C0030193:Pain;C0031117:Peripheral neuropathy;C0497552:Abnormality of the nervous system;C4023819:Abnormality of the male genitalia;C4025831:Abnormal peripheral nervous system morphology criteria provided, single submitter clinical testing germline:1 Knight Diagnostic Laboratories, Oregon Health and Sciences University SCV001448752.1 HFE - - - +9 Pathogenic Jan 05, 2022 The c.845G>A;p.(Cys282Tyr) missense variant has been observed in affected individual(s) and ClinVar contains an entry for this variant (ClinVar ID: 9; OMIM: 613609.0001; PMID: 20301613; 27659401; 26365338; 19084217; 11040194; 23953397; 26365338) - PS4. Well-established in vitro or in vivo functional studies support a damaging effect on the gene or gene product (PMID: 11040194; 23953397; 9162021; 9356458) - PS3_moderate. The variant is located in a mutational hot spot and/or critical and well-established functional domain (Immunoglobulin C1-set domain) - PM1. 
The p.(Cys282Tyr) was detected in trans with a pathogenic variant (PMID: 15507752; 17384005; 26244503; 25850353; 25277871; 24401005; 23953397; 32153640; 11478530; 26365338) - PM3_very strong The variant co-segregated with disease in multiple affected family members (PMID: 32153640; 11478530) - PP1. Multiple lines of computational evidence support a deleterious effect on the gene or gene product - PP3. In summary, the currently available evidence indicates that the variant is pathogenic. MedGen:C0392514 C0392514:Hereditary hemochromatosis criteria provided, single submitter clinical testing germline:2 DASA SCV002061285.1 HFE - - - +9 Pathogenic Sep 25, 2024 Criteria applied: PS3,PM3_STR,PP3,PP4 OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing unknown:na Institute of Human Genetics, University of Leipzig Medical Center SCV002044430.3 HFE - - - +9 Pathogenic Sep 15, 2021 - not provided C3661900:not provided criteria provided, single submitter clinical testing germline:na Institute of Medical Genetics and Applied Genomics, University Hospital Tübingen SCV001905583.1 - - - - +9 not provided - Variant reported in multiple Invitae PIN participants by multiple clinical testing laboratories. Variant interpreted as Pathogenic by all laboratories and reported most recently on 11/20/2019 by Illumina and 6/19/2020 by Invitae. GenomeConnect-Invitae Patient Insights Network assertions are reported exactly as they appear on the patient-provided report from the testing laboratory. Registry team members make no attempt to reinterpret the clinical significance of the variant. Phenotypic details are available under supporting information. 
MedGen:C0392514 C0392514:Hereditary hemochromatosis no classification provided phenotyping only unknown:2 GenomeConnect - Invitae Patient Insights Network SCV001749341.1 HFE - - - +9 Pathogenic Jun 12, 2023 - Not provided C3661900:not provided criteria provided, single submitter clinical testing germline:32 Mayo Clinic Laboratories, Mayo Clinic SCV002525758.2 HFE - - - +9 Pathogenic Jun 24, 2019 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Centogene AG - the Rare Disease Company SCV002028313.1 HFE - - - +9 Pathogenic Jun 01, 2021 • The p.Cys282Tyr variant in the HFE gene has been identified in the homozygous state in approximately 60- 90% of individuals of European ancestry with HFE hemochromatosis, and in the compound heterozygous state with p.His63Asp in approximately 3-8% of individuals of European ancestry with HFE hemochromatosis (Barton and Edwards, 2018). • The p.Cys282Tyr variant is associated with a high penetrance for biochemical evidence of iron overload, but with a low penetrance for clinical manifestations of iron overload with studies reporting evidence of clinical disease present in as low as 2% and as high as 33% of p.Cys282Tyr homozygotes (Beutler et al., 2002; Whitlock et al., 2006). • Individuals heterozygous for the p.Cys282Tyr variant may demonstrate evidence of biochemical disease, including mildly elevated serum transferrin-iron saturation and serum ferritin concentration, but do not develop clinical manifestations of disease (Allen et al., 2008; Pedersen and Milman, 2009). • This variant has been identified in 7,435/128,950 European (non-Finnish) chromosomes (9,544/282,608 chromosomes overall) by the Genome Aggregation Database (http://gnomad.broadinstitute.org/). 
Although the p.Cys282Tyr variant is seen at a frequency greater than 5% in the general population, this variant is recognized as a common low-penetrant variant that is an exception to ACMG/AMP classification guidelines (Ghosh et al., 2018). • These data were assessed using the ACMG/AMP variant interpretation guidelines. In summary, there is sufficient evidence to classify the p.Cys282Tyr variant as pathogenic for autosomal recessive HFE hemochromatosis based on the information above. [ACMG evidence codes used: PS4; PP3] OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:1 Clinical Genomics Laboratory, Stanford Medicine SCV004803175.1 HFE - - - +9 Pathogenic Sep 23, 2021 - OMIM:235200 C3469186:Hemochromatosis type 1 no assertion criteria provided clinical testing germline:na Clinical Genetics Laboratory, University Hospital Schleswig-Holstein SCV002011713.1 HFE - - - +9 Pathogenic Mar 30, 2016 The c.845G>A (p.Cys282Tyr) missense variant is widely recognized as one of the two most common disease-causing variants in the HFE gene. Cys282Tyr homozygotes account for 80-85% of typical patients with Hereditary Hemochromatosis (HH). However, the majority of individuals who are homozygous for this variant do not develop the disease (GeneReviews, Kowdley et al., 2012; Ramrakhiani and Bacon, 1998; and Morrison et al., 2003). In summary, this variant c.845G>A (p.Cys282Tyr) meets our criteria for a Pathogenic classification. We have confirmed this finding in our laboratory using Sanger sequencing. 
OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Knight Diagnostic Laboratories, Oregon Health and Sciences University SCV000223934.2 HFE - - - +9 Pathogenic Sep 14, 2015 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Genetic Services Laboratory, University of Chicago SCV000151394.2 HFE - - - +9 Pathogenic Nov 26, 2015 - MedGen:C0392514 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:15 Blueprint Genetics SCV000206975.3 HFE - - - +9 Pathogenic Jul 25, 2019 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing unknown:18 Equipe Genetique des Anomalies du Developpement, Université de Bourgogne SCV000883106.1 HFE - - - +9 other May 31, 2018 - not provided C3661900:not provided criteria provided, single submitter clinical testing germline:124 Eurofins Ntd Llc (ga) SCV000230091.5 HFE Variant classified as "other reportable" ??? variant is clinically benign (not associated with disease) but is reported when observed (e.g. pseudodeficiency alleles). 
- - +9 Pathogenic Jul 01, 2024 Common pathogenic variant associated with hereditary hemochromatosis (PMID: 23953397, 8696333); Published functional studies demonstrate a damaging effect as C282Y results in a protein that does not reach the cell surface and is subject to accelerated degradation (PMID: 21243428, 9356458); This variant is associated with the following publications: (PMID: 9356458, 23792061, 32153640, 34490613, 26474245, 29969830, 23953397, 19084217, 19159930, 19271219, 20117027, 19176287, 24604426, 12707220, 22693327, 19258483, 20031541, 20031565, 9836708, 23121079, 20640879, 20946107, 22531912, 23178241, 20099304, 22611049, 20669231, 19820015, 21785125, 23222517, 21514009, 19429178, 22209421, 23281741, 20560808, 17450498, 8696333, 26501199, 27661980, 27659401, 26365338, 25916738, 27153395, 25767899, 11903355, 29555771, 30291871, 30374069, 15254010, 31019283, 31028937, 31640930, 29301508, 25287020, 32189932, 31447099, 30145563, 31980526, 26893171, 32228506, 34426522, 9630070, 9674544, 11336458, 11478530, 11531973, 11976822, 9382962, 10520044, 32641076, 11565552, 9858243, 19912313, 10792295, 11181289, 10090890, 11500063, 11189980, 32874917, 37937776, 27816425, 37443404, 38195192, 28399358, 29145899, 35499102, 27784128, 21243428) Not Provided C3661900:not provided criteria provided, single submitter clinical testing germline:na GeneDx SCV000329362.9 HFE - - - +9 Pathogenic Feb 02, 2022 The HFE c.845G>A (p.Cys282Tyr) missense variant results in the substitution of cysteine at amino acid position 282 to tyrosine. This variant is one of the two most common and well-studied pathogenic variants associated with HFE hemochromatosis. Approximately 60-90% of individuals of European ancestry with HFE hemochromatosis are homozygous for the variant and between 3-8% of individuals are compound heterozygous (Feder et al. 1996; Morrison et al. 2003; Gallego et al. 2015; Press et al. 2016; Barton and Edwards 2018). 
Disease penetrance for c.845G>A variant carriers is variable (Beutler et al. 2002; Pedersen et al. 2009; Gurrin et al. 2009), with homozygotes being at a greater risk for iron overload than compound heterozygotes (Gallego et al. 2015; Barton and Edwards 2018). The c.845G>A variant affects HFE protein activity by preventing the formation of a disulfide bridge in the alpha-3 domain, which impairs the beta-2-microglobulin interaction and prevents the protein from reaching the cell surface (Feder et al. 1997; Barton and Edwards 2018). The c.845G>A variant has a frequency of 5-7% in Caucasians (Press et al. 2016) and is reported at a frequency of 0.064660 in the European (non-Finnish) population (including 137 homozygotes) of the Genome Aggregation Database (version 3.1.2). This allele frequency is high but is consistent with estimates of disease prevalence. Based on the available evidence, the c.845G>A (p.Cys282Tyr) variant is classified as pathogenic for HFE hemochromatosis but is noted to have reduced penetrance. MedGen:C3469186 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing unknown:na Illumina Laboratory Services, Illumina SCV000461887.4 - - - - +9 Pathogenic Jun 05, 2017 The c.845G>A (p.Cys282Tyr) variant in the HFE gene in the homozygous state has been reported as a common cause of hereditary hemochromatosis with high penetrance of biochemically defined iron overload but low penetrance of clinically defined iron overload [OMIM:613609.0001; PMID 8896549, 10381492, 18199861]. This variant has been detected at high frequency in the ExAC population database (up to 5% in Europeans) (http://exac.broadinstitute.org/variant/6-26093141-G-A). Cysteine at amino acid position 282 of the HFE protein is highly conserved in mammals and computer-based algorithms predict this p.Cys282Tyr change to be deleterious. This variant is classified as pathogenic.
Apparent homozygosity of this variant may be caused by the presence of the mutant allele on both alleles of this individual, or the presence of a mutant allele on one allele and an exonic deletion on the opposite allele. Copy number variant (CNV) analysis or segregation analysis is necessary to assess the apparent homozygosity status of this variant. OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Human Genome Sequencing Center Clinical Lab, Baylor College of Medicine SCV000839959.1 HFE - - - +9 Pathogenic, low penetrance Jan 31, 2024 This sequence change replaces cysteine, which is neutral and slightly polar, with tyrosine, which is neutral and polar, at codon 282 of the HFE protein (p.Cys282Tyr). This variant is present in population databases (rs1800562, gnomAD 6%), and has an allele count higher than expected for a pathogenic variant. This is a common, low penetrance variant that is known to contribute to hemochromatosis when homozygous or present with a second pathogenic allele in HFE. As many as 90% of individuals of European descent who are affected with hemochromatosis are homozygous for this variant (PMID: 16132052, 26153218, 26365338). ClinVar contains an entry for this variant (Variation ID: 9). Advanced modeling of protein sequence and biophysical properties (such as structural, functional, and spatial information, amino acid conservation, physicochemical variation, residue mobility, and thermodynamic stability) performed at Invitae indicates that this missense variant is expected to disrupt HFE protein function with a positive predictive value of 80%. Experimental studies have shown that this missense change disrupts a disulfide bond in the Œ±3 domain of the HFE protein and impairs interaction of HFE with beta2-microglobulin, resulting in a block in intracellular transport and loss of cell surface expression of the Cys282Tyr variant protein (PMID: 9162021, 9356458). 
In summary, this variant is reported to cause disease. However, as this variant is associated with a lower penetrance than other pathogenic alleles in the HFE gene, it has been classified as Pathogenic (low penetrance). MedGen:C0392514 C0392514:Hereditary hemochromatosis criteria provided, single submitter clinical testing germline:na Labcorp Genetics (formerly Invitae), Labcorp SCV000219175.11 HFE - - - +9 Pathogenic Apr 15, 2021 PS3, PP5, PS4, PM3 Cardiomyopathy C0878544:Cardiomyopathy criteria provided, single submitter clinical testing germline:na Clinical Genetics Laboratory, Region Ostergotland SCV001984982.1 HFE - - - +9 Pathogenic Jul 31, 2024 - not provided C3661900:not provided criteria provided, single submitter clinical testing germline:na Center for Genomic Medicine, Rigshospitalet, Copenhagen University Hospital SCV002550674.6 HFE - - - +9 risk factor Mar 04, 2020 HFE c.845G>A (p.Cys282Tyr) has been associated with increased risk for hemochromatosis. This variant has been observed in multiple ethnic backgrounds with highest frequencies in individuals of European ancestry (5.7%, Genome Aggregation Database (gnomAD); rs1800562) and is present in ClinVar (ID: 9). A large meta-analysis has reported an odds ratio of 1.2 [95% CI 0.8-1.6] for developing liver disease in heterozygous carriers (Ellervik 2007). In vitro and in vivo functional studies provide some evidence that this variant may impact protein function (Ali-Rahmani 2011, Boucherma 2012). In summary, this variant is uncertain risk allele for hemochromatosis in heterozygous state. HFE c.845G>A (p.Cys282Tyr) has been associated with increased risk for hemochromatosis. This variant has been observed in multiple ethnic backgrounds with highest frequencies in individuals of European ancestry (5.7%, Genome Aggregation Database (gnomAD); rs1800562) and is present in ClinVar (ID: 9). 
A large meta-analysis has reported an odds ratio of 3.9 [95% CI 1.9-8.1] for developing liver disease in homozygous carriers (Ellervik 2007). In vitro and in vivo functional studies provide some evidence that this variant may impact protein function (Ali-Rahmani 2011, Boucherma 2012). In summary, this variant is established risk allele for hemochromatosis in homozygous state. Orphanet:ORPHA79230 C0268060:Juvenile hemochromatosis criteria provided, single submitter clinical testing germline:104 Laboratory for Molecular Medicine, Mass General Brigham Personalized Medicine SCV000221190.4 HFE - - - +9 Pathogenic Feb 23, 2021 Variant summary: HFE c.845G>A (p.Cys282Tyr) results in a non-conservative amino acid change located in the Immunoglobulin C1-set domain (IPR003597) of the encoded protein sequence. Five of five in-silico tools predict a damaging effect of the variant on protein function. The variant allele was found at a frequency of 0.033 in 251236 control chromosomes in the gnomAD database, including 244 homozygotes. c.845G>A has been reported in the literature as the most common mutation found in individuals with Hemochromatosis Type 1, being identified as homozygous or compound heterozygous with another pathogenic variant in approximately 80-90% of reported cases, most frequently in individuals of European ancestry (e.g. Feder_1996, LeGac_2004, Beutler_2002, Yonal_2007, vanGemmeren_2015, deTayrac_2015, Zhang_2020). These data indicate that the variant is likely to be associated with disease, however the variant appears to have significantly reduced penetrance, as the majority of homozygous or compound heterozygous individuals with this variant do not exhibit clinical symptoms of the disorder despite some cases having elevated serum ferritin and transferrin saturation levels (e.g. Feder_1996, Beutler_2002, Yonal_2007). 
The mechanisms behind the variable expressivity of this variant are not known, but it has been proposed that other genetic variants could modify the phenotype exhibited by individuals who are homozygous for this variant (e.g. LeGac_2004, deTayrac_2015). In-vitro experimental evidence suggests that the Cys282Tyr-mutant protein has impaired intracellular trafficking and accelerated degradation compared to wild-type HFE (e.g. Waheed_1997) and that cells expressing the variant have altered expression of genes involved in sphingolipid metabolism (e.g. Ali-Rahmani_2011). In addition, an in-vivo study reported a loss of CD8+ T-cell tolerance to HFE in transgenic mice expressing the C282Y variant (e.g. Boucherma_2012) . Seventeen clinical diagnostic laboratories have submitted clinical-significance assessments for this variant to ClinVar after 2014 without evidence for independent evaluation. Sixteen of these laboratories cited the variant as pathogenic/likely pathogenic or as a risk factor for disease. Based on the evidence outlined above, the variant was classified as pathogenic with low penetrance for developing Hemochromatosis. MedGen:C3469186 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Women's Health and Genetics/Laboratory Corporation of America, LabCorp SCV001519562.1 HFE - - - +9 Pathogenic Dec 12, 2023 - not provided C3661900:not provided criteria provided, single submitter clinical testing germline:na Clinical Genetics Laboratory, Skane University Hospital Lund SCV005198438.1 HFE - - - +9 Pathogenic Mar 08, 2022 - Not provided C3661900:not provided criteria provided, single submitter clinical testing germline:120 AiLife Diagnostics, AiLife Diagnostics SCV002502491.1 HFE - - - +9 Pathogenic Sep 27, 2021 The c.845G>A (p.C282Y) alteration is located in coding exon 4 of the HFE gene. 
This alteration results from a G to A substitution at nucleotide position 845, causing the cysteine (C) at amino acid position 282 to be replaced by a tyrosine (Y). Based on data from gnomAD, the A allele has an overall frequency of 3.38% (9544/282608) total alleles studied. The highest observed frequency was 5.77% (7435/128950) of European (non-Finnish) alleles. This alteration is the most common cause of hereditary hemochromatosis (Allen, 2008). In homozygous individuals, up to 50% may develop iron overload, with 10-33% developing hemochromatosis-associated morbidity (EASL, 2010). Men appear to have a higher risk for disease development than women. In homozygous men, 84% display elevated transferrin-iron saturation and 88% have elevated serum ferritin concentration. In comparison, fewer homozygous women have elevated transferrin-iron saturation and serum ferritin concentration (73% and 57%, respectively). However, when p.C282Y is compound heterozygous with another pathogenic alteration, disease penetrance is significantly lower (Adams, 1997). This amino acid position is highly conserved in available vertebrate species. Functional studies have shown that this alteration leads to impaired intracellular transport of the protein and degradation before reaching the cell surface (Feder, 1997; Waheed, 1997). This alteration is predicted to be deleterious by in silico analysis. Based on the available evidence, this alteration is classified as pathogenic. 
MedGen:C0950123 C0950123:Inborn genetic diseases criteria provided, single submitter clinical testing germline:na Ambry Genetics SCV003702847.3 HFE - - - +9 Pathogenic Dec 25, 2023 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing unknown:na Baylor Genetics SCV001523198.4 - - - - +9 Pathogenic - The HFE p.Cys282Tyr variant is a common variant known to cause hereditary hemochromatosis; over 80% of hereditary hemochromatosis patients are homozygous for the p.C282Y variant (Feder_1996_PMID:8696333; Morrison_2003_PMID:12693884). The variant was identified in dbSNP (ID: rs1800562), in ClinVar (classified as pathogenic 13 times, likely pathogenic once and as a VUS once) and LOVD 3.0 (classified as pathogenic). The variant was identified in control databases in 9544 of 282608 chromosomes (276 homozygous) at a frequency of 0.033771 increasing the likelihood this could be a low frequency benign variant (Genome Aggregation Database Feb 27, 2017). The variant was observed in the following populations: European (non-Finnish) in 7435 of 128950 chromosomes (freq: 0.05766), Other in 281 of 7224 chromosomes (freq: 0.0389), European (Finnish) in 879 of 25108 chromosomes (freq: 0.03501), Latino in 494 of 35430 chromosomes (freq: 0.01394), Ashkenazi Jewish in 124 of 10366 chromosomes (freq: 0.01196), African in 260 of 24962 chromosomes (freq: 0.01042), South Asian in 68 of 30616 chromosomes (freq: 0.002221), and East Asian in 3 of 19952 chromosomes (freq: 0.00015). The p.Cys282 residue is conserved across mammals and other organisms, and four out of five computational analyses (PolyPhen-2, SIFT, AlignGVGD, BLOSUM, MutationTaster) suggest that the variant may impact the protein. Functional studies of the p.C282Y variant have demonstrated abnormal protein interaction, expression, processing and localization (Feder_1997_PMID:9162021; Waheed_1997_PMID:9356458). 
In summary, based on the above information this variant meets our laboratory’s criteria to be classified as pathogenic. not provided C3661900:not provided no assertion criteria provided clinical testing unknown:na Department of Pathology and Laboratory Medicine, Sinai Health System SCV001549492.1 HFE - - - +9 Pathogenic Nov 05, 2023 This sequence change in HFE is predicted to replace cysteine with tyrosine at codon 282, p.(Cys282Tyr). The cysteine residue is highly conserved (100 vertebrates, UCSC), and alters a critical cysteine residue involved in a disulfide bond in the Ig-like C2 type domain and prevents HFE protein presentation (PMID: 20301613). There is a large physicochemical difference between cysteine and tyrosine. The highest population minor allele frequency in the population database gnomAD v2.1 is 5.6% (7,345/128,950 alleles, 243 homozygotes) in the European non-Finnish population. This variant is reported as the common cause of HFE-related haemochromatosis. It has been reported in multiple individuals with haemochromatosis who were either homozygous or compound heterozygous for the variant (PMID: 19159930, 32153640, 11903354). The variant has been reported to segregate with haemochromatosis in multiple affected individuals from unrelated families (PMID: 10575540, 27518069). In vitro functional assays with limited validation showed a significant impairment to protein trafficking and accelerated protein degradation indicating that this variant impacts protein function (PMID: 9162021, 9356458). A transgenic mouse model for the variant showed an increased predisposition to iron loading (PMID: 10381492). Computational evidence predicts a deleterious effect for the missense substitution (REVEL = 0.872). Based on the classification scheme RMH Modified ACMG/AMP Guidelines v1.6.1, this variant is classified as PATHOGENIC. Following criteria are met: BS1, PM3_VeryStrong, PM1, PP1_Strong, PP3, PS3_Moderate. 
MONDO:MONDO:0021001 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Molecular Genetics, Royal Melbourne Hospital SCV004812520.1 HFE - - - +9 Pathogenic Jun 30, 2022 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:16 MGZ Medical Genetics Center SCV002580992.1 HFE - - - +9 Pathogenic - The HFE c.845G>A (p.C282Y) variant is a pathogenic variant observed in 3.4% of the human population (gnomAD). Individuals that are homozygous for the p.C282Y variant have a greater risk of developing iron overload compared to individuals with compound heterozygous variants (i.e. c.845G>A p.C282Y and c.187C>G p.H63D in trans) (PMID: 20301613). MedGen:C3469186 C3469186:Hemochromatosis type 1 criteria provided, single submitter research germline:4 UNC Molecular Genetics Laboratory, University of North Carolina at Chapel Hill SCV001251531.1 HFE - - - +9 Pathogenic - - OMIM:235200 C3469186:Hemochromatosis type 1 no assertion criteria provided research unknown:na Genomics And Bioinformatics Analysis Resource, Columbia University SCV004024088.1 HFE - - - +9 Pathogenic Aug 01, 2024 HFE: PM3:Very Strong, PS3, PM2:Supporting MedGen:CN517202 C3661900:not provided criteria provided, single submitter clinical testing germline:95 CeGaT Center for Human Genetics Tuebingen SCV001246053.24 HFE - - - +9 Uncertain significance Apr 12, 2014 - Human Phenotype Ontology:HP:0000992;Human Phenotype Ontology:HP:0010473 C0151861:Porphyrinuria;C0349506:Cutaneous photosensitivity flagged submission clinical testing unknown:na Centre for Mendelian Genomics, University Medical Centre Ljubljana SCV000493004.1 HFE - - - +9 Pathogenic Jun 17, 2022 ACMG Criteria: PS3, PS4, PM3, PP1_M, PP5; Variant was found in compound heterozygous state with NM_000410.4:c.187C>G. 
OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Institute of Immunology and Genetics Kaiserslautern SCV005382109.1 - - - - +9 not provided - Variant classified as Pathogenic and reported on 05-24-2022 by GeneDx. Assertions are reported exactly as they appear on the patient provided laboratory report. GenomeConnect does not attempt to reinterpret the variant. The IDDRC-CTSA National Brain Gene Registry (BGR) is a study funded by the U.S. National Center for Advancing Translational Sciences (NCATS) and includes 13 Intellectual and Developmental Disability Research Center (IDDRC) institutions. The study is led by Principal Investigator Dr. Philip Payne from Washington University. The BGR is a data commons of gene variants paired with subject clinical information. This database helps scientists learn more about genetic changes and their impact on the brain and behavior. Participation in the Brain Gene Registry requires participation in GenomeConnect. More information about the Brain Gene Registry can be found on the study website - https://braingeneregistry.wustl.edu/. MedGen:C0392514 C0392514:Hereditary hemochromatosis no classification provided phenotyping only biparental:1 GenomeConnect - Brain Gene Registry SCV003931195.1 HFE - - - +9 Pathogenic May 20, 2023 The missense variant c.845G>A(p.Cys282Tyr) in HFE gene has been reported in homozygous state in multiple individuals affected with hemochromatosis (Porto G et. al., 2016; Gallego et. al., 2015). Experimental studies have shown that this missense change disrupts a disulfide bond in the Œ±3 domain of the HFE protein and impairs interaction of HFE with beta2-microglobulin, resulting in a block in intracellular transport and loss of cell surface expression of the Cys282Tyr variant protein (Waheed et. al., 1997). The observed variant has allele frequency of 3.3% in gnomAD exomes database. 
This variant has been submitted to the ClinVar database as risk factor / Uncertain Significance / Pathogenic (multiple submissions). The reference amino acid change p.Cys282Tyr in HFE is predicted as conserved by GERP++ and PhyloP across 100 vertebrates. The amino acid Cys at position 282 is changed to a Tyr changing protein sequence and it might alter its composition and physico-chemical properties. For these reasons, this variant has been classified as Pathogenic. OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Neuberg Centre For Genomic Medicine, NCGM SCV005382430.1 HFE - - - +9 Pathogenic May 28, 2019 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing unknown:na Mendelics SCV001137062.1 HFE - - - +9 Pathogenic Dec 21, 2023 Based on the classification scheme VCGS_Germline_v1.3.4, this variant is classified as Pathogenic. Following criteria are met: 0102 - Loss of function is a known mechanism of disease in this gene and is associated with haemochromatosis (MIM#235200). (I) 0106 - This gene is associated with autosomal recessive disease. (I) 0112 - The condition associated with this gene has incomplete penetrance. The highest biochemical and clinical penetrance has been reported in p.(Cys282Tyr) homozygotes (PMID: 20301613). (I) 0200 - Variant is predicted to result in a missense amino acid change from cysteine to tyrosine. (I) 0252 - This variant is homozygous. (I) 0307 - Variant is present in gnomAD at a frequency >=0.05 (v2: 8992 heterozygotes, 276 homozygotes). (SB) 0501 - Missense variant consistently predicted to be damaging by multiple in silico tools or highly conserved with a major amino acid change. (SP) 0600 - Variant is located in the annotated IgC MHC I alpha3 functional domain (NCBI). (I) 0801 - This variant has very strong previous evidence of pathogenicity in unrelated individuals. 
It has previously been described as pathogenic in multiple patients with haemochromatosis (ClinVar; PMIDs: 37260121, 9162021, 19159930); either in a homozygous state or in trans with NP_000401.1(HFE):p.(His63Asp). (SP) 1002 - This variant has moderate functional evidence supporting abnormal protein function. Functional analysis using transfected cell lines showed defects in HFE protein intracellular transport and cell surface expression (PMID: 9162021). (SP) 1208 - Inheritance information for this variant is not currently available in this individual. (I) Legend: (SP) - Supporting pathogenic, (I) - Information, (SB) - Supporting benign OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Victorian Clinical Genetics Services, Murdoch Childrens Research Institute SCV005086593.1 HFE - - - +9 Pathogenic Oct 25, 2022 - MedGen:CN235283 C3661900:not provided criteria provided, single submitter clinical testing germline:na ARUP Laboratories, Molecular Genetics and Genomics, ARUP Laboratories SCV003799234.2 - - - - +9 not provided - - OMIM:235200 C3469186:Hemochromatosis type 1 no classification provided literature only germline:na GeneReviews SCV000245793.3 - - - - +9 Pathogenic Mar 30, 2021 HFE NM_000410.3 exon 4 p.Cys282Tyr (c.845G>A): This variant has been reported in the literature in the homozygous or compound heterozygous state in many individuals with hereditary hemochromatosis (HH) (Allen 2008 PMID:18199861, Pederson 2009 PMID:19159930, Cezard 2014 PMID:23953397, Gallego 2015 PMID:26365338) and is reported to be the most common cause of HH (Le Gac 2005 PMID:16132052, Gallego 2015 PMID:26365338, Porto 2016 PMID:26153218). This variant is present in 3.3% (9544/282608) of total alleles in the Genome Aggregation Database, including 276 homozygotes (https://gnomad.broadinstitute.org/variant/6-26093141-G-A). 
Please note, disease causing variants may be present in control databases at low frequencies, reflective of the general population, carrier status, and/or variable expressivity. This variant is present in ClinVar, with several labs classifying this variant as pathogenic (Variation ID:9). Evolutionary conservation and computational predictive tools suggest that this variant may impact the protein. In addition, an in vivo mouse study showed postnatal iron loading in mice homozygous for this variant (Levy 1999 PMID:10381492), and in vitro functional studies have shown that the mutant protein is retained in the ER and is unable to interact with beta2-microglobulin (Feder 1997 PMID:9162021, Waheed 1997 PMID:9356458). However, these studies may not accurately represent in vivo biological function. In summary, this variant is classified as pathogenic based on the data above. OMIM:104300;OMIM:176100;OMIM:176200;OMIM:235200;OMIM:612635;OMIM:614193 C0162532:Variegate porphyria;C0268323:Familial porphyria cutanea tarda;C1863052:Alzheimer disease type 1;C2673520:Microvascular complications of diabetes, susceptibility to, 7;C3280096:Transferrin serum level quantitative trait locus 2;C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Center for Genomics, Ann and Robert H. 
Lurie Children's Hospital of Chicago SCV003920032.1 HFE - - - +9 Benign Jan 01, 2009 - HEMOCHROMATOSIS, TYPE 1 C3469186:Hemochromatosis type 1 no assertion criteria provided literature only germline:na OMIM SCV000020162.9 HFE - - - +10 Likely pathogenic Mar 26, 2024 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Center for Genomic Medicine, King Faisal Specialist Hospital and Research Center SCV004806939.1 - - - - +10 Pathogenic Mar 28, 2023 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing biparental:na Genomic Medicine Lab, University of California San Francisco SCV004847117.1 HFE - - - +10 Pathogenic - The HFE c.187C>G (p.H63D) variant is a pathogenic variant seen in 10.8% of the human population in gnomAD. Indviduals with the p.H63D variant are considered carriers of hemochromatosis, although this variant is associated with less severe iron overload and reduced penetrance compared to another pathogenic HFE variant, c.845G>A, p.C282Y (PMID: 19159930; 20301613). MedGen:C3469186 C3469186:Hemochromatosis type 1 criteria provided, single submitter research germline:11 UNC Molecular Genetics Laboratory, University of North Carolina at Chapel Hill SCV001251532.1 HFE - - - +10 Pathogenic May 13, 2021 • The p.His63Asp variant in the HFE gene has been identified in the homozygous state in approximately 1% of individuals of European ancestry with HFE hemochromatosis, and in the compound heterozygous state with p.Cys282Tyr in approximately 3-8% of individuals of European ancestry with HFE hemochromatosis (Barton and Edwards, 2018). • The p.His63Asp variant is described as a low-penetrant allele and is rarely associated with clinical disease in the homozygous or compound heterozygous state (Gochee et al., 2002; Gurrin et al., 2009). 
• Individuals heterozygous for the p.His63Asp variant may demonstrate evidence of biochemical disease, including mildly elevated serum transferrin-iron saturation and serum ferritin concentration, but do not develop clinical manifestations of disease (Allen et al., 2008; Pedersen and Milman, 2009). • This variant has been identified in 18,635/129,168 European (non-Finnish) chromosomes (30,592/282,844 chromosomes overall) by the Genome Aggregation Database (http://gnomad.broadinstitute.org/). Although the p.His63Asp variant is seen at a frequency greater than 5% in the general population, this variant is recognized as a common low-penetrant variant that is an exception to ACMG/AMP classification guidelines (Ghosh et al., 2018). • These data were assessed using the ACMG/AMP variant interpretation guidelines. In summary, there is sufficient evidence to classify the p.His63Asp variant as pathogenic for autosomal recessive HFE hemochromatosis based on the information above. [ACMG evidence codes used: PS4] OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:1 Clinical Genomics Laboratory, Stanford Medicine SCV004801387.1 HFE - - - +10 Pathogenic - The observed missense c.187C>G(p.His63Asp) variant in HFE gene has been reported previously in homozygous or compound heterozygous state in multiple individuals affected with hemochromatosis (Atkins et al., 2022), however, penetrance of the homozygous genotype is very low and is associated with variable phenotypes. Experimental studies have shown that this missense change affects HFE function (Tomatsu et al., 2003). This variant has been reported with the high allele frequency of 10.9% in the gnomAD Exomes. This variant has been submitted to the ClinVar database with Benign / Uncertain Significance / Risk factor / Pathogenic (multiple submitters). 
The amino acid His at position 63 is changed to a Asp changing protein sequence and it might alter its composition and physico-chemical properties. The amino acid change p.His63Asp in HFE is predicted as conserved by GERP++ and PhyloP across 100 vertebrates.Though the variant frequency is very high in the population, the variant is enriched in patints with HFE hemochromatosis as compared to the general population (Burke et al., 2000). For these reasons, this variant has been classified as a Pathogenic variant which acts as a risk factor for the development of the disease. OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Neuberg Centre For Genomic Medicine, NCGM SCV005061024.1 HFE - - - +10 Pathogenic Dec 21, 2021 ACMG classification criteria: PS3, PS4, PM3 OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Laboratorio de Genetica e Diagnostico Molecular, Hospital Israelita Albert Einstein SCV004183355.1 HFE - - - +10 Pathogenic Nov 25, 2023 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing unknown:na Baylor Genetics SCV001523197.4 - - - - +10 Pathogenic Feb 01, 2024 HFE: PM3:Strong, PM1, PP4:Moderate, PS3:Moderate, PM2:Supporting MedGen:CN517202 C3661900:not provided criteria provided, single submitter clinical testing germline:118 CeGaT Center for Human Genetics Tuebingen SCV001154674.25 HFE - - - +10 Pathogenic Mar 25, 2022 - Not provided C3661900:not provided criteria provided, single submitter clinical testing germline:237 AiLife Diagnostics, AiLife Diagnostics SCV002502480.1 HFE - - - +10 Pathogenic May 23, 2023 - Not provided C3661900:not provided criteria provided, single submitter clinical testing germline:62 Mayo Clinic Laboratories, Mayo Clinic SCV001715880.3 HFE - - - +10 Pathogenic Feb 26, 2021 Variant summary: HFE c.187C>G (p.His63Asp) results in a non-conservative amino acid change 
located in the MHC class I-like antigen recognition-like domain (IPR011161) of the encoded protein sequence. Three of five in-silico tools predict a benign effect of the variant on protein function. The variant allele was found at a frequency of 0.11 in 251484 control chromosomes in the gnomAD database, including 1832 homozygotes. c.187C>G has been reported as a common disease variant in the literature in individuals affected with Hemochromatosis Type 1, in both homozygous and compound heterozygous states, but most frequently in trans with the most common disease variant c.845G>A (p.Cys282Tyr) (e.g. Feder_1996, Kelley_2014). These data indicate that the variant is likely to be associated with disease, however the variant appears to have very low penetrance, as the majority of homozygous or compound heterozygous individuals with this variant do not exhibit clinical symptoms of hemochromatosis despite some cases having elevated serum ferritin and transferrin saturation levels (e.g. Beutler_2002, Pedersen_2009). Several publications report experimental evidence evaluating an impact on protein function. While p.His63Asp was shown to have normal levels of association with beta2-globulin and expression of HFE on the cell surface in contrast to impairment observed in cells with the other common pathogenic variant p.Cys282Tyr (e.g. Waheed_1997), p.His63Asp was shown to induce ER-stress in-vitro and in a transgenic mouse model (e.g. Liu_2011). Transgenic mice expressing the murine equivalent of this variant were also reported to have increased iron storage and decreased levels of iron mobilization at 12 months of age (e.g. Nandar_2013). The variant has also been reported to alter the expression levels of several genes involved in sphingolipid metabolism (e.g. Ali-Rahmani_2011) and to affect cellular glutamate levels (e.g. Mitchell_2011). 
Sixteen clinical diagnostic laboratories have submitted clinical-significance assessments for this variant to ClinVar after 2014 without evidence for independent evaluation. Thirteen of these submitters report the variant as either Pathogenic or a risk factor for disease. Based on the evidence outlined above, the variant was classified as pathogenic with very low penetrance in association with Hemochromatosis. MedGen:C3469186 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:na Women's Health and Genetics/Laboratory Corporation of America, LabCorp SCV001519563.1 HFE - - - +10 Pathogenic Dec 27, 2022 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:1 Institute of Human Genetics Munich, Klinikum Rechts Der Isar, TU München SCV004045959.1 HFE - - - +10 Pathogenic Jan 13, 2020 Contributing pathogenic variant when co-inherited with other pathogenic variants in HFE or PPOX genes, but not pathogenic alone, even in the homozygous state. OMIM:176200 C0162532:Variegate porphyria criteria provided, single submitter clinical testing germline:na Genetics and Molecular Pathology, SA Pathology SCV002556586.2 HFE - - - +10 Pathogenic Oct 09, 2023 - OMIM:235200 C3469186:Hemochromatosis type 1 no assertion criteria provided clinical testing germline:na Zotz-Klimas Genetics Lab, MVZ Zotz Klimas SCV004041642.1 - - - - +10 other Jun 26, 2018 - not provided C3661900:not provided criteria provided, single submitter clinical testing germline:348 Eurofins Ntd Llc (ga) SCV000227124.5 HFE Variant classified as "other reportable" ??? variant is clinically benign (not associated with disease) but is reported when observed (e.g. pseudodeficiency alleles). 
- - +10 Pathogenic May 15, 2023 - OMIM:235200 C3469186:Hemochromatosis type 1 criteria provided, single submitter clinical testing germline:2 New York Genome Center SCV004046529.2 - - - - +10 Pathogenic Jul 31, 2024 - not provided C3661900:not provided criteria provided, single submitter clinical testing germline:na Center for Genomic Medicine, Rigshospitalet, Copenhagen University Hospital SCV002568070.6 HFE - - - +10 Pathogenic Jan 24, 2024 - not provided C3661900:not provided criteria provided, single submitter clinical testing germline:na Clinical Genetics Laboratory, Skane University Hospital Lund SCV005198437.1 HFE - - - diff --git a/v03_pipeline/var/test/reference_data/test_hgmd.vcf b/v03_pipeline/var/test/reference_datasets/raw/test_hgmd.vcf similarity index 100% rename from v03_pipeline/var/test/reference_data/test_hgmd.vcf rename to v03_pipeline/var/test/reference_datasets/raw/test_hgmd.vcf diff --git a/v03_pipeline/var/test/reference_datasets/raw/test_mitomap.csv b/v03_pipeline/var/test/reference_datasets/raw/test_mitomap.csv new file mode 100644 index 000000000..24491c2c0 --- /dev/null +++ b/v03_pipeline/var/test/reference_datasets/raw/test_mitomap.csv @@ -0,0 +1,4 @@ +"Index","Locus Type","Locus","Associated Diseases","Allele","Position","aaΔ or RNA","Status ♣(Mitomap [ClinGen])","Last StatusUpdate" +"1","tRNA","MT-TF","MELAS / MM & EXIT","m.583G>A","583","tRNA Phe","Cfrm [VUS*]","2022.10.10" +"2","tRNA","MT-TF","Gitelman-like syndrome","m.591C>T","591","tRNA Phe","Cfrm [LP]","2024.07.22" +"3","tRNA","MT-TF","Maternally inherited epilepsy / mito tubulointerstitial kidney disease (MITKD) / Gitelman-like syndrome","m.616T>C","616","tRNA Phe","Cfrm [LP]","2022.06.13"