Skip to content

use predetermined filters/imputed_sex paths #814

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Jun 21, 2024
4 changes: 0 additions & 4 deletions v03_pipeline/lib/misc/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ def import_callset(
callset_path: str,
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
filters_path: str | None = None,
) -> hl.MatrixTable:
if dataset_type == DatasetType.GCNV:
mt = import_gcnv_bed_file(callset_path)
Expand All @@ -131,9 +130,6 @@ def import_callset(
mt = hl.read_matrix_table(callset_path)
if dataset_type == DatasetType.SV:
mt = mt.annotate_rows(variant_id=mt.rsid)
if filters_path:
filters_ht = import_vcf(filters_path, reference_genome).rows()
mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
return mt.key_rows_by(*dataset_type.table_key_type(reference_genome).fields)


Expand Down
8 changes: 7 additions & 1 deletion v03_pipeline/lib/model/dataset_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import hail as hl

from v03_pipeline.lib.annotations import gcnv, mito, shared, snv_indel, sv
from v03_pipeline.lib.model.definitions import ReferenceGenome
from v03_pipeline.lib.model.definitions import ReferenceGenome, SampleType

MITO_MIN_HOM_THRESHOLD = 0.95
ZERO = 0.0
Expand Down Expand Up @@ -155,6 +155,12 @@ def has_gencode_ensembl_to_refseq_id_mapping(
self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38
)

def expect_filters(self, sample_type: SampleType) -> bool:
    """Report whether filters are expected for this dataset/sample type pair.

    Returns True only when this dataset type is ``SNV_INDEL`` and
    ``sample_type`` is ``SampleType.WES``; every other combination is False.
    """
    is_snv_indel = self == DatasetType.SNV_INDEL
    is_exome = sample_type == SampleType.WES
    return is_snv_indel and is_exome

@property
def has_gencode_gene_symbol_to_gene_id_mapping(self) -> bool:
    """True only when this dataset type is ``SV``."""
    is_sv = self == DatasetType.SV
    return is_sv
Expand Down
25 changes: 19 additions & 6 deletions v03_pipeline/lib/model/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
from dataclasses import dataclass

# NB: using os.environ.get inside the dataclass defaults gives a lint error.
ACCESS_PRIVATE_REFERENCE_DATASETS = (
os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
)
REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108
HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data')
LIFTOVER_REF_PATH = os.environ.get(
'LIFTOVER_REF_PATH',
'gs://hail-common/references/grch38_to_grch37.over.chain.gz',
)
LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp')
PRIVATE_REFERENCE_DATASETS = os.environ.get(
'PRIVATE_REFERENCE_DATASETS',
Expand All @@ -19,22 +19,35 @@
)
VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None)
VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None)
SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'

# Allele registry secrets :/
ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
PROJECT_ID = os.environ.get('PROJECT_ID', None)

# Feature Flags
ACCESS_PRIVATE_REFERENCE_DATASETS = (
os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
)
CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1'
EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1'
REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'


@dataclass
class Env:
ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS
ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME
REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
HAIL_TMPDIR: str = HAIL_TMPDIR
HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA
LIFTOVER_REF_PATH: str = LIFTOVER_REF_PATH
LOADING_DATASETS: str = LOADING_DATASETS
PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS
PROJECT_ID: str | None = PROJECT_ID
REFERENCE_DATASETS: str = REFERENCE_DATASETS
REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES
VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH
VEP_CONFIG_URI: str | None = VEP_CONFIG_URI
36 changes: 36 additions & 0 deletions v03_pipeline/lib/paths.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import hashlib
import os
import re

from v03_pipeline.lib.model import (
AccessControl,
Expand All @@ -9,6 +10,7 @@
PipelineVersion,
ReferenceDatasetCollection,
ReferenceGenome,
SampleType,
)


Expand Down Expand Up @@ -73,6 +75,22 @@ def family_table_path(
)


def imputed_sex_path(
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
    callset_path: str,
) -> str:
    """Build the path of the imputed sex TSV associated with a callset.

    The file lives in an ``imputed_sex`` directory beneath the v03 pipeline
    prefix rooted at ``Env.LOADING_DATASETS`` for the given reference genome
    and dataset type. The file name is the SHA-256 hex digest of
    ``callset_path`` (UTF-8 encoded) with a ``.tsv`` extension.
    """
    loading_prefix = _v03_pipeline_prefix(
        Env.LOADING_DATASETS,
        reference_genome,
        dataset_type,
    )
    callset_digest = hashlib.sha256(callset_path.encode('utf8')).hexdigest()
    file_name = f'{callset_digest}.tsv'
    return os.path.join(loading_prefix, 'imputed_sex', file_name)


def imported_callset_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
Expand Down Expand Up @@ -198,6 +216,24 @@ def sex_check_table_path(
)


def valid_filters_path(
    dataset_type: DatasetType,
    sample_type: SampleType,
    callset_path: str,
) -> str | None:
    """Derive the filters VCF glob that accompanies a callset, if any.

    Args:
        dataset_type: Dataset type of the callset.
        sample_type: Sample type of the callset.
        callset_path: Path (or glob) of the callset; a filters path is only
            derived when it contains a ``part_one_outputs/`` segment.

    Returns:
        ``callset_path`` with everything from ``part_one_outputs/`` onwards
        replaced by ``part_two_outputs/*.filtered.*.vcf.gz``, or ``None`` when
        ``Env.EXPECT_WES_FILTERS`` is off, the dataset/sample type pair does
        not expect filters, or the callset path has no ``part_one_outputs/``
        segment to rewrite.
    """
    if not Env.EXPECT_WES_FILTERS or not dataset_type.expect_filters(sample_type):
        return None
    filters_path, substitutions = re.subn(
        r'part_one_outputs/.*$',
        'part_two_outputs/*.filtered.*.vcf.gz',
        callset_path,
    )
    # A bare substring check for 'part_one_outputs' would let through paths
    # the pattern cannot rewrite (e.g. '.../part_one_outputs_v2/...'), handing
    # the callset path itself back as the filters path. Only report a path
    # when the rewrite actually happened.
    if not substitutions:
        return None
    return filters_path


def valid_reference_dataset_collection_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
Expand Down
33 changes: 33 additions & 0 deletions v03_pipeline/lib/paths_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,21 @@
DatasetType,
ReferenceDatasetCollection,
ReferenceGenome,
SampleType,
)
from v03_pipeline.lib.paths import (
cached_reference_dataset_query_path,
family_table_path,
imported_callset_path,
imputed_sex_path,
lookup_table_path,
metadata_for_run_path,
new_variants_table_path,
project_table_path,
relatedness_check_table_path,
remapped_and_subsetted_callset_path,
sex_check_table_path,
valid_filters_path,
valid_reference_dataset_collection_path,
variant_annotations_table_path,
)
Expand Down Expand Up @@ -54,6 +57,26 @@ def test_family_table_path(self) -> None:
'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht',
)

def test_valid_filters_path(self) -> None:
    """Check valid_filters_path for a MITO callset and a patched-Env SNV_INDEL callset."""
    part_one_callset = 'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz'

    # MITO/WES with the unpatched Env yields no filters path.
    mito_filters_path = valid_filters_path(
        DatasetType.MITO,
        SampleType.WES,
        part_one_callset,
    )
    self.assertEqual(mito_filters_path, None)

    # SNV_INDEL/WES with EXPECT_WES_FILTERS enabled maps to part two outputs.
    with patch('v03_pipeline.lib.paths.Env') as mock_env:
        mock_env.EXPECT_WES_FILTERS = True
        snv_indel_filters_path = valid_filters_path(
            DatasetType.SNV_INDEL,
            SampleType.WES,
            part_one_callset,
        )
        self.assertEqual(
            snv_indel_filters_path,
            'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
        )

def test_project_table_path(self) -> None:
self.assertEqual(
project_table_path(
Expand Down Expand Up @@ -162,6 +185,16 @@ def test_imported_callset_path(self) -> None:
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
)

def test_imputed_sex_path(self) -> None:
    """Check the imputed sex path built for a GRCh38 SNV_INDEL callset."""
    expected_path = '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv'
    actual_path = imputed_sex_path(
        ReferenceGenome.GRCh38,
        DatasetType.SNV_INDEL,
        'gs://abc.efg/callset.vcf.gz',
    )
    self.assertEqual(actual_path, expected_path)

def test_new_variants_table_path(self) -> None:
self.assertEqual(
new_variants_table_path(
Expand Down
26 changes: 8 additions & 18 deletions v03_pipeline/lib/tasks/base/base_loading_run_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,37 +10,27 @@ class BaseLoadingRunParams(luigi.Task):
# but nothing else.
sample_type = luigi.EnumParameter(enum=SampleType)
callset_path = luigi.Parameter()
# HINT: OptionalParameter vs Parameter is significant here.
# The default Parameter will cast `None` to the string "None".
imputed_sex_path = luigi.OptionalParameter(
default=None,
description='Optional path to a tsv of imputed sex values from the DRAGEN GVS pipeline.',
)
filters_path = luigi.OptionalParameter(
default=None,
description='Optional path to part two outputs from callset (VCF shards containing filter information)',
)
ignore_missing_samples_when_remapping = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
validate = luigi.BoolParameter(
default=True,
force = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
force = luigi.BoolParameter(
skip_check_sex_and_relatedness = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
check_sex_and_relatedness = luigi.BoolParameter(
skip_expect_filters = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
skip_validation = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
is_new_gcnv_joint_call = luigi.BoolParameter(
default=False,
description='Is this a fully joint-called callset.',
)
liftover_ref_path = luigi.OptionalParameter(
default='gs://hail-common/references/grch38_to_grch37.over.chain.gz',
description='Path to GRCh38 to GRCh37 coordinates file',
)
7 changes: 2 additions & 5 deletions v03_pipeline/lib/tasks/update_lookup_table_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
)
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase

TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz'
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
Expand All @@ -26,8 +25,7 @@ def test_skip_update_lookup_table_task(self) -> None:
], # a project excluded from the lookup table
project_remap_paths=[TEST_REMAP],
project_pedigree_paths=[TEST_PEDIGREE_3],
validate=False,
liftover_ref_path=TEST_LIFTOVER,
skip_validation=True,
)
worker.add(uslt_task)
worker.run()
Expand Down Expand Up @@ -58,8 +56,7 @@ def test_update_lookup_table_task(self) -> None:
project_guids=['R0113_test_project'],
project_remap_paths=[TEST_REMAP],
project_pedigree_paths=[TEST_PEDIGREE_3],
validate=False,
liftover_ref_path=TEST_LIFTOVER,
skip_validation=True,
)
worker.add(uslt_task)
worker.run()
Expand Down
4 changes: 1 addition & 3 deletions v03_pipeline/lib/tasks/update_project_table_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase

TEST_LIFTOVER = 'v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz'
TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv'
TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv'
Expand All @@ -22,8 +21,7 @@ def test_update_project_table_task(self) -> None:
project_guid='R0113_test_project',
project_remap_path=TEST_REMAP,
project_pedigree_path=TEST_PEDIGREE_3,
validate=False,
liftover_ref_path=TEST_LIFTOVER,
skip_validation=True,
)
worker.add(upt_task)
worker.run()
Expand Down
Loading
Loading