Skip to content

Dev #833

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 8, 2024
Merged

Dev #833

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 3 additions & 54 deletions v03_pipeline/lib/misc/callsets.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,11 @@
from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path


def get_callset_ht( # noqa: PLR0913
def get_callset_ht(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
callset_paths: list[str],
callset_path: str,
project_guids: list[str],
project_remap_paths: list[str],
project_pedigree_paths: list[str],
imputed_sex_paths: list[str] | None,
):
callset_hts = [
hl.read_matrix_table(
Expand All @@ -24,58 +21,10 @@ def get_callset_ht( # noqa: PLR0913
project_guid,
),
).rows()
for (callset_path, project_guid, _, _, _) in callset_project_pairs(
callset_paths,
project_guids,
project_remap_paths,
project_pedigree_paths,
imputed_sex_paths,
)
for project_guid in project_guids
]
callset_ht = functools.reduce(
(lambda ht1, ht2: ht1.union(ht2, unify=True)),
callset_hts,
)
return callset_ht.distinct()


def callset_project_pairs(
callset_paths: list[str],
project_guids: list[str],
project_remap_paths: list[str],
project_pedigree_paths: list[str],
imputed_sex_paths: list[str] | None,
):
if len(callset_paths) == len(project_guids):
return zip(
callset_paths,
project_guids,
project_remap_paths,
project_pedigree_paths,
imputed_sex_paths
if imputed_sex_paths is not None
else [None] * len(callset_paths),
strict=True,
)
return (
(
callset_path,
project_guid,
project_remap_path,
project_pedigree_path,
imputed_sex_path,
)
for callset_path, imputed_sex_path in zip(
callset_paths,
imputed_sex_paths
if imputed_sex_paths is not None
else [None] * len(callset_paths),
strict=False,
)
for (project_guid, project_remap_path, project_pedigree_path) in zip(
project_guids,
project_remap_paths,
project_pedigree_paths,
strict=True,
)
)
4 changes: 0 additions & 4 deletions v03_pipeline/lib/misc/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,6 @@ def import_callset(
callset_path: str,
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
filters_path: str | None = None,
) -> hl.MatrixTable:
if dataset_type == DatasetType.GCNV:
mt = import_gcnv_bed_file(callset_path)
Expand All @@ -131,9 +130,6 @@ def import_callset(
mt = hl.read_matrix_table(callset_path)
if dataset_type == DatasetType.SV:
mt = mt.annotate_rows(variant_id=mt.rsid)
if filters_path:
filters_ht = import_vcf(filters_path, reference_genome).rows()
mt = mt.annotate_rows(filters=filters_ht[mt.row_key].filters)
return mt.key_rows_by(*dataset_type.table_key_type(reference_genome).fields)


Expand Down
8 changes: 7 additions & 1 deletion v03_pipeline/lib/model/dataset_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import hail as hl

from v03_pipeline.lib.annotations import gcnv, mito, shared, snv_indel, sv
from v03_pipeline.lib.model.definitions import ReferenceGenome
from v03_pipeline.lib.model.definitions import ReferenceGenome, SampleType

MITO_MIN_HOM_THRESHOLD = 0.95
ZERO = 0.0
Expand Down Expand Up @@ -155,6 +155,12 @@ def has_gencode_ensembl_to_refseq_id_mapping(
self == DatasetType.SNV_INDEL and reference_genome == ReferenceGenome.GRCh38
)

def expect_filters(
self,
sample_type: SampleType,
) -> bool:
return self == DatasetType.SNV_INDEL and sample_type == SampleType.WES

@property
def has_gencode_gene_symbol_to_gene_id_mapping(self) -> bool:
return self == DatasetType.SV
Expand Down
25 changes: 19 additions & 6 deletions v03_pipeline/lib/model/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,12 @@
from dataclasses import dataclass

# NB: using os.environ.get inside the dataclass defaults gives a lint error.
ACCESS_PRIVATE_REFERENCE_DATASETS = (
os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
)
REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108
HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/hail-search-data')
LIFTOVER_REF_PATH = os.environ.get(
'LIFTOVER_REF_PATH',
'gs://hail-common/references/grch38_to_grch37.over.chain.gz',
)
LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr-loading-temp')
PRIVATE_REFERENCE_DATASETS = os.environ.get(
'PRIVATE_REFERENCE_DATASETS',
Expand All @@ -19,21 +19,34 @@
)
VEP_CONFIG_PATH = os.environ.get('VEP_CONFIG_PATH', None)
VEP_CONFIG_URI = os.environ.get('VEP_CONFIG_URI', None)
SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'

# Allele registry secrets :/
ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
PROJECT_ID = os.environ.get('PROJECT_ID', None)

# Feature Flags
ACCESS_PRIVATE_REFERENCE_DATASETS = (
os.environ.get('ACCESS_PRIVATE_REFERENCE_DATASETS') == '1'
)
CHECK_SEX_AND_RELATEDNESS = os.environ.get('CHECK_SEX_AND_RELATEDNESS') == '1'
EXPECT_WES_FILTERS = os.environ.get('EXPECT_WES_FILTERS') == '1'
REFERENCE_DATA_AUTO_UPDATE = os.environ.get('REFERENCE_DATA_AUTO_UPDATE') == '1'
SHOULD_REGISTER_ALLELES = os.environ.get('SHOULD_REGISTER_ALLELES') == '1'


@dataclass
class Env:
ACCESS_PRIVATE_REFERENCE_DATASETS: bool = ACCESS_PRIVATE_REFERENCE_DATASETS
ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME
REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
HAIL_TMPDIR: str = HAIL_TMPDIR
HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA
LIFTOVER_REF_PATH: str = LIFTOVER_REF_PATH
LOADING_DATASETS: str = LOADING_DATASETS
PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS
PROJECT_ID: str | None = PROJECT_ID
REFERENCE_DATA_AUTO_UPDATE: bool = REFERENCE_DATA_AUTO_UPDATE
REFERENCE_DATASETS: str = REFERENCE_DATASETS
SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES
VEP_CONFIG_PATH: str | None = VEP_CONFIG_PATH
Expand Down
36 changes: 36 additions & 0 deletions v03_pipeline/lib/paths.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import hashlib
import os
import re

from v03_pipeline.lib.model import (
AccessControl,
Expand All @@ -9,6 +10,7 @@
PipelineVersion,
ReferenceDatasetCollection,
ReferenceGenome,
SampleType,
)


Expand Down Expand Up @@ -73,6 +75,22 @@ def family_table_path(
)


def imputed_sex_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
callset_path: str,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
Env.LOADING_DATASETS,
reference_genome,
dataset_type,
),
'imputed_sex',
f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv',
)


def imported_callset_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
Expand Down Expand Up @@ -198,6 +216,24 @@ def sex_check_table_path(
)


def valid_filters_path(
dataset_type: DatasetType,
sample_type: SampleType,
callset_path: str,
) -> str | None:
if (
not Env.EXPECT_WES_FILTERS
or not dataset_type.expect_filters(sample_type)
or 'part_one_outputs' not in callset_path
):
return None
return re.sub(
'part_one_outputs/.*$',
'part_two_outputs/*.filtered.*.vcf.gz',
callset_path,
)


def valid_reference_dataset_collection_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
Expand Down
33 changes: 33 additions & 0 deletions v03_pipeline/lib/paths_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,21 @@
DatasetType,
ReferenceDatasetCollection,
ReferenceGenome,
SampleType,
)
from v03_pipeline.lib.paths import (
cached_reference_dataset_query_path,
family_table_path,
imported_callset_path,
imputed_sex_path,
lookup_table_path,
metadata_for_run_path,
new_variants_table_path,
project_table_path,
relatedness_check_table_path,
remapped_and_subsetted_callset_path,
sex_check_table_path,
valid_filters_path,
valid_reference_dataset_collection_path,
variant_annotations_table_path,
)
Expand Down Expand Up @@ -54,6 +57,26 @@ def test_family_table_path(self) -> None:
'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht',
)

def test_valid_filters_path(self) -> None:
self.assertEqual(
valid_filters_path(
DatasetType.MITO,
SampleType.WES,
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
),
None,
)
with patch('v03_pipeline.lib.paths.Env') as mock_env:
mock_env.EXPECT_WES_FILTERS = True
self.assertEqual(
valid_filters_path(
DatasetType.SNV_INDEL,
SampleType.WES,
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_one_outputs/chr*/*.vcf.gz',
),
'gs://bucket/RDG_Broad_WES_Internal_Oct2023/part_two_outputs/*.filtered.*.vcf.gz',
)

def test_project_table_path(self) -> None:
self.assertEqual(
project_table_path(
Expand Down Expand Up @@ -162,6 +185,16 @@ def test_imported_callset_path(self) -> None:
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
)

def test_imputed_sex_path(self) -> None:
self.assertEqual(
imputed_sex_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
'gs://abc.efg/callset.vcf.gz',
),
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv',
)

def test_new_variants_table_path(self) -> None:
self.assertEqual(
new_variants_table_path(
Expand Down
36 changes: 36 additions & 0 deletions v03_pipeline/lib/tasks/base/base_loading_run_params.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import luigi

from v03_pipeline.lib.model import SampleType


class BaseLoadingRunParams(luigi.Task):
# NB:
# These params are "inherited" with the special
# luigi.util.inherits function, copying params
# but nothing else.
sample_type = luigi.EnumParameter(enum=SampleType)
callset_path = luigi.Parameter()
ignore_missing_samples_when_remapping = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
force = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
skip_check_sex_and_relatedness = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
skip_expect_filters = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
skip_validation = luigi.BoolParameter(
default=False,
parsing=luigi.BoolParameter.EXPLICIT_PARSING,
)
is_new_gcnv_joint_call = luigi.BoolParameter(
default=False,
description='Is this a fully joint-called callset.',
)
Loading
Loading