Skip to content

Remove concept of private CRDQs (cached reference dataset queries) #807

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 2 additions & 12 deletions v03_pipeline/lib/model/cached_reference_dataset_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@
import hail as hl

from v03_pipeline.lib.model.dataset_type import DatasetType
from v03_pipeline.lib.model.definitions import AccessControl, ReferenceGenome
from v03_pipeline.lib.model.environment import Env
from v03_pipeline.lib.model.definitions import ReferenceGenome
from v03_pipeline.lib.reference_data.queries import (
clinvar_path_variants,
gnomad_coding_and_noncoding_variants,
Expand All @@ -20,10 +19,6 @@ class CachedReferenceDatasetQuery(Enum):
GNOMAD_QC = 'gnomad_qc'
HIGH_AF_VARIANTS = 'high_af_variants'

@property
def access_control(self) -> AccessControl:
    """Return the access-control level of this query.

    Every query reports ``AccessControl.PUBLIC``; the value does not depend
    on which enum member it is called on.
    """
    return AccessControl.PUBLIC

def dataset(self, dataset_type: DatasetType) -> str | None:
return {
CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS: 'clinvar_mito'
Expand Down Expand Up @@ -56,15 +51,10 @@ def for_reference_genome_dataset_type(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
) -> list['CachedReferenceDatasetQuery']:
crdqs = {
return {
(ReferenceGenome.GRCh38, DatasetType.SNV_INDEL): list(cls),
(ReferenceGenome.GRCh38, DatasetType.MITO): [
CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
],
(ReferenceGenome.GRCh37, DatasetType.SNV_INDEL): list(cls),
}.get((reference_genome, dataset_type), [])
if not Env.ACCESS_PRIVATE_REFERENCE_DATASETS:
return [
crdq for crdq in crdqs if crdq.access_control == AccessControl.PUBLIC
]
return crdqs
37 changes: 16 additions & 21 deletions v03_pipeline/lib/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,22 @@ def _v03_reference_data_prefix(
)


def cached_reference_dataset_query_path(
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
    cached_reference_dataset_query: CachedReferenceDatasetQuery,
) -> str:
    """Build the path of a cached reference dataset query table.

    The path is composed of, in order:
      * the reference-data prefix for ``AccessControl.PUBLIC`` and
        ``reference_genome`` (from ``_v03_reference_data_prefix``),
      * ``dataset_type.value``,
      * the literal directory ``cached_reference_dataset_queries``,
      * ``<cached_reference_dataset_query.value>.ht``.

    Unlike a lookup that can fail on access restrictions, this always
    returns a string: the prefix is unconditionally the public one.
    """
    return os.path.join(
        _v03_reference_data_prefix(
            AccessControl.PUBLIC,
            reference_genome,
        ),
        dataset_type.value,
        'cached_reference_dataset_queries',
        f'{cached_reference_dataset_query.value}.ht',
    )


def family_table_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
Expand Down Expand Up @@ -182,27 +198,6 @@ def sex_check_table_path(
)


def valid_cached_reference_dataset_query_path(
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
    cached_reference_dataset_query: CachedReferenceDatasetQuery,
) -> str | None:
    """Build the path of a cached reference dataset query table, if accessible.

    Returns ``None`` when ``Env.ACCESS_PRIVATE_REFERENCE_DATASETS`` is falsy
    and the query's ``access_control`` is ``AccessControl.PRIVATE``.

    Otherwise the path is composed of the reference-data prefix for the
    query's own access-control level and ``reference_genome``, followed by
    ``dataset_type.value``, the literal directory
    ``cached_reference_dataset_queries``, and
    ``<cached_reference_dataset_query.value>.ht``.
    """
    # Private queries are only resolvable when the environment opts in.
    if (
        not Env.ACCESS_PRIVATE_REFERENCE_DATASETS
        and cached_reference_dataset_query.access_control == AccessControl.PRIVATE
    ):
        return None
    return os.path.join(
        _v03_reference_data_prefix(
            cached_reference_dataset_query.access_control,
            reference_genome,
        ),
        dataset_type.value,
        'cached_reference_dataset_queries',
        f'{cached_reference_dataset_query.value}.ht',
    )


def valid_reference_dataset_collection_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
Expand Down
4 changes: 2 additions & 2 deletions v03_pipeline/lib/paths_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
ReferenceGenome,
)
from v03_pipeline.lib.paths import (
cached_reference_dataset_query_path,
family_table_path,
imported_callset_path,
lookup_table_path,
Expand All @@ -17,7 +18,6 @@
relatedness_check_table_path,
remapped_and_subsetted_callset_path,
sex_check_table_path,
valid_cached_reference_dataset_query_path,
valid_reference_dataset_collection_path,
variant_annotations_table_path,
)
Expand All @@ -26,7 +26,7 @@
class TestPaths(unittest.TestCase):
def test_cached_reference_dataset_query_path(self) -> None:
self.assertEqual(
valid_cached_reference_dataset_query_path(
cached_reference_dataset_query_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
ReferenceDatasetCollection,
)
from v03_pipeline.lib.paths import (
valid_cached_reference_dataset_query_path,
cached_reference_dataset_query_path,
valid_reference_dataset_collection_path,
)
from v03_pipeline.lib.reference_data.compare_globals import (
Expand Down Expand Up @@ -56,7 +56,7 @@ def complete(self) -> bool:

def output(self) -> luigi.Target:
return GCSorLocalTarget(
valid_cached_reference_dataset_query_path(
cached_reference_dataset_query_path(
self.reference_genome,
self.dataset_type,
self.crdq,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
SampleType,
)
from v03_pipeline.lib.paths import (
valid_cached_reference_dataset_query_path,
cached_reference_dataset_query_path,
valid_reference_dataset_collection_path,
)
from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS
Expand Down Expand Up @@ -167,7 +167,7 @@ def test_clinvar(
# clinvar has version '2022-01-01'
shutil.copytree(
CLINVAR_CRDQ_PATH,
valid_cached_reference_dataset_query_path(
cached_reference_dataset_query_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
SampleType,
)
from v03_pipeline.lib.paths import (
valid_cached_reference_dataset_query_path,
cached_reference_dataset_query_path,
valid_reference_dataset_collection_path,
)
from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS
Expand Down Expand Up @@ -341,7 +341,7 @@ def test_multiple_update_vat(
),
)
coding_and_noncoding_variants_ht.write(
valid_cached_reference_dataset_query_path(
cached_reference_dataset_query_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
Expand Down
6 changes: 3 additions & 3 deletions v03_pipeline/lib/tasks/write_imported_callset.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@
from v03_pipeline.lib.model import CachedReferenceDatasetQuery
from v03_pipeline.lib.model.environment import Env
from v03_pipeline.lib.paths import (
cached_reference_dataset_query_path,
imported_callset_path,
sex_check_table_path,
valid_cached_reference_dataset_query_path,
)
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
from v03_pipeline.lib.tasks.files import CallsetTask, GCSorLocalTarget, HailTableTask
Expand Down Expand Up @@ -86,7 +86,7 @@ def requires(self) -> list[luigi.Task]:
)
if Env.REFERENCE_DATA_AUTO_UPDATE
else HailTableTask(
valid_cached_reference_dataset_query_path(
cached_reference_dataset_query_path(
self.reference_genome,
self.dataset_type,
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
Expand Down Expand Up @@ -169,7 +169,7 @@ def create_table(self) -> hl.MatrixTable:
validate_no_duplicate_variants(mt)
validate_expected_contig_frequency(mt, self.reference_genome)
coding_and_noncoding_ht = hl.read_table(
valid_cached_reference_dataset_query_path(
cached_reference_dataset_query_path(
self.reference_genome,
self.dataset_type,
CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS,
Expand Down
4 changes: 2 additions & 2 deletions v03_pipeline/lib/tasks/write_relatedness_check_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
from v03_pipeline.lib.methods.relatedness import call_relatedness
from v03_pipeline.lib.model import CachedReferenceDatasetQuery, Env
from v03_pipeline.lib.paths import (
cached_reference_dataset_query_path,
relatedness_check_table_path,
valid_cached_reference_dataset_query_path,
)
from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask
from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask
Expand Down Expand Up @@ -48,7 +48,7 @@ def requires(self) -> luigi.Task:
)
if Env.REFERENCE_DATA_AUTO_UPDATE
else HailTableTask(
valid_cached_reference_dataset_query_path(
cached_reference_dataset_query_path(
self.reference_genome,
self.dataset_type,
CachedReferenceDatasetQuery.GNOMAD_QC,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,9 @@
SampleType,
)
from v03_pipeline.lib.paths import (
cached_reference_dataset_query_path,
imported_callset_path,
relatedness_check_table_path,
valid_cached_reference_dataset_query_path,
)
from v03_pipeline.lib.tasks.write_relatedness_check_table import (
WriteRelatednessCheckTableTask,
Expand Down Expand Up @@ -45,7 +45,7 @@
class WriteRelatednessCheckTableTaskTest(MockedDatarootTestCase):
def setUp(self) -> None:
super().setUp()
self.gnomad_qc_path = valid_cached_reference_dataset_query_path(
self.gnomad_qc_path = cached_reference_dataset_query_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
CachedReferenceDatasetQuery.GNOMAD_QC,
Expand Down
Loading