Skip to content

Dev #872

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits on
Aug 14, 2024
Merged

Dev #872

1 change: 1 addition & 0 deletions v03_pipeline/lib/model/definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ class Sex(Enum):
class PipelineVersion(Enum):
    """Known pipeline versions, each mapped to its string identifier."""

    V02 = 'v02'
    V03 = 'v03'
    # Unlike the earlier members, this value contains a dot.
    V3_1 = 'v3.1'


class ReferenceGenome(Enum):
Expand Down
30 changes: 17 additions & 13 deletions v03_pipeline/lib/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@
)


def _v03_pipeline_prefix(
def _pipeline_prefix(
root: str,
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
) -> str:
return os.path.join(
root,
PipelineVersion.V03.value,
PipelineVersion.V3_1.value,
reference_genome.value,
dataset_type.value,
)
Expand Down Expand Up @@ -62,15 +62,17 @@ def cached_reference_dataset_query_path(
def family_table_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
sample_type: SampleType,
family_guid: str,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
reference_genome,
dataset_type,
),
'families',
sample_type.value,
f'{family_guid}.ht',
)

Expand All @@ -81,7 +83,7 @@ def imputed_sex_path(
callset_path: str,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.LOADING_DATASETS,
reference_genome,
dataset_type,
Expand All @@ -97,7 +99,7 @@ def imported_callset_path(
callset_path: str,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.LOADING_DATASETS,
reference_genome,
dataset_type,
Expand Down Expand Up @@ -125,15 +127,17 @@ def metadata_for_run_path(
def project_table_path(
reference_genome: ReferenceGenome,
dataset_type: DatasetType,
sample_type: SampleType,
project_guid: str,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
reference_genome,
dataset_type,
),
'projects',
sample_type.value,
f'{project_guid}.ht',
)

Expand All @@ -144,7 +148,7 @@ def relatedness_check_table_path(
callset_path: str,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.LOADING_DATASETS,
reference_genome,
dataset_type,
Expand All @@ -161,7 +165,7 @@ def remapped_and_subsetted_callset_path(
project_guid: str,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.LOADING_DATASETS,
reference_genome,
dataset_type,
Expand All @@ -177,7 +181,7 @@ def lookup_table_path(
dataset_type: DatasetType,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
reference_genome,
dataset_type,
Expand All @@ -191,7 +195,7 @@ def runs_path(
dataset_type: DatasetType,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
reference_genome,
dataset_type,
Expand All @@ -206,7 +210,7 @@ def sex_check_table_path(
callset_path: str,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.LOADING_DATASETS,
reference_genome,
dataset_type,
Expand Down Expand Up @@ -260,7 +264,7 @@ def variant_annotations_table_path(
dataset_type: DatasetType,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
reference_genome,
dataset_type,
Expand All @@ -274,7 +278,7 @@ def variant_annotations_vcf_path(
dataset_type: DatasetType,
) -> str:
return os.path.join(
_v03_pipeline_prefix(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
reference_genome,
dataset_type,
Expand Down
29 changes: 16 additions & 13 deletions v03_pipeline/lib/paths_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,19 +42,21 @@ def test_family_table_path(self) -> None:
family_table_path(
ReferenceGenome.GRCh37,
DatasetType.SNV_INDEL,
SampleType.WES,
'franklin',
),
'/hail-search-data/v03/GRCh37/SNV_INDEL/families/franklin.ht',
'/hail-search-data/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
)
with patch('v03_pipeline.lib.paths.Env') as mock_env:
mock_env.HAIL_SEARCH_DATA = 'gs://seqr-datasets/'
self.assertEqual(
family_table_path(
ReferenceGenome.GRCh37,
DatasetType.SNV_INDEL,
SampleType.WES,
'franklin',
),
'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht',
'gs://seqr-datasets/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
)

def test_valid_filters_path(self) -> None:
Expand Down Expand Up @@ -82,9 +84,10 @@ def test_project_table_path(self) -> None:
project_table_path(
ReferenceGenome.GRCh38,
DatasetType.MITO,
SampleType.WES,
'R0652_pipeline_test',
),
'/hail-search-data/v03/GRCh38/MITO/projects/R0652_pipeline_test.ht',
'/hail-search-data/v3.1/GRCh38/MITO/projects/WES/R0652_pipeline_test.ht',
)

def test_valid_reference_dataset_collection_path(self) -> None:
Expand Down Expand Up @@ -113,7 +116,7 @@ def test_lookup_table_path(self) -> None:
ReferenceGenome.GRCh37,
DatasetType.SV,
),
'/hail-search-data/v03/GRCh37/SV/lookup.ht',
'/hail-search-data/v3.1/GRCh37/SV/lookup.ht',
)

def test_sex_check_table_path(self) -> None:
Expand All @@ -123,7 +126,7 @@ def test_sex_check_table_path(self) -> None:
DatasetType.SNV_INDEL,
'gs://abc.efg/callset.vcf.gz',
),
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/sex_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
'/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/sex_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
)

def test_relatedness_check_table_path(self) -> None:
Expand All @@ -133,7 +136,7 @@ def test_relatedness_check_table_path(self) -> None:
DatasetType.SNV_INDEL,
'gs://abc.efg/callset.vcf.gz',
),
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
'/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht',
)

def test_metadata_for_run_path(self) -> None:
Expand All @@ -143,7 +146,7 @@ def test_metadata_for_run_path(self) -> None:
DatasetType.SNV_INDEL,
'manual__2023-06-26T18:30:09.349671+00:00',
),
'/hail-search-data/v03/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/metadata.json',
'/hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/metadata.json',
)

def test_variant_annotations_table_path(self) -> None:
Expand All @@ -152,7 +155,7 @@ def test_variant_annotations_table_path(self) -> None:
ReferenceGenome.GRCh38,
DatasetType.GCNV,
),
'/hail-search-data/v03/GRCh38/GCNV/annotations.ht',
'/hail-search-data/v3.1/GRCh38/GCNV/annotations.ht',
)

def test_remapped_and_subsetted_callset_path(self) -> None:
Expand All @@ -163,7 +166,7 @@ def test_remapped_and_subsetted_callset_path(self) -> None:
'gs://abc.efg/callset.vcf.gz',
'R0111_tgg_bblanken_wes',
),
'/seqr-loading-temp/v03/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
'/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
)
self.assertEqual(
remapped_and_subsetted_callset_path(
Expand All @@ -172,7 +175,7 @@ def test_remapped_and_subsetted_callset_path(self) -> None:
'gs://abc.efg/callset/*.vcf.gz',
'R0111_tgg_bblanken_wes',
),
'/seqr-loading-temp/v03/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/bce53ccdb49a5ed2513044e1d0c6224e3ffcc323f770dc807d9175fd3c70a050.mt',
'/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/bce53ccdb49a5ed2513044e1d0c6224e3ffcc323f770dc807d9175fd3c70a050.mt',
)

def test_imported_callset_path(self) -> None:
Expand All @@ -182,7 +185,7 @@ def test_imported_callset_path(self) -> None:
DatasetType.SNV_INDEL,
'gs://abc.efg/callset.vcf.gz',
),
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
'/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt',
)

def test_imputed_sex_path(self) -> None:
Expand All @@ -192,7 +195,7 @@ def test_imputed_sex_path(self) -> None:
DatasetType.SNV_INDEL,
'gs://abc.efg/callset.vcf.gz',
),
'/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv',
'/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv',
)

def test_new_variants_table_path(self) -> None:
Expand All @@ -202,5 +205,5 @@ def test_new_variants_table_path(self) -> None:
DatasetType.SNV_INDEL,
'manual__2023-06-26T18:30:09.349671+00:00',
),
'/hail-search-data/v03/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/new_variants.ht',
'/hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/new_variants.ht',
)
3 changes: 3 additions & 0 deletions v03_pipeline/lib/tasks/base/base_update_project_table.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,22 @@
import hail as hl
import luigi

from v03_pipeline.lib.model import SampleType
from v03_pipeline.lib.paths import project_table_path
from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask
from v03_pipeline.lib.tasks.files import GCSorLocalTarget


class BaseUpdateProjectTableTask(BaseUpdateTask):
sample_type = luigi.EnumParameter(enum=SampleType)
project_guid = luigi.Parameter()

def output(self) -> luigi.Target:
return GCSorLocalTarget(
project_table_path(
self.reference_genome,
self.dataset_type,
self.sample_type,
self.project_guid,
),
)
Expand Down
3 changes: 3 additions & 0 deletions v03_pipeline/lib/tasks/delete_family_table.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
import luigi

from v03_pipeline.lib.model import SampleType
from v03_pipeline.lib.paths import family_table_path
from v03_pipeline.lib.tasks.base.base_delete_table import BaseDeleteTableTask
from v03_pipeline.lib.tasks.files import GCSorLocalTarget


class DeleteFamilyTableTask(BaseDeleteTableTask):
    """Task built on BaseDeleteTableTask whose output is one family's table.

    The output path is derived from the reference genome, dataset type,
    sample type and family guid.
    """

    # Declaration order is kept as-is: luigi orders parameters by definition.
    sample_type = luigi.EnumParameter(enum=SampleType)
    family_guid = luigi.Parameter()

    def output(self) -> luigi.Target:
        """Return the target wrapping this family's table path."""
        # NOTE(review): reference_genome and dataset_type are presumably
        # declared on a base task class; they are not defined in this file.
        table_path = family_table_path(
            self.reference_genome,
            self.dataset_type,
            self.sample_type,
            self.family_guid,
        )
        return GCSorLocalTarget(table_path)
5 changes: 4 additions & 1 deletion v03_pipeline/lib/tasks/delete_family_table_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import hail as hl
import luigi.worker

from v03_pipeline.lib.model import DatasetType, ReferenceGenome
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
from v03_pipeline.lib.paths import family_table_path
from v03_pipeline.lib.tasks.delete_family_table import DeleteFamilyTableTask
from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
Expand Down Expand Up @@ -41,6 +41,7 @@ def setUp(self) -> None:
family_table_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
SampleType.WES,
'abc_1',
),
)
Expand All @@ -50,6 +51,7 @@ def test_delete_family_table_task(self) -> None:
task = DeleteFamilyTableTask(
reference_genome=ReferenceGenome.GRCh38,
dataset_type=DatasetType.SNV_INDEL,
sample_type=SampleType.WES,
family_guid='abc_1',
)
worker.add(task)
Expand All @@ -60,6 +62,7 @@ def test_delete_family_table_task(self) -> None:
family_table_path(
ReferenceGenome.GRCh38,
DatasetType.SNV_INDEL,
SampleType.WES,
'abc_1',
),
).exists(),
Expand Down
19 changes: 11 additions & 8 deletions v03_pipeline/lib/tasks/delete_family_tables.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import luigi

from v03_pipeline.lib.model import SampleType
from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask
from v03_pipeline.lib.tasks.delete_family_table import DeleteFamilyTableTask

Expand All @@ -18,12 +19,14 @@ def complete(self) -> bool:
)

def run(self):
for family_guid in self.family_guids:
self.dynamic_delete_family_table_tasks.add(
DeleteFamilyTableTask(
reference_genome=self.reference_genome,
dataset_type=self.dataset_type,
family_guid=family_guid,
),
)
for sample_type in SampleType:
for family_guid in self.family_guids:
self.dynamic_delete_family_table_tasks.add(
DeleteFamilyTableTask(
reference_genome=self.reference_genome,
dataset_type=self.dataset_type,
sample_type=sample_type,
family_guid=family_guid,
),
)
yield self.dynamic_delete_family_table_tasks
Loading
Loading