diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index 3c20ee35c..c8e0894c1 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -36,6 +36,7 @@ jobs:
         run: ruff . --output-format github
       - name: Unit Tests
         run: |
+          export HAIL_TMP_DIR=/tmp
           export GRCH37_TO_GRCH38_LIFTOVER_REF_PATH=v03_pipeline/var/test/liftover/grch37_to_grch38.over.chain.gz
           export GRCH38_TO_GRCH37_LIFTOVER_REF_PATH=v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz
           export ACCESS_PRIVATE_REFERENCE_DATASETS=1
diff --git a/v03_pipeline/lib/misc/allele_registry.py b/v03_pipeline/lib/misc/allele_registry.py
index 2c12538d5..e1ad9e1a2 100644
--- a/v03_pipeline/lib/misc/allele_registry.py
+++ b/v03_pipeline/lib/misc/allele_registry.py
@@ -77,8 +77,8 @@ def register_alleles(
     base_url: str,
 ) -> hl.Table:
     uuid4 = uuid.uuid4()
-    raw_vcf_file_name = f'{Env.HAIL_TMPDIR}/r_{uuid4}.vcf'
-    formatted_vcf_file_name = f'{Env.HAIL_TMPDIR}/f_{uuid4}.vcf'
+    raw_vcf_file_name = f'{Env.HAIL_TMP_DIR}/r_{uuid4}.vcf'
+    formatted_vcf_file_name = f'{Env.HAIL_TMP_DIR}/f_{uuid4}.vcf'
 
     # Export the variants to a VCF
     hl.export_vcf(ht, raw_vcf_file_name)
diff --git a/v03_pipeline/lib/misc/allele_registry_test.py b/v03_pipeline/lib/misc/allele_registry_test.py
index 83a9ceda6..45eea3061 100644
--- a/v03_pipeline/lib/misc/allele_registry_test.py
+++ b/v03_pipeline/lib/misc/allele_registry_test.py
@@ -39,7 +39,7 @@ def test_register_alleles_38(
         mock_put_request: Mock,
     ):
         mock_get_credentials.return_value = ('', '')
-        mock_env.HAIL_TMPDIR = self.temp_dir.name
+        mock_env.HAIL_TMP_DIR = self.temp_dir.name
 
         new_variants_ht = hl.Table.parallelize(
             [
diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py
index 599e791b8..ef2b26ecd 100644
--- a/v03_pipeline/lib/misc/io.py
+++ b/v03_pipeline/lib/misc/io.py
@@ -217,7 +217,7 @@ def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable,
     suffix = 'mt' if isinstance(t, hl.MatrixTable) else 'ht'
     read_fn = hl.read_matrix_table if isinstance(t, hl.MatrixTable) else hl.read_table
     checkpoint_path = os.path.join(
-        Env.HAIL_TMPDIR,
+        Env.HAIL_TMP_DIR,
         f'{uuid.uuid4()}.{suffix}',
     )
     t.write(checkpoint_path)
diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py
index 6fac170b3..5540533fd 100644
--- a/v03_pipeline/lib/misc/validation.py
+++ b/v03_pipeline/lib/misc/validation.py
@@ -155,7 +155,7 @@ def validate_sample_type(
         msg = f'Sample type validation error: dataset contains noncoding variants but is missing common coding variants for {reference_genome.value}. Please verify that the dataset contains coding variants.'
         raise SeqrValidationError(msg)
     if has_coding and not has_noncoding and sample_type != SampleType.WES:
-        msg = 'Sample type validation error: dataset sample-type is specified as WGS but appears to be WES because it contains many common coding variants'
+        msg = 'Sample type validation error: dataset sample-type is specified as WGS but appears to be WES because it contains many common coding variants but is missing common non-coding variants'
         raise SeqrValidationError(msg)
     if has_noncoding and has_coding and sample_type != SampleType.WGS:
         msg = 'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants'
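Note for reviewers: the reworded messages above belong to a three-branch decision table over two flags. A minimal, self-contained sketch of that logic (the names mirror the hunk, but this is an illustration rather than the pipeline source):

from enum import Enum


class SampleType(Enum):
    WES = 'WES'
    WGS = 'WGS'


class SeqrValidationError(Exception):
    pass


def check_sample_type(has_coding: bool, has_noncoding: bool, sample_type: SampleType) -> None:
    # Noncoding hits without coding hits suggest a mis-annotated callset.
    if has_noncoding and not has_coding:
        raise SeqrValidationError('missing common coding variants')
    # A coding-only profile looks like WES, so a WGS label is rejected.
    if has_coding and not has_noncoding and sample_type != SampleType.WES:
        raise SeqrValidationError('labeled WGS but looks like WES')
    # Both variant classes present looks like WGS, so a WES label is rejected.
    if has_noncoding and has_coding and sample_type != SampleType.WGS:
        raise SeqrValidationError('labeled WES but looks like WGS')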
diff --git a/v03_pipeline/lib/model/environment.py b/v03_pipeline/lib/model/environment.py
index 277f6a852..91e69a6a2 100644
--- a/v03_pipeline/lib/model/environment.py
+++ b/v03_pipeline/lib/model/environment.py
@@ -2,8 +2,6 @@
 from dataclasses import dataclass
 
 # NB: using os.environ.get inside the dataclass defaults gives a lint error.
-HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp')  # noqa: S108
-HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/seqr/hail-search-data')
 GRCH37_TO_GRCH38_LIFTOVER_REF_PATH = os.environ.get(
     'GRCH37_TO_GRCH38_LIFTOVER_REF_PATH',
     'gs://hail-common/references/grch37_to_grch38.over.chain.gz',
@@ -12,15 +10,18 @@
     'GRCH38_TO_GRCH37_LIFTOVER_REF_PATH',
     'gs://hail-common/references/grch38_to_grch37.over.chain.gz',
 )
-LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr/seqr-loading-temp')
-PRIVATE_REFERENCE_DATASETS = os.environ.get(
-    'PRIVATE_REFERENCE_DATASETS',
+HAIL_TMP_DIR = os.environ.get('HAIL_TMP_DIR', '/seqr/tmp')
+HAIL_SEARCH_DATA_DIR = os.environ.get('HAIL_SEARCH_DATA_DIR', '/seqr/hail-search-data')
+LOADING_DATASETS_DIR = os.environ.get('LOADING_DATASETS_DIR', '/seqr/seqr-loading-temp')
+PRIVATE_REFERENCE_DATASETS_DIR = os.environ.get(
+    'PRIVATE_REFERENCE_DATASETS_DIR',
     '/seqr/seqr-reference-data-private',
 )
-REFERENCE_DATASETS = os.environ.get(
-    'REFERENCE_DATASETS',
+REFERENCE_DATASETS_DIR = os.environ.get(
+    'REFERENCE_DATASETS_DIR',
     '/seqr/seqr-reference-data',
 )
+
 # Allele registry secrets :/
 ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
 PROJECT_ID = os.environ.get('PROJECT_ID', None)
@@ -40,12 +41,12 @@ class Env:
     ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME
     CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
     EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
-    HAIL_TMPDIR: str = HAIL_TMPDIR
-    HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA
+    HAIL_TMP_DIR: str = HAIL_TMP_DIR
+    HAIL_SEARCH_DATA_DIR: str = HAIL_SEARCH_DATA_DIR
     GRCH37_TO_GRCH38_LIFTOVER_REF_PATH: str = GRCH37_TO_GRCH38_LIFTOVER_REF_PATH
     GRCH38_TO_GRCH37_LIFTOVER_REF_PATH: str = GRCH38_TO_GRCH37_LIFTOVER_REF_PATH
-    LOADING_DATASETS: str = LOADING_DATASETS
-    PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS
+    LOADING_DATASETS_DIR: str = LOADING_DATASETS_DIR
+    PRIVATE_REFERENCE_DATASETS_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR
     PROJECT_ID: str | None = PROJECT_ID
-    REFERENCE_DATASETS: str = REFERENCE_DATASETS
+    REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR
     SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES
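Because the module binds os.environ.get(...) results into the dataclass defaults at import time, the renamed variables must be exported before v03_pipeline.lib.model.environment is first imported — which is why the workflow change above exports HAIL_TMP_DIR ahead of the test run. A hedged sketch of the expected usage (the override values here are illustrative):

import os

# Overrides must be in place before the first import of the module;
# afterwards, the Env defaults are already frozen.
os.environ['HAIL_TMP_DIR'] = '/tmp/hail-scratch'  # illustrative path
os.environ['REFERENCE_DATASETS_DIR'] = 'gs://my-bucket/reference-data'  # illustrative

from v03_pipeline.lib.model.environment import Env

assert Env.HAIL_TMP_DIR == '/tmp/hail-scratch'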
diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py
index 5512e24dd..44dbb5310 100644
--- a/v03_pipeline/lib/paths.py
+++ b/v03_pipeline/lib/paths.py
@@ -32,9 +32,9 @@ def _v03_reference_data_prefix(
     reference_genome: ReferenceGenome,
 ) -> str:
     root = (
-        Env.PRIVATE_REFERENCE_DATASETS
+        Env.PRIVATE_REFERENCE_DATASETS_DIR
         if access_control == AccessControl.PRIVATE
-        else Env.REFERENCE_DATASETS
+        else Env.REFERENCE_DATASETS_DIR
     )
     return os.path.join(
         root,
@@ -67,7 +67,7 @@ def family_table_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.HAIL_SEARCH_DATA,
+            Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -84,7 +84,7 @@ def imputed_sex_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.LOADING_DATASETS,
+            Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -100,7 +100,7 @@ def imported_callset_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.LOADING_DATASETS,
+            Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -132,7 +132,7 @@ def project_table_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.HAIL_SEARCH_DATA,
+            Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -149,7 +149,7 @@ def relatedness_check_table_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.LOADING_DATASETS,
+            Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -166,7 +166,7 @@ def remapped_and_subsetted_callset_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.LOADING_DATASETS,
+            Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -182,7 +182,7 @@ def lookup_table_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.HAIL_SEARCH_DATA,
+            Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -196,7 +196,7 @@ def runs_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.HAIL_SEARCH_DATA,
+            Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -211,7 +211,7 @@ def sex_check_table_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.LOADING_DATASETS,
+            Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -265,7 +265,7 @@ def variant_annotations_table_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.HAIL_SEARCH_DATA,
+            Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -279,7 +279,7 @@ def variant_annotations_vcf_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.HAIL_SEARCH_DATA,
+            Env.HAIL_SEARCH_DATA_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -304,7 +304,7 @@ def new_variants_table_path(
 
 def clinvar_dataset_path(reference_genome: ReferenceGenome, etag: str) -> str:
     return os.path.join(
-        Env.HAIL_TMPDIR,
+        Env.HAIL_TMP_DIR,
         f'clinvar-{reference_genome.value}-{etag}.ht',
     )
 
@@ -317,7 +317,7 @@ def project_remap_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.LOADING_DATASETS,
+            Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -335,7 +335,7 @@ def project_pedigree_path(
 ) -> str:
     return os.path.join(
         _pipeline_prefix(
-            Env.LOADING_DATASETS,
+            Env.LOADING_DATASETS_DIR,
             reference_genome,
             dataset_type,
         ),
@@ -347,7 +347,7 @@ def project_pedigree_path(
 
 def loading_pipeline_queue_path() -> str:
     return os.path.join(
-        Env.LOADING_DATASETS,
+        Env.LOADING_DATASETS_DIR,
         'loading_pipeline_queue',
         'request.json',
     )
diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py
index 4b595e7e1..59081e647 100644
--- a/v03_pipeline/lib/paths_test.py
+++ b/v03_pipeline/lib/paths_test.py
@@ -50,7 +50,7 @@ def test_family_table_path(self) -> None:
             '/seqr/hail-search-data/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
         )
         with patch('v03_pipeline.lib.paths.Env') as mock_env:
-            mock_env.HAIL_SEARCH_DATA = 'gs://seqr-datasets/'
+            mock_env.HAIL_SEARCH_DATA_DIR = 'gs://seqr-datasets/'
             self.assertEqual(
                 family_table_path(
                     ReferenceGenome.GRCh37,
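The test rename follows the established pattern of patching the Env object that paths.py reads at call time, so the storage root can be swapped without touching os.environ. Roughly, assuming the package is importable (the loading root below is illustrative; the expected output follows from the loading_pipeline_queue_path hunk above):

from unittest.mock import patch

from v03_pipeline.lib import paths

with patch('v03_pipeline.lib.paths.Env') as mock_env:
    # Any helper in paths.py called inside this block resolves under the mock.
    mock_env.LOADING_DATASETS_DIR = 'gs://seqr-loading-temp/'
    print(paths.loading_pipeline_queue_path())
    # -> gs://seqr-loading-temp/loading_pipeline_queue/request.json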
diff --git a/v03_pipeline/lib/reference_data/clinvar.py b/v03_pipeline/lib/reference_data/clinvar.py
index b9b59f89d..5e1980e32 100644
--- a/v03_pipeline/lib/reference_data/clinvar.py
+++ b/v03_pipeline/lib/reference_data/clinvar.py
@@ -139,7 +139,7 @@ def download_and_import_latest_clinvar_vcf(
     with tempfile.NamedTemporaryFile(suffix='.vcf.gz', delete=False) as tmp_file:
         urllib.request.urlretrieve(clinvar_url, tmp_file.name)  # noqa: S310
         gcs_tmp_file_name = os.path.join(
-            Env.HAIL_TMPDIR,
+            Env.HAIL_TMP_DIR,
             os.path.basename(tmp_file.name),
         )
         safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name)
@@ -203,7 +203,7 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
                 shutil.copyfileobj(f_in, f_out)
 
             gcs_tmp_file_name = os.path.join(
-                Env.HAIL_TMPDIR,
+                Env.HAIL_TMP_DIR,
                 os.path.basename(unzipped_tmp_file.name),
             )
             safely_move_to_gcs(unzipped_tmp_file.name, gcs_tmp_file_name)
diff --git a/v03_pipeline/lib/tasks/base/base_hail_table.py b/v03_pipeline/lib/tasks/base/base_hail_table.py
index 7f8a84a95..b5f7a0262 100644
--- a/v03_pipeline/lib/tasks/base/base_hail_table.py
+++ b/v03_pipeline/lib/tasks/base/base_hail_table.py
@@ -23,7 +23,7 @@ def complete(self) -> bool:
 
     def init_hail(self):
         # Need to use the GCP bucket as temp storage for very large callset joins
-        hl.init(tmp_dir=Env.HAIL_TMPDIR, idempotent=True)
+        hl.init(tmp_dir=Env.HAIL_TMP_DIR, idempotent=True)
 
         # Interval ref data join causes shuffle death, this prevents it
         hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1')  # noqa: SLF001
diff --git a/v03_pipeline/lib/tasks/base/base_update_lookup_table.py b/v03_pipeline/lib/tasks/base/base_update_lookup_table.py
index 0c9aa719e..1dcb5f58d 100644
--- a/v03_pipeline/lib/tasks/base/base_update_lookup_table.py
+++ b/v03_pipeline/lib/tasks/base/base_update_lookup_table.py
@@ -46,5 +46,6 @@ def initialize_table(self) -> hl.Table:
                     remap_pedigree_hash=hl.tint32,
                 ),
             ),
+            migrations=hl.empty_array(hl.tstr),
         ),
     )
diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py
index 5828bd141..26f525ecc 100644
--- a/v03_pipeline/lib/tasks/update_lookup_table.py
+++ b/v03_pipeline/lib/tasks/update_lookup_table.py
@@ -115,5 +115,6 @@ def update_table(self, ht: hl.Table) -> hl.Table:
                 ),
             ),
         ),
+        migrations=ht.migrations,
     )
     return ht
diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py
index 6bdca1ce4..ebf6b0eef 100644
--- a/v03_pipeline/lib/tasks/update_lookup_table_test.py
+++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py
@@ -51,6 +51,7 @@ def test_skip_update_lookup_table_task(self) -> None:
                         ),
                     ),
                 },
+                migrations=[],
             ),
         ],
     )
@@ -89,6 +90,7 @@ def test_update_lookup_table_task(self) -> None:
                         ),
                     ),
                 },
+                migrations=[],
             ),
         ],
     )
diff --git a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py
index 098b97ffb..75caab822 100644
--- a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py
+++ b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py
@@ -33,6 +33,7 @@ def test_delete_project_empty_table(
                 project_sample_types=[],
                 project_families={},
                 updates=set(),
+                migrations=[],
             ),
         ],
     )
@@ -137,6 +138,7 @@ def test_delete_project(
                     remap_pedigree_hash=123,
                 ),
             },
+            migrations=hl.empty_array(hl.tstr),
         ),
     )
     worker = luigi.worker.Worker()
@@ -172,6 +174,7 @@ def test_delete_project(
                     remap_pedigree_hash=123,
                 ),
             },
+            migrations=[],
         ),
     ],
 )
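The migrations global introduced across these hunks is an array<str> that starts empty when a lookup table is initialized and is carried forward unchanged by update_table. In isolation the mechanics look roughly like this (the appended migration name is made up for illustration):

import hail as hl

ht = hl.utils.range_table(1)
# Fresh tables start with an empty array<str>, mirroring initialize_table.
ht = ht.annotate_globals(migrations=hl.empty_array(hl.tstr))
# A migration task would then append its own name after running.
migrated_ht = ht.annotate_globals(migrations=ht.migrations.append('0001_example'))
print(hl.eval(migrated_ht.migrations))  # ['0001_example']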
diff --git a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py
index b0cb2db80..cf0cfb61a 100644
--- a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py
+++ b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py
@@ -32,6 +32,7 @@ def test_delete_project_empty_table(
                 project_sample_types=[],
                 project_families={},
                 updates=set(),
+                migrations=[],
             ),
         ],
     )
@@ -136,6 +137,7 @@ def test_delete_project(
                     remap_pedigree_hash=123,
                 ),
             },
+            migrations=hl.empty_array(hl.tstr),
         ),
     )
     worker = luigi.worker.Worker()
@@ -162,6 +164,7 @@ def test_delete_project(
                     remap_pedigree_hash=123,
                 ),
             },
+            migrations=[],
         ),
     ],
 )
diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py
index e493fbc30..fb5c4ccfd 100644
--- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py
+++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py
@@ -107,6 +107,7 @@ def setUp(self) -> None:
                     remap_pedigree_hash=123,
                 ),
             },
+            migrations=hl.empty_array(hl.tstr),
         ),
     )
     ht.write(
diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
index e1730e25d..9a678786b 100644
--- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
+++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
@@ -1221,7 +1221,7 @@ def test_sv_update_vat(
         self.assertTrue(update_variant_annotations_task.complete())
         self.assertFalse(
             GCSorLocalFolderTarget(
-                f'{self.mock_env.REFERENCE_DATASETS}/v03/GRCh38/SV/lookup.ht',
+                f'{self.mock_env.REFERENCE_DATASETS_DIR}/v03/GRCh38/SV/lookup.ht',
             ).exists(),
         )
         ht = hl.read_table(update_variant_annotations_task.output().path)
@@ -1800,7 +1800,7 @@ def test_gcnv_update_vat(
         self.assertTrue(update_variant_annotations_task.complete())
         self.assertFalse(
             GCSorLocalFolderTarget(
-                f'{self.mock_env.REFERENCE_DATASETS}/v03/GRCh38/GCNV/lookup.ht',
+                f'{self.mock_env.REFERENCE_DATASETS_DIR}/v03/GRCh38/GCNV/lookup.ht',
             ).exists(),
         )
         ht = hl.read_table(update_variant_annotations_task.output().path)
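As a closing note on the io.py hunk earlier in the diff: the checkpoint helper writes the table to a unique path under the renamed Env.HAIL_TMP_DIR and immediately reads it back, forcing materialization. A minimal standalone sketch, with a literal temp root standing in for Env.HAIL_TMP_DIR:

import os
import uuid

import hail as hl

HAIL_TMP_DIR = '/tmp/hail'  # stands in for Env.HAIL_TMP_DIR


def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable, str]:
    # Writing then re-reading pins the computation at this point on disk.
    suffix = 'mt' if isinstance(t, hl.MatrixTable) else 'ht'
    read_fn = hl.read_matrix_table if isinstance(t, hl.MatrixTable) else hl.read_table
    path = os.path.join(HAIL_TMP_DIR, f'{uuid.uuid4()}.{suffix}')
    t.write(path)
    return read_fn(path), path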