Skip to content

Dev #895

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Sep 12, 2024
Merged

Dev #895

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ jobs:
run: ruff . --output-format github
- name: Unit Tests
run: |
export HAIL_TMP_DIR=/tmp
export GRCH37_TO_GRCH38_LIFTOVER_REF_PATH=v03_pipeline/var/test/liftover/grch37_to_grch38.over.chain.gz
export GRCH38_TO_GRCH37_LIFTOVER_REF_PATH=v03_pipeline/var/test/liftover/grch38_to_grch37.over.chain.gz
export ACCESS_PRIVATE_REFERENCE_DATASETS=1
Expand Down
4 changes: 2 additions & 2 deletions v03_pipeline/lib/misc/allele_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,8 @@ def register_alleles(
base_url: str,
) -> hl.Table:
uuid4 = uuid.uuid4()
raw_vcf_file_name = f'{Env.HAIL_TMPDIR}/r_{uuid4}.vcf'
formatted_vcf_file_name = f'{Env.HAIL_TMPDIR}/f_{uuid4}.vcf'
raw_vcf_file_name = f'{Env.HAIL_TMP_DIR}/r_{uuid4}.vcf'
formatted_vcf_file_name = f'{Env.HAIL_TMP_DIR}/f_{uuid4}.vcf'

# Export the variants to a VCF
hl.export_vcf(ht, raw_vcf_file_name)
Expand Down
2 changes: 1 addition & 1 deletion v03_pipeline/lib/misc/allele_registry_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_register_alleles_38(
mock_put_request: Mock,
):
mock_get_credentials.return_value = ('', '')
mock_env.HAIL_TMPDIR = self.temp_dir.name
mock_env.HAIL_TMP_DIR = self.temp_dir.name

new_variants_ht = hl.Table.parallelize(
[
Expand Down
2 changes: 1 addition & 1 deletion v03_pipeline/lib/misc/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable,
suffix = 'mt' if isinstance(t, hl.MatrixTable) else 'ht'
read_fn = hl.read_matrix_table if isinstance(t, hl.MatrixTable) else hl.read_table
checkpoint_path = os.path.join(
Env.HAIL_TMPDIR,
Env.HAIL_TMP_DIR,
f'{uuid.uuid4()}.{suffix}',
)
t.write(checkpoint_path)
Expand Down
2 changes: 1 addition & 1 deletion v03_pipeline/lib/misc/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,7 +155,7 @@ def validate_sample_type(
msg = f'Sample type validation error: dataset contains noncoding variants but is missing common coding variants for {reference_genome.value}. Please verify that the dataset contains coding variants.'
raise SeqrValidationError(msg)
if has_coding and not has_noncoding and sample_type != SampleType.WES:
msg = 'Sample type validation error: dataset sample-type is specified as WGS but appears to be WES because it contains many common coding variants'
msg = 'Sample type validation error: dataset sample-type is specified as WGS but appears to be WES because it contains many common coding variants but is missing common non-coding variants'
raise SeqrValidationError(msg)
if has_noncoding and has_coding and sample_type != SampleType.WGS:
msg = 'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants'
Expand Down
25 changes: 13 additions & 12 deletions v03_pipeline/lib/model/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
from dataclasses import dataclass

# NB: using os.environ.get inside the dataclass defaults gives a lint error.
HAIL_TMPDIR = os.environ.get('HAIL_TMPDIR', '/tmp') # noqa: S108
HAIL_SEARCH_DATA = os.environ.get('HAIL_SEARCH_DATA', '/seqr/hail-search-data')
GRCH37_TO_GRCH38_LIFTOVER_REF_PATH = os.environ.get(
'GRCH37_TO_GRCH38_LIFTOVER_REF_PATH',
'gs://hail-common/references/grch37_to_grch38.over.chain.gz',
Expand All @@ -12,15 +10,18 @@
'GRCH38_TO_GRCH37_LIFTOVER_REF_PATH',
'gs://hail-common/references/grch38_to_grch37.over.chain.gz',
)
LOADING_DATASETS = os.environ.get('LOADING_DATASETS', '/seqr/seqr-loading-temp')
PRIVATE_REFERENCE_DATASETS = os.environ.get(
'PRIVATE_REFERENCE_DATASETS',
HAIL_TMP_DIR = os.environ.get('HAIL_TMP_DIR', '/seqr/tmp')
HAIL_SEARCH_DATA_DIR = os.environ.get('HAIL_SEARCH_DATA_DIR', '/seqr/hail-search-data')
LOADING_DATASETS_DIR = os.environ.get('LOADING_DATASETS_DIR', '/seqr/seqr-loading-temp')
PRIVATE_REFERENCE_DATASETS_DIR = os.environ.get(
    'PRIVATE_REFERENCE_DATASETS_DIR',
'/seqr/seqr-reference-data-private',
)
REFERENCE_DATASETS = os.environ.get(
'REFERENCE_DATASETS',
REFERENCE_DATASETS_DIR = os.environ.get(
'REFERENCE_DATASETS_DIR',
'/seqr/seqr-reference-data',
)

# Allele registry secrets :/
ALLELE_REGISTRY_SECRET_NAME = os.environ.get('ALLELE_REGISTRY_SECRET_NAME', None)
PROJECT_ID = os.environ.get('PROJECT_ID', None)
Expand All @@ -40,12 +41,12 @@ class Env:
ALLELE_REGISTRY_SECRET_NAME: str | None = ALLELE_REGISTRY_SECRET_NAME
CHECK_SEX_AND_RELATEDNESS: bool = CHECK_SEX_AND_RELATEDNESS
EXPECT_WES_FILTERS: bool = EXPECT_WES_FILTERS
HAIL_TMPDIR: str = HAIL_TMPDIR
HAIL_SEARCH_DATA: str = HAIL_SEARCH_DATA
HAIL_TMP_DIR: str = HAIL_TMP_DIR
HAIL_SEARCH_DATA_DIR: str = HAIL_SEARCH_DATA_DIR
GRCH37_TO_GRCH38_LIFTOVER_REF_PATH: str = GRCH37_TO_GRCH38_LIFTOVER_REF_PATH
GRCH38_TO_GRCH37_LIFTOVER_REF_PATH: str = GRCH38_TO_GRCH37_LIFTOVER_REF_PATH
LOADING_DATASETS: str = LOADING_DATASETS
PRIVATE_REFERENCE_DATASETS: str = PRIVATE_REFERENCE_DATASETS
LOADING_DATASETS_DIR: str = LOADING_DATASETS_DIR
PRIVATE_REFERENCE_DATASETS_DIR: str = PRIVATE_REFERENCE_DATASETS_DIR
PROJECT_ID: str | None = PROJECT_ID
REFERENCE_DATASETS: str = REFERENCE_DATASETS
REFERENCE_DATASETS_DIR: str = REFERENCE_DATASETS_DIR
SHOULD_REGISTER_ALLELES: bool = SHOULD_REGISTER_ALLELES
34 changes: 17 additions & 17 deletions v03_pipeline/lib/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,9 @@ def _v03_reference_data_prefix(
reference_genome: ReferenceGenome,
) -> str:
root = (
Env.PRIVATE_REFERENCE_DATASETS
Env.PRIVATE_REFERENCE_DATASETS_DIR
if access_control == AccessControl.PRIVATE
else Env.REFERENCE_DATASETS
else Env.REFERENCE_DATASETS_DIR
)
return os.path.join(
root,
Expand Down Expand Up @@ -67,7 +67,7 @@ def family_table_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
Env.HAIL_SEARCH_DATA_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -84,7 +84,7 @@ def imputed_sex_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.LOADING_DATASETS,
Env.LOADING_DATASETS_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -100,7 +100,7 @@ def imported_callset_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.LOADING_DATASETS,
Env.LOADING_DATASETS_DIR,
reference_genome,
dataset_type,
),
Expand Down Expand Up @@ -132,7 +132,7 @@ def project_table_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
Env.HAIL_SEARCH_DATA_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -149,7 +149,7 @@ def relatedness_check_table_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.LOADING_DATASETS,
Env.LOADING_DATASETS_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -166,7 +166,7 @@ def remapped_and_subsetted_callset_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.LOADING_DATASETS,
Env.LOADING_DATASETS_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -182,7 +182,7 @@ def lookup_table_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
Env.HAIL_SEARCH_DATA_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -196,7 +196,7 @@ def runs_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
Env.HAIL_SEARCH_DATA_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -211,7 +211,7 @@ def sex_check_table_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.LOADING_DATASETS,
Env.LOADING_DATASETS_DIR,
reference_genome,
dataset_type,
),
Expand Down Expand Up @@ -265,7 +265,7 @@ def variant_annotations_table_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
Env.HAIL_SEARCH_DATA_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -279,7 +279,7 @@ def variant_annotations_vcf_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.HAIL_SEARCH_DATA,
Env.HAIL_SEARCH_DATA_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -304,7 +304,7 @@ def new_variants_table_path(

def clinvar_dataset_path(reference_genome: ReferenceGenome, etag: str) -> str:
return os.path.join(
Env.HAIL_TMPDIR,
Env.HAIL_TMP_DIR,
f'clinvar-{reference_genome.value}-{etag}.ht',
)

Expand All @@ -317,7 +317,7 @@ def project_remap_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.LOADING_DATASETS,
Env.LOADING_DATASETS_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -335,7 +335,7 @@ def project_pedigree_path(
) -> str:
return os.path.join(
_pipeline_prefix(
Env.LOADING_DATASETS,
Env.LOADING_DATASETS_DIR,
reference_genome,
dataset_type,
),
Expand All @@ -347,7 +347,7 @@ def project_pedigree_path(

def loading_pipeline_queue_path() -> str:
return os.path.join(
Env.LOADING_DATASETS,
Env.LOADING_DATASETS_DIR,
'loading_pipeline_queue',
'request.json',
)
2 changes: 1 addition & 1 deletion v03_pipeline/lib/paths_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ def test_family_table_path(self) -> None:
'/seqr/hail-search-data/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht',
)
with patch('v03_pipeline.lib.paths.Env') as mock_env:
mock_env.HAIL_SEARCH_DATA = 'gs://seqr-datasets/'
mock_env.HAIL_SEARCH_DATA_DIR = 'gs://seqr-datasets/'
self.assertEqual(
family_table_path(
ReferenceGenome.GRCh37,
Expand Down
4 changes: 2 additions & 2 deletions v03_pipeline/lib/reference_data/clinvar.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def download_and_import_latest_clinvar_vcf(
with tempfile.NamedTemporaryFile(suffix='.vcf.gz', delete=False) as tmp_file:
urllib.request.urlretrieve(clinvar_url, tmp_file.name) # noqa: S310
gcs_tmp_file_name = os.path.join(
Env.HAIL_TMPDIR,
Env.HAIL_TMP_DIR,
os.path.basename(tmp_file.name),
)
safely_move_to_gcs(tmp_file.name, gcs_tmp_file_name)
Expand Down Expand Up @@ -203,7 +203,7 @@ def download_and_import_clinvar_submission_summary() -> hl.Table:
shutil.copyfileobj(f_in, f_out)

gcs_tmp_file_name = os.path.join(
Env.HAIL_TMPDIR,
Env.HAIL_TMP_DIR,
os.path.basename(unzipped_tmp_file.name),
)
safely_move_to_gcs(unzipped_tmp_file.name, gcs_tmp_file_name)
Expand Down
2 changes: 1 addition & 1 deletion v03_pipeline/lib/tasks/base/base_hail_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def complete(self) -> bool:

def init_hail(self):
# Need to use the GCP bucket as temp storage for very large callset joins
hl.init(tmp_dir=Env.HAIL_TMPDIR, idempotent=True)
hl.init(tmp_dir=Env.HAIL_TMP_DIR, idempotent=True)

# Interval ref data join causes shuffle death, this prevents it
hl._set_flags(use_new_shuffle='1', no_whole_stage_codegen='1') # noqa: SLF001
Expand Down
1 change: 1 addition & 0 deletions v03_pipeline/lib/tasks/base/base_update_lookup_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,6 @@ def initialize_table(self) -> hl.Table:
remap_pedigree_hash=hl.tint32,
),
),
migrations=hl.empty_array(hl.tstr),
),
)
1 change: 1 addition & 0 deletions v03_pipeline/lib/tasks/update_lookup_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,5 +115,6 @@ def update_table(self, ht: hl.Table) -> hl.Table:
),
),
),
migrations=ht.migrations,
)
return ht
2 changes: 2 additions & 0 deletions v03_pipeline/lib/tasks/update_lookup_table_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def test_skip_update_lookup_table_task(self) -> None:
),
),
},
migrations=[],
),
],
)
Expand Down Expand Up @@ -89,6 +90,7 @@ def test_update_lookup_table_task(self) -> None:
),
),
},
migrations=[],
),
],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def test_delete_project_empty_table(
project_sample_types=[],
project_families={},
updates=set(),
migrations=[],
),
],
)
Expand Down Expand Up @@ -137,6 +138,7 @@ def test_delete_project(
remap_pedigree_hash=123,
),
},
migrations=hl.empty_array(hl.tstr),
),
)
worker = luigi.worker.Worker()
Expand Down Expand Up @@ -172,6 +174,7 @@ def test_delete_project(
remap_pedigree_hash=123,
),
},
migrations=[],
),
],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def test_delete_project_empty_table(
project_sample_types=[],
project_families={},
updates=set(),
migrations=[],
),
],
)
Expand Down Expand Up @@ -136,6 +137,7 @@ def test_delete_project(
remap_pedigree_hash=123,
),
},
migrations=hl.empty_array(hl.tstr),
),
)
worker = luigi.worker.Worker()
Expand All @@ -162,6 +164,7 @@ def test_delete_project(
remap_pedigree_hash=123,
),
},
migrations=[],
),
],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ def setUp(self) -> None:
remap_pedigree_hash=123,
),
},
migrations=hl.empty_array(hl.tstr),
),
)
ht.write(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1221,7 +1221,7 @@ def test_sv_update_vat(
self.assertTrue(update_variant_annotations_task.complete())
self.assertFalse(
GCSorLocalFolderTarget(
f'{self.mock_env.REFERENCE_DATASETS}/v03/GRCh38/SV/lookup.ht',
f'{self.mock_env.REFERENCE_DATASETS_DIR}/v03/GRCh38/SV/lookup.ht',
).exists(),
)
ht = hl.read_table(update_variant_annotations_task.output().path)
Expand Down Expand Up @@ -1800,7 +1800,7 @@ def test_gcnv_update_vat(
self.assertTrue(update_variant_annotations_task.complete())
self.assertFalse(
GCSorLocalFolderTarget(
f'{self.mock_env.REFERENCE_DATASETS}/v03/GRCh38/GCNV/lookup.ht',
f'{self.mock_env.REFERENCE_DATASETS_DIR}/v03/GRCh38/GCNV/lookup.ht',
).exists(),
)
ht = hl.read_table(update_variant_annotations_task.output().path)
Expand Down
Loading