diff --git a/pyproject.toml b/pyproject.toml index 73826b005..2ffbdfe04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,6 @@ extend-exclude = [ 'luigi_pipeline/lib/*', 'luigi_pipeline/seqr*.py', 'luigi_pipeline/tests/data/*', - 'v03_pipeline/lib/reference_data/gencode/*', ] ignore = [ # Individual Rules diff --git a/requirements.in b/requirements.in index 3cff99f79..af19fd655 100644 --- a/requirements.in +++ b/requirements.in @@ -3,6 +3,5 @@ google-api-python-client>=1.8.0 hail==0.2.132 luigi>=3.4.0 gnomad==0.6.4 -google-cloud-storage>=2.14.0 aiofiles==24.1.0 pydantic==2.8.2 diff --git a/requirements.txt b/requirements.txt index 4083931f8..a565a7c41 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,9 +8,9 @@ aiodns==2.0.0 # via hail aiofiles==24.1.0 # via -r requirements.in -aiohappyeyeballs==2.3.5 +aiohappyeyeballs==2.4.3 # via aiohttp -aiohttp==3.10.2 +aiohttp==3.10.10 # via # hail # slackclient @@ -22,122 +22,105 @@ asttokens==2.4.1 # via stack-data async-timeout==4.0.3 # via aiohttp -attrs==23.1.0 +attrs==24.2.0 # via aiohttp avro==1.11.3 # via hail azure-common==1.1.28 # via azure-mgmt-storage -azure-core==1.29.5 +azure-core==1.31.0 # via # azure-identity # azure-mgmt-core # azure-storage-blob # msrest -azure-identity==1.16.1 +azure-identity==1.19.0 # via hail azure-mgmt-core==1.4.0 # via azure-mgmt-storage azure-mgmt-storage==20.1.0 # via hail -azure-storage-blob==12.19.0 +azure-storage-blob==12.23.1 # via hail -bokeh==3.3.1 +bokeh==3.3.4 # via hail -boto3==1.33.1 +boto3==1.35.48 # via hail -botocore==1.33.1 +botocore==1.35.48 # via # boto3 # hail # s3transfer -cachetools==5.3.2 +cachetools==5.5.0 # via google-auth -certifi==2023.11.17 +certifi==2024.8.30 # via # elasticsearch # msrest # requests -cffi==1.16.0 +cffi==1.17.1 # via # cryptography # pycares -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via requests click==8.1.7 # via typer -comm==0.2.0 +comm==0.2.2 # via ipywidgets commonmark==0.9.1 # via rich -contourpy==1.2.0 
+contourpy==1.3.0 # via bokeh -cryptography==43.0.1 +cryptography==43.0.3 # via # azure-identity # azure-storage-blob # msal # pyjwt -cython==0.29.36 - # via hdbscan decorator==4.4.2 # via # hail # ipython deprecated==1.2.14 # via hail -dill==0.3.7 +dill==0.3.9 # via hail -docutils==0.20.1 +docutils==0.21.2 # via python-daemon elasticsearch==7.9.1 # via -r requirements.in -exceptiongroup==1.2.0 +exceptiongroup==1.2.2 # via ipython -executing==2.0.1 +executing==2.1.0 # via stack-data -frozenlist==1.4.0 +frozenlist==1.5.0 # via # aiohttp # aiosignal # hail gnomad==0.6.4 # via -r requirements.in -google-api-core==2.14.0 - # via - # google-api-python-client - # google-cloud-core - # google-cloud-storage -google-api-python-client==2.108.0 +google-api-core==2.21.0 + # via google-api-python-client +google-api-python-client==2.149.0 # via -r requirements.in -google-auth==2.23.4 +google-auth==2.35.0 # via # google-api-core # google-api-python-client # google-auth-httplib2 # google-auth-oauthlib - # google-cloud-core - # google-cloud-storage # hail -google-auth-httplib2==0.1.1 +google-auth-httplib2==0.2.0 # via google-api-python-client google-auth-oauthlib==0.8.0 # via hail -google-cloud-core==2.4.1 - # via google-cloud-storage -google-cloud-storage==2.14.0 - # via -r requirements.in -google-crc32c==1.5.0 - # via - # google-cloud-storage - # google-resumable-media -google-resumable-media==2.7.0 - # via google-cloud-storage -googleapis-common-protos==1.61.0 +googleapis-common-protos==1.65.0 # via google-api-core hail==0.2.132 # via -r requirements.in -hdbscan==0.8.33 +hdbscan==0.8.39 # via gnomad httplib2==0.22.0 # via @@ -145,15 +128,15 @@ httplib2==0.22.0 # google-auth-httplib2 humanize==1.1.0 # via hail -idna==3.6 +idna==3.10 # via # requests # yarl -ipython==8.18.1 +ipython==8.28.0 # via ipywidgets -ipywidgets==8.1.1 +ipywidgets==8.1.5 # via gnomad -isodate==0.6.1 +isodate==0.7.2 # via # azure-storage-blob # msrest @@ -161,43 +144,43 @@ janus==1.0.0 # via hail 
jedi==0.19.1 # via ipython -jinja2==3.1.3 +jinja2==3.1.4 # via bokeh jmespath==1.0.1 # via # boto3 # botocore -joblib==1.3.2 +joblib==1.4.2 # via # hdbscan # scikit-learn -jproperties==2.1.1 +jproperties==2.1.2 # via hail -jupyterlab-widgets==3.0.9 +jupyterlab-widgets==3.0.13 # via ipywidgets lockfile==0.12.2 # via python-daemon -luigi==3.4.0 +luigi==3.5.2 # via -r requirements.in -markupsafe==2.1.3 +markupsafe==3.0.2 # via jinja2 -matplotlib-inline==0.1.6 +matplotlib-inline==0.1.7 # via ipython -msal==1.28.0 +msal==1.31.0 # via # azure-identity # msal-extensions -msal-extensions==1.0.0 +msal-extensions==1.2.0 # via azure-identity msrest==0.7.1 # via azure-mgmt-storage -multidict==6.0.4 +multidict==6.1.0 # via # aiohttp # yarl -nest-asyncio==1.5.8 +nest-asyncio==1.6.0 # via hail -numpy==1.26.2 +numpy==1.26.4 # via # bokeh # contourpy @@ -208,102 +191,108 @@ numpy==1.26.2 # scipy oauthlib==3.2.2 # via requests-oauthlib -orjson==3.10.6 +orjson==3.10.10 # via hail -packaging==23.2 +packaging==24.1 # via # bokeh # plotly -pandas==2.1.3 +pandas==2.2.3 # via # bokeh # hail parsimonious==0.10.0 # via hail -parso==0.8.3 +parso==0.8.4 # via jedi pexpect==4.9.0 # via ipython -pillow==10.3.0 +pillow==11.0.0 # via bokeh -plotly==5.18.0 +plotly==5.24.1 # via hail -portalocker==2.8.2 +portalocker==2.10.1 # via msal-extensions -prompt-toolkit==3.0.41 +prompt-toolkit==3.0.48 # via ipython +propcache==0.2.0 + # via yarl +proto-plus==1.25.0 + # via google-api-core protobuf==3.20.2 # via # google-api-core # googleapis-common-protos # hail + # proto-plus ptyprocess==0.7.0 # via pexpect -pure-eval==0.2.2 +pure-eval==0.2.3 # via stack-data py4j==0.10.9.7 # via pyspark -pyasn1==0.5.1 +pyasn1==0.6.1 # via # pyasn1-modules # rsa -pyasn1-modules==0.3.0 +pyasn1-modules==0.4.1 # via google-auth pycares==4.4.0 # via aiodns -pycparser==2.21 +pycparser==2.22 # via cffi pydantic==2.8.2 # via -r requirements.in pydantic-core==2.20.1 # via pydantic -pygments==2.17.2 +pygments==2.18.0 # via # 
ipython # rich -pyjwt[crypto]==2.8.0 +pyjwt[crypto]==2.9.0 # via msal -pyparsing==3.1.1 +pyparsing==3.2.0 # via httplib2 -pyspark==3.5.1 +pyspark==3.5.3 # via hail python-daemon==3.0.1 # via luigi -python-dateutil==2.8.2 +python-dateutil==2.9.0.post0 # via # botocore # luigi # pandas python-json-logger==2.0.7 # via hail -pytz==2023.3.post1 +pytz==2024.2 # via pandas -pyyaml==6.0.1 +pyyaml==6.0.2 # via # bokeh # hail -regex==2023.10.3 +regex==2024.9.11 # via parsimonious -requests==2.31.0 +requests==2.32.3 # via # azure-core # google-api-core - # google-cloud-storage # hail # msal # msrest # requests-oauthlib -requests-oauthlib==1.3.1 +requests-oauthlib==2.0.0 # via # google-auth-oauthlib # msrest rich==12.6.0 - # via hail + # via + # hail + # typer rsa==4.9 # via google-auth -s3transfer==0.8.0 +s3transfer==0.10.3 # via boto3 -scikit-learn==1.5.0 +scikit-learn==1.5.2 # via # gnomad # hdbscan @@ -312,11 +301,12 @@ scipy==1.11.4 # hail # hdbscan # scikit-learn +shellingham==1.5.4 + # via typer six==1.16.0 # via # asttokens # azure-core - # isodate # jproperties # python-dateutil slackclient==2.5.0 @@ -327,52 +317,55 @@ stack-data==0.6.3 # via ipython tabulate==0.9.0 # via hail -tenacity==8.2.3 +tenacity==8.5.0 # via # luigi # plotly -threadpoolctl==3.2.0 +threadpoolctl==3.5.0 # via scikit-learn tornado==6.4.1 # via # bokeh # luigi -traitlets==5.14.0 +traitlets==5.14.3 # via # comm # ipython # ipywidgets # matplotlib-inline -typer==0.9.0 +typer==0.12.5 # via hail -typing-extensions==4.8.0 +typing-extensions==4.12.2 # via # azure-core + # azure-identity # azure-storage-blob + # ipython # janus + # multidict # pydantic # pydantic-core # typer -tzdata==2023.3 +tzdata==2024.2 # via pandas uritemplate==4.1.1 # via google-api-python-client -urllib3==2.0.7 +urllib3==2.2.3 # via # botocore # elasticsearch # requests -uvloop==0.19.0 +uvloop==0.21.0 # via hail -wcwidth==0.2.12 +wcwidth==0.2.13 # via prompt-toolkit -widgetsnbextension==4.0.9 +widgetsnbextension==4.0.13 # via 
ipywidgets wrapt==1.16.0 # via deprecated -xyzservices==2023.10.1 +xyzservices==2024.9.0 # via bokeh -yarl==1.9.3 +yarl==1.16.0 # via aiohttp # The following packages are considered to be unsafe in a requirements file: diff --git a/v03_pipeline/bin/rsync_reference_data.bash b/v03_pipeline/bin/rsync_reference_data.bash index 9dfc91d74..825c583e5 100755 --- a/v03_pipeline/bin/rsync_reference_data.bash +++ b/v03_pipeline/bin/rsync_reference_data.bash @@ -16,12 +16,34 @@ case $REFERENCE_GENOME in exit 1 esac -mkdir -p $REFERENCE_DATASETS_DIR/$REFERENCE_GENOME; +case $REFERENCE_DATASETS_DIR in + "gs://seqr-reference-data") + echo "Cannot rsync to the authoritative source" + exit 1 + ;; + *) + ;; +esac -if [ -f "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS ]; then - echo "Skipping rsync because already successful" - exit 0; +if ! [[ "$REFERENCE_DATASETS_DIR" =~ gs://* ]]; then + mkdir -p $REFERENCE_DATASETS_DIR/$REFERENCE_GENOME; + if [ -f "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS ]; then + echo "Skipping rsync because already successful" + exit 0; + fi +else + result=$(gsutil -q stat "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS || echo 1) + if [[ $result != 1 ]]; then + echo "Skipping rsync because already successful" + exit 0; + fi fi gsutil -m rsync -rd "gs://seqr-reference-data/v03/$REFERENCE_GENOME" $REFERENCE_DATASETS_DIR/$REFERENCE_GENOME -touch "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS +if ! 
[[ $REFERENCE_DATASETS_DIR =~ gs://* ]]; then + touch "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS +else + touch _SUCCESS + gsutil cp _SUCCESS "$REFERENCE_DATASETS_DIR"/"$REFERENCE_GENOME"/_SUCCESS + rm -rf _SUCCESS +fi diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 2295951f3..0ae158866 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -187,6 +187,22 @@ def relatedness_check_table_path( ) +def relatedness_check_tsv_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, + callset_path: str, +) -> str: + return os.path.join( + _pipeline_prefix( + Env.LOADING_DATASETS_DIR, + reference_genome, + dataset_type, + ), + 'relatedness_check', + f'{hashlib.sha256(callset_path.encode("utf8")).hexdigest()}.tsv', + ) + + def remapped_and_subsetted_callset_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/reference_data/clinvar.py b/v03_pipeline/lib/reference_data/clinvar.py index fd5d47561..3e482e0b6 100644 --- a/v03_pipeline/lib/reference_data/clinvar.py +++ b/v03_pipeline/lib/reference_data/clinvar.py @@ -120,6 +120,7 @@ def download_and_import_latest_clinvar_vcf( clinvar_url: str, reference_genome: ReferenceGenome, ) -> hl.Table: + version = parse_clinvar_release_date(clinvar_url) with tempfile.NamedTemporaryFile(suffix='.vcf.gz', delete=False) as tmp_file: urllib.request.urlretrieve(clinvar_url, tmp_file.name) # noqa: S310 cached_tmp_file_name = os.path.join( @@ -139,27 +140,20 @@ def download_and_import_latest_clinvar_vcf( min_partitions=MIN_HT_PARTITIONS, force_bgz=True, ) - mt = mt.annotate_globals(version=_parse_clinvar_release_date(tmp_file.name)) + mt = mt.annotate_globals(version=version) return join_to_submission_summary_ht(mt.rows()) -def _parse_clinvar_release_date(local_vcf_path: str) -> str: - """Parse clinvar release date from the VCF header. - - Args: - local_vcf_path (str): clinvar vcf path on the local file system. 
- - Returns: - str: return VCF release date as string, or None if release date not found in header. - """ - with gzip.open(local_vcf_path, 'rt') as f: - for line in f: - if line.startswith('##fileDate='): - return line.split('=')[-1].strip() - - if not line.startswith('#'): - return None - +def parse_clinvar_release_date(clinvar_url: str) -> str: + response = requests.get(clinvar_url, stream=True, timeout=10) + for byte_line in gzip.GzipFile(fileobj=response.raw): + line = byte_line.decode('ascii').strip() + if not line: + continue + if line.startswith('##fileDate='): + return line.split('=')[-1].strip() + if not line.startswith('#'): + return None return None diff --git a/v03_pipeline/lib/reference_data/clinvar_test.py b/v03_pipeline/lib/reference_data/clinvar_test.py index 8e1b509ff..fd8d4e832 100644 --- a/v03_pipeline/lib/reference_data/clinvar_test.py +++ b/v03_pipeline/lib/reference_data/clinvar_test.py @@ -1,17 +1,43 @@ +import gzip import unittest from unittest import mock import hail as hl +import responses from v03_pipeline.lib.reference_data.clinvar import ( import_submission_table, join_to_submission_summary_ht, + parse_clinvar_release_date, parsed_and_mapped_clnsigconf, parsed_clnsig, ) +CLINVAR_VCF_DATA = b""" +##fileformat=VCFv4.1 +##fileDate=2024-10-27 +##source=ClinVar +##reference=GRCh37 +##ID= +##INFO= +""" + class ClinvarTest(unittest.TestCase): + @responses.activate + def test_parse_clinvar_release_date(self): + clinvar_url = ( + 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz' + ) + responses.get( + clinvar_url, + body=gzip.compress(CLINVAR_VCF_DATA), + ) + self.assertEqual( + parse_clinvar_release_date(clinvar_url), + '2024-10-27', + ) + def test_parsed_clnsig(self): ht = hl.Table.parallelize( [ diff --git a/v03_pipeline/lib/reference_data/compare_globals.py b/v03_pipeline/lib/reference_data/compare_globals.py index 1feb0ac12..c295b3a35 100644 --- a/v03_pipeline/lib/reference_data/compare_globals.py +++ 
b/v03_pipeline/lib/reference_data/compare_globals.py @@ -4,8 +4,10 @@ from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.model import ( + DatasetType, ReferenceGenome, ) +from v03_pipeline.lib.reference_data.clinvar import parse_clinvar_release_date from v03_pipeline.lib.reference_data.config import CONFIG from v03_pipeline.lib.reference_data.dataset_table_operations import ( get_all_select_fields, @@ -16,6 +18,17 @@ logger = get_logger(__name__) +def clinvar_versions_equal( + ht: hl.Table, + reference_genome: ReferenceGenome, + dataset_type: DatasetType, +) -> bool: + dataset = 'clinvar_mito' if dataset_type == DatasetType.MITO else 'clinvar' + return hl.eval(ht.globals.versions[dataset]) == parse_clinvar_release_date( + CONFIG[dataset][reference_genome.v02_value], + ) + + @dataclasses.dataclass class Globals: paths: dict[str, str] diff --git a/v03_pipeline/lib/reference_data/gencode/download_utils.py b/v03_pipeline/lib/reference_data/gencode/download_utils.py deleted file mode 100644 index 420e860b9..000000000 --- a/v03_pipeline/lib/reference_data/gencode/download_utils.py +++ /dev/null @@ -1,112 +0,0 @@ -import logging -import os -import tempfile -from contextlib import contextmanager - -import hail as hl -import requests -from google.cloud import storage - -logger = logging.getLogger(__name__) - - -def parse_gs_path_to_bucket(gs_path): - bucket_name = gs_path.replace('gs://', '').split('/')[0] - file_name = gs_path.split(bucket_name)[-1].lstrip('/') - - storage_client = storage.Client() - bucket = storage_client.bucket(bucket_name) - - return bucket, file_name - - -def stream_gs_file(gs_path, raw_download=False): - logger.info(f'Stream from GCS: {gs_path}') - bucket, file_name = parse_gs_path_to_bucket(gs_path) - - blob = bucket.get_blob(file_name) - - return blob and blob.download_as_string(raw_download=raw_download) - - -@contextmanager -def file_writer(file_path, get_existing_size=False): - bucket = None - size = None - if 
is_gs_path(file_path): - local_file_path = os.path.join( - tempfile.gettempdir(), - os.path.basename(file_path), - ) - bucket, file_name = parse_gs_path_to_bucket(file_path) - if get_existing_size: - blob = bucket.get_blob(file_name) - size = blob and blob.size - else: - local_file_path = file_path - if get_existing_size: - size = os.path.isfile(local_file_path) and os.path.getsize(local_file_path) - - local_file = open(local_file_path, 'wb') - - yield local_file, size - - local_file.close() - - if bucket: - blob = bucket.blob(file_name) - blob.upload_from_filename(local_file_path) - - -def is_gs_path(path): - return path.startswith('gs://') - - -def path_exists(path): - is_gs = is_gs_path(path) - return (is_gs and hl.hadoop_exists(path)) or (not is_gs and os.path.exists(path)) - - -DEFAULT_TO_DIR = tempfile.gettempdir() - - -def download_file(url, to_dir=None, verbose=True): - """Download the given file and returns its local path. - Args: - url (string): HTTP or FTP url - to_dir: optional save to directory - verbose: display verbose information - Returns: - string: local file path - """ - if to_dir is None: - to_dir = DEFAULT_TO_DIR - - if not (url and url.startswith(('http://', 'https://'))): - msg = f'Invalid url: {url}' - raise ValueError(msg) - remote_file_size = _get_remote_file_size(url) - - file_path = os.path.join(to_dir, os.path.basename(url)) - with file_writer(file_path, get_existing_size=True) as fw: - f, file_size = fw - if file_size and file_size == remote_file_size: - logger.info( - f'Re-using {file_path} previously downloaded from {url}', - ) - return file_path - - is_gz = url.endswith('.gz') - response = requests.get(url, stream=is_gz) - input_iter = response if is_gz else response.iter_content() - if verbose: - logger.info(f'Downloading {url} to {file_path}') - - f.writelines(input_iter) - input_iter.close() - - return file_path - - -def _get_remote_file_size(url): - return int(requests.head(url).headers.get('Content-Length', '0')) diff --git 
a/v03_pipeline/lib/reference_data/gencode/download_utils_test.py b/v03_pipeline/lib/reference_data/gencode/download_utils_test.py deleted file mode 100644 index d990bf6f6..000000000 --- a/v03_pipeline/lib/reference_data/gencode/download_utils_test.py +++ /dev/null @@ -1,130 +0,0 @@ -import unittest -from unittest import mock - -import responses - -from v03_pipeline.lib.reference_data.gencode.download_utils import download_file - -DEFAULT_TEST_DIR = 'default_test/dir' -TEST_DIR = 'test/dir' -GS_TEST_DIR = 'gs://test-bucket/test/dir' -TEST_TXT_FILE = 'test_file.txt' -TEST_GZ_FILE = 'test_file.gz' -TXT_DATA_URL = 'https://mock_url/test_file.txt' -GZ_DATA_URL = 'https://mock_url/test_file.gz' -GZ_DATA = b'test data\nanother line\n' - - -class DownloadUtilsTest(unittest.TestCase): - @responses.activate - @mock.patch( - 'v03_pipeline.lib.reference_data.gencode.download_utils.DEFAULT_TO_DIR', - DEFAULT_TEST_DIR, - ) - @mock.patch('v03_pipeline.lib.reference_data.gencode.download_utils.logger') - @mock.patch('v03_pipeline.lib.reference_data.gencode.download_utils.os.path.isfile') - @mock.patch( - 'v03_pipeline.lib.reference_data.gencode.download_utils.os.path.getsize', - ) - @mock.patch('v03_pipeline.lib.reference_data.gencode.download_utils.open') - @mock.patch( - 'v03_pipeline.lib.reference_data.gencode.download_utils.tempfile.gettempdir', - ) - @mock.patch( - 'v03_pipeline.lib.reference_data.gencode.download_utils.parse_gs_path_to_bucket', - ) - def test_download_file( - self, - mock_get_bucket, - mock_gettempdir, - mock_open, - mock_getsize, - mock_isfile, - mock_logger, - ): - responses.add( - responses.HEAD, - GZ_DATA_URL, - headers={'Content-Length': '1024'}, - status=200, - body=b' ' * 1024, - ) - responses.add(responses.GET, GZ_DATA_URL, body=GZ_DATA) - responses.add( - responses.HEAD, - TXT_DATA_URL, - headers={'Content-Length': '1024'}, - status=200, - body=b' ' * 1024, - ) - responses.add(responses.GET, TXT_DATA_URL, body='test data\nanother line\n') - - # Test 
bad url - with self.assertRaises(ValueError) as ve: - download_file('bad_url') - self.assertEqual(str(ve.exception), 'Invalid url: bad_url') - - # Test already downloaded - mock_isfile.return_value = True - mock_getsize.return_value = 1024 - result = download_file(GZ_DATA_URL) - self.assertEqual(result, 'default_test/dir/test_file.gz') - mock_open.assert_called_with('default_test/dir/test_file.gz', 'wb') - mock_isfile.assert_called_with('default_test/dir/test_file.gz') - mock_getsize.assert_called_with('default_test/dir/test_file.gz') - mock_logger.info.assert_called_with( - f'Re-using default_test/dir/test_file.gz previously downloaded from {GZ_DATA_URL}', - ) - - # Test download, .gz file format, verbose - mock_isfile.reset_mock() - mock_getsize.reset_mock() - mock_logger.reset_mock() - mock_open.reset_mock() - mock_isfile.return_value = False - result = download_file(GZ_DATA_URL, TEST_DIR) - self.assertEqual(result, 'test/dir/test_file.gz') - mock_isfile.assert_called_with('test/dir/test_file.gz') - mock_getsize.assert_not_called() - mock_open.assert_called_with('test/dir/test_file.gz', 'wb') - mock_logger.info.assert_called_with( - f'Downloading {GZ_DATA_URL} to test/dir/test_file.gz', - ) - - # Test download, non-.gz file format, non-verbose - mock_isfile.reset_mock() - mock_logger.reset_mock() - mock_open.reset_mock() - mock_isfile.return_value = False - result = download_file(TXT_DATA_URL, TEST_DIR, verbose=False) - self.assertEqual(result, 'test/dir/test_file.txt') - mock_isfile.assert_called_with('test/dir/test_file.txt') - mock_getsize.assert_not_called() - mock_open.assert_called_with('test/dir/test_file.txt', 'wb') - mock_open.return_value.writelines.assert_called_once() - mock_logger.info.assert_not_called() - - mock_gettempdir.assert_not_called() - mock_get_bucket.assert_not_called() - - # Test using Google Storage - mock_isfile.reset_mock() - mock_logger.reset_mock() - mock_open.reset_mock() - mock_gettempdir.return_value = TEST_DIR - mock_bucket = 
mock.MagicMock() - mock_get_bucket.return_value = mock_bucket, 'test/dir/test_file.gz' - result = download_file(GZ_DATA_URL, GS_TEST_DIR) - self.assertEqual(result, 'gs://test-bucket/test/dir/test_file.gz') - mock_gettempdir.assert_called_once() - mock_isfile.assert_not_called() - mock_getsize.assert_not_called() - mock_open.assert_called_with('test/dir/test_file.gz', 'wb') - mock_logger.info.assert_called_with( - f'Downloading {GZ_DATA_URL} to gs://test-bucket/test/dir/test_file.gz', - ) - mock_bucket.get_blob.assert_called_with('test/dir/test_file.gz') - mock_bucket.blob.assert_called_with('test/dir/test_file.gz') - mock_bucket.blob.return_value.upload_from_filename.assert_called_with( - 'test/dir/test_file.gz', - ) diff --git a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids.py b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids.py index fed40301f..96597f815 100644 --- a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids.py +++ b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids.py @@ -1,21 +1,8 @@ import gzip import logging -import os -import pickle import requests -from v03_pipeline.lib.reference_data.gencode.download_utils import ( - download_file, - file_writer, - is_gs_path, - path_exists, - stream_gs_file, -) - -GENOME_VERSION_GRCh37 = '37' -GENOME_VERSION_GRCh38 = '38' - logger = logging.getLogger(__name__) GENCODE_GTF_URL = 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{gencode_release}/gencode.v{gencode_release}.annotation.gtf.gz' @@ -33,115 +20,43 @@ 'phase', 'info', ] +EXPECTED_ENSEMBLE_TO_REFSEQ_FIELDS = 3 -def _get_pickle_file(path): - root, ext = os.path.splitext(path) - return root + '.pickle' - - -def _load_parsed_data_or_download(gencode_release, download_path): - gene_id_mapping = {} +def load_gencode_gene_symbol_to_gene_id(gencode_release: int) -> dict[str, str]: url = GENCODE_GTF_URL.format(gencode_release=gencode_release) - gencode_gtf_path = os.path.join(download_path, os.path.basename(url)) 
- pickle_file = _get_pickle_file(gencode_gtf_path) - if path_exists(pickle_file): - logger.info( - 'Use the existing pickle file {}.\nIf you want to reload the data, please delete it and re-run the data loading.'.format( - pickle_file, - ), - ) - if is_gs_path(pickle_file): - p = pickle.loads(stream_gs_file(pickle_file)) - else: - with open(pickle_file, 'rb') as handle: - p = pickle.load(handle) - gene_id_mapping.update(p) - elif not path_exists(gencode_gtf_path): - gencode_gtf_path = download_file(url, to_dir=download_path) - logger.info(f'Downloaded to {gencode_gtf_path}') - else: - logger.info( - 'Use the existing downloaded file {}.\nIf you want to re-download it, please delete the file and re-run the pipeline.'.format( - gencode_gtf_path, - ), - ) - - return gene_id_mapping, gencode_gtf_path - - -def _parse_gtf_data(gencode_gtf_path): - gene_id_mapping = {} - logger.info(f'Loading {gencode_gtf_path}') - is_gs = is_gs_path(gencode_gtf_path) - gencode_file = ( - gzip.decompress(stream_gs_file(gencode_gtf_path, raw_download=True)) - .decode() - .split('\n') - if is_gs - else gzip.open(gencode_gtf_path, 'rt') - ) - for i, line in enumerate(gencode_file): - line = line.rstrip('\r\n') + response = requests.get(url, stream=True, timeout=10) + gene_symbol_to_gene_id = {} + for line in gzip.GzipFile(fileobj=response.raw): + line = line.decode('ascii') # noqa: PLW2901 if not line or line.startswith('#'): continue - fields = line.split('\t') - + fields = line.strip().split('\t') if len(fields) != len(GENCODE_FILE_HEADER): + msg = f'Unexpected number of fields: {fields}' raise ValueError( - 'Unexpected number of fields on line #%s: %s' % (i, fields), + msg, ) - - record = dict(zip(GENCODE_FILE_HEADER, fields)) - + record = dict(zip(GENCODE_FILE_HEADER, fields, strict=False)) if record['feature_type'] != 'gene': continue - # parse info field info_fields = [x.strip().split() for x in record['info'].split(';') if x != ''] info_fields = {k: v.strip('"') for k, v in 
info_fields} + gene_symbol_to_gene_id[info_fields['gene_name']] = info_fields['gene_id'].split( + '.', + )[0] + return gene_symbol_to_gene_id - gene_id_mapping[info_fields['gene_name']] = info_fields['gene_id'].split('.')[0] - - if not is_gs: - gencode_file.close() - - pickle_file = _get_pickle_file(gencode_gtf_path) - logger.info(f'Saving to pickle {pickle_file}') - with file_writer(pickle_file) as fw: - f, _ = fw - pickle.dump(gene_id_mapping, f, protocol=pickle.HIGHEST_PROTOCOL) - - return gene_id_mapping - - -def load_gencode_gene_symbol_to_gene_id(gencode_release, download_path=''): - """Load Gencode to create a gene symbols to gene ids mapping table. - - Args: - gencode_release (int): the gencode release to load (eg. 25) - download_path (str): The path for downloaded data - """ - gene_id_mapping, gencode_gtf_path = _load_parsed_data_or_download( - gencode_release, - download_path, - ) - - if not gene_id_mapping: - gene_id_mapping = _parse_gtf_data(gencode_gtf_path) - - logger.info(f'Got {len(gene_id_mapping)} gene id mapping records') - return gene_id_mapping def load_gencode_ensembl_to_refseq_id(gencode_release: int): url = GENCODE_ENSEMBL_TO_REFSEQ_URL.format(gencode_release=gencode_release) - response = requests.get(url, stream=True) + response = requests.get(url, stream=True, timeout=10) ensembl_to_refseq_ids = {} for line in gzip.GzipFile(fileobj=response.raw): - line = line.decode('ascii').strip().split('\t') - if len(line) > 3: - raise ValueError( - 'Unexpected number of fields on line in ensemble_to_refseq mapping', - ) - ensembl_to_refseq_ids[line[0].split('.')[0]] = line[1] + fields = line.decode('ascii').strip().split('\t') + if len(fields) > EXPECTED_ENSEMBLE_TO_REFSEQ_FIELDS: + msg = 'Unexpected number of fields on line in ensemble_to_refseq mapping' + raise ValueError(msg) + ensembl_to_refseq_ids[fields[0].split('.')[0]] = fields[1] return ensembl_to_refseq_ids diff --git a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py 
b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py index 585278a7b..58c037048 100644 --- a/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py +++ b/v03_pipeline/lib/reference_data/gencode/mapping_gene_ids_tests.py @@ -1,162 +1,52 @@ import gzip import unittest -from unittest import mock import responses -from v03_pipeline.lib.reference_data.gencode.mapping_gene_ids import load_gencode_ensembl_to_refseq_id, load_gencode_gene_symbol_to_gene_id, GENCODE_ENSEMBL_TO_REFSEQ_URL +from v03_pipeline.lib.reference_data.gencode.mapping_gene_ids import ( + GENCODE_ENSEMBL_TO_REFSEQ_URL, + GENCODE_GTF_URL, + load_gencode_ensembl_to_refseq_id, + load_gencode_gene_symbol_to_gene_id, +) -DOWNLOAD_PATH = 'test/path' -GS_DOWNLOAD_PATH ='gs://test-bucket/test/path' -DOWNLOAD_FILE = 'test/path/gencode.v29.annotation.gtf.gz' -PICKLE_FILE = 'test/path/gencode.v29.annotation.gtf.pickle' -PICKLE_FILE_HANDLE = 'handle' GTF_DATA = [ - '#description: evidence-based annotation of the human genome, version 31 (Ensembl 97), mapped to GRCh37 with gencode-backmap\n', - 'chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5_2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";\n', - 'chr1 HAVANA gene 621059 622053 . - . gene_id "ENSG00000284662.1_2"; gene_type "protein_coding"; gene_name "OR4F16"; level 2; hgnc_id "HGNC:15079"; havana_gene "OTTHUMG00000002581.3_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";\n', - 'GL000193.1 HAVANA gene 77815 78162 . + . 
gene_id "ENSG00000279783.1_5"; gene_type "processed_pseudogene"; gene_name "AC018692.2"; level 2; havana_gene "OTTHUMG00000189459.1_5"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "new";\n', + '#description: evidence-based annotation of the human genome, version 31 (Ensembl 97), mapped to GRCh37 with gencode-backmap', + 'chr1 HAVANA gene 11869 14409 . + . gene_id "ENSG00000223972.5_2"; gene_type "transcribed_unprocessed_pseudogene"; gene_name "DDX11L1"; level 2; hgnc_id "HGNC:37102"; havana_gene "OTTHUMG00000000961.2_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";', + 'chr1 HAVANA gene 621059 622053 . - . gene_id "ENSG00000284662.1_2"; gene_type "protein_coding"; gene_name "OR4F16"; level 2; hgnc_id "HGNC:15079"; havana_gene "OTTHUMG00000002581.3_2"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "overlap";', + 'GL000193.1 HAVANA gene 77815 78162 . + . gene_id "ENSG00000279783.1_5"; gene_type "processed_pseudogene"; gene_name "AC018692.2"; level 2; havana_gene "OTTHUMG00000189459.1_5"; remap_status "full_contig"; remap_num_mappings 1; remap_target_status "new";', ] -GENE_ID_MAPPING = {"DDX11L1": "ENSG00000223972", "OR4F16": "ENSG00000284662", "AC018692.2": "ENSG00000279783"} +GENE_ID_MAPPING = { + 'DDX11L1': 'ENSG00000223972', + 'OR4F16': 'ENSG00000284662', + 'AC018692.2': 'ENSG00000279783', +} -ENSEMBL_TO_REFSEQ_DATA = b'''ENST00000424215.1\tNR_121638.1 +ENSEMBL_TO_REFSEQ_DATA = b"""ENST00000424215.1\tNR_121638.1 ENST00000378391.6\tNM_199454.3\tNP_955533.2 ENST00000270722.10\tNM_022114.4\tNP_071397.3 -ENST00000288774.8\tNM_001374425.1\tNP_001361354.1''' +ENST00000288774.8\tNM_001374425.1\tNP_001361354.1""" -class LoadGencodeTestCase(unittest.TestCase): - - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.logger') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.path_exists') - 
@mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.pickle') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.open') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.gzip.open') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.file_writer') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.download_file') - def test_load_gencode_local(self, mock_download_file, mock_file_writer, mock_gopen, mock_open, mock_pickle, - mock_path_exists, mock_logger): - # test using saved file - mock_path_exists.side_effect = [True] - mock_pickle.load.return_value = GENE_ID_MAPPING - gene_id_mapping = load_gencode_gene_symbol_to_gene_id(23, download_path=DOWNLOAD_PATH) - mock_file_writer.assert_not_called() - mock_download_file.assert_not_called() - mock_gopen.assert_not_called() - mock_open.assert_called_with('test/path/gencode.v23.annotation.gtf.pickle', 'rb') - mock_pickle.load.assert_called_with(mock_open.return_value.__enter__.return_value) - mock_path_exists.assert_called_with('test/path/gencode.v23.annotation.gtf.pickle') - mock_logger.info.assert_has_calls([ - mock.call('Use the existing pickle file test/path/gencode.v23.annotation.gtf.pickle.\nIf you want to reload the data, please delete it and re-run the data loading.'), - mock.call('Got 3 gene id mapping records'), - ]) - self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) - # test downloading and parsing gtf data - mock_path_exists.reset_mock() - mock_logger.reset_mock() - mock_pickle.reset_mock() - mock_open.reset_mock() - mock_path_exists.side_effect = [False, False] - mock_download_file.return_value = 'test/path/gencode.v24.annotation.gtf.gz' - mock_gopen.return_value.__iter__.return_value = GTF_DATA - mock_f = mock.MagicMock() - mock_file_writer.return_value.__enter__.return_value = mock_f, None - gene_id_mapping = load_gencode_gene_symbol_to_gene_id(24, download_path=DOWNLOAD_PATH) - 
self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) - mock_path_exists.assert_has_calls([ - mock.call('test/path/gencode.v24.annotation.gtf.pickle'), - mock.call('test/path/gencode.v24.annotation.gtf.gz'), - ]) - mock_download_file.assert_called_with( - 'http://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_24/gencode.v24.annotation.gtf.gz', - to_dir='test/path', +class LoadGencodeTestCase(unittest.TestCase): + @responses.activate + def test_load_gencode_gene_symbol_to_gene_id(self): + url = GENCODE_GTF_URL.format(gencode_release=12) + responses.add( + responses.GET, + url, + body=gzip.compress(('\n'.join(GTF_DATA)).encode()), + ) + mapping = load_gencode_gene_symbol_to_gene_id(12) + self.assertDictEqual( + mapping, + { + 'AC018692.2': 'ENSG00000279783', + 'DDX11L1': 'ENSG00000223972', + 'OR4F16': 'ENSG00000284662', + }, ) - mock_file_writer.assert_called_with('test/path/gencode.v24.annotation.gtf.pickle') - mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY) - mock_gopen.assert_called_with('test/path/gencode.v24.annotation.gtf.gz', 'rt') - mock_open.assert_not_called() - mock_logger.info.assert_has_calls([ - mock.call('Downloaded to test/path/gencode.v24.annotation.gtf.gz'), - mock.call('Loading test/path/gencode.v24.annotation.gtf.gz'), - mock.call('Saving to pickle test/path/gencode.v24.annotation.gtf.pickle'), - mock.call('Got 3 gene id mapping records') - ]) - mock_pickle.load.assert_not_called() - - # test using downloaded file - mock_path_exists.reset_mock() - mock_logger.reset_mock() - mock_download_file.reset_mock() - mock_pickle.reset_mock() - mock_path_exists.side_effect = [False, True] - mock_gopen.return_value.__iter__.return_value = GTF_DATA - gene_id_mapping = load_gencode_gene_symbol_to_gene_id(24, download_path=DOWNLOAD_PATH) - self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) - mock_path_exists.assert_has_calls([ - mock.call('test/path/gencode.v24.annotation.gtf.pickle'), - 
mock.call('test/path/gencode.v24.annotation.gtf.gz'), - ]) - mock_gopen.assert_called_with('test/path/gencode.v24.annotation.gtf.gz', 'rt') - mock_download_file.assert_not_called() - mock_file_writer.assert_called_with('test/path/gencode.v24.annotation.gtf.pickle') - mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY) - mock_open.assert_not_called() - mock_logger.info.assert_has_calls([ - mock.call('Use the existing downloaded file test/path/gencode.v24.annotation.gtf.gz.\nIf you want to re-download it, please delete the file and re-run the pipeline.'), - mock.call('Loading test/path/gencode.v24.annotation.gtf.gz'), - mock.call('Saving to pickle test/path/gencode.v24.annotation.gtf.pickle'), - mock.call('Got 3 gene id mapping records') - ]) - mock_pickle.load.assert_not_called() - - # bad gtf data test - mock_path_exists.side_effect = [False, False] - mock_gopen.return_value.__iter__.return_value = ['bad data'] - with self.assertRaises(ValueError) as ve: - _ = load_gencode_gene_symbol_to_gene_id(24, download_path=DOWNLOAD_PATH) - self.assertEqual(str(ve.exception), "Unexpected number of fields on line #0: ['bad data']") - - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.gzip') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.logger') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.path_exists') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.pickle') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.stream_gs_file') - @mock.patch('v03_pipeline.lib.reference_data.gencode.mapping_gene_ids.file_writer') - def test_load_gencode_using_gs(self, mock_file_writer, mock_stream_gs_file, mock_pickle, mock_path_exists, - mock_logger, mock_gzip): - - # test using saved file. 
- mock_path_exists.side_effect = [True] - mock_pickle.loads.return_value = GENE_ID_MAPPING - gene_id_mapping = load_gencode_gene_symbol_to_gene_id(25, download_path=GS_DOWNLOAD_PATH) - self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) - mock_path_exists.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle') - mock_logger.info.assert_has_calls([ - mock.call('Use the existing pickle file gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle.\n' - 'If you want to reload the data, please delete it and re-run the data loading.'), - mock.call('Got 3 gene id mapping records') - ]) - mock_stream_gs_file.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle') - mock_pickle.dump.assert_not_called() - mock_file_writer.assert_not_called() - - # test using downloaded file. - mock_path_exists.side_effect = [False, True] - mock_gzip.decompress.return_value = ''.join(GTF_DATA).encode() - mock_f = mock.MagicMock() - mock_file_writer.return_value.__enter__.return_value = mock_f, None - gene_id_mapping = load_gencode_gene_symbol_to_gene_id(25, download_path=GS_DOWNLOAD_PATH) - self.assertEqual(gene_id_mapping, GENE_ID_MAPPING) - mock_path_exists.assert_has_calls([ - mock.call('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle'), - mock.call('gs://test-bucket/test/path/gencode.v25.annotation.gtf.gz'), - ]) - mock_stream_gs_file.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.gz', raw_download=True) - mock_gzip.decompress.assert_called_with(mock_stream_gs_file.return_value) - mock_file_writer.assert_called_with('gs://test-bucket/test/path/gencode.v25.annotation.gtf.pickle') - mock_pickle.dump.assert_called_with(GENE_ID_MAPPING, mock_f, protocol=mock.ANY) - @responses.activate def test_load_gencode_ensembl_to_refseq_id(self): @@ -170,6 +60,5 @@ def test_load_gencode_ensembl_to_refseq_id(self): 'ENST00000378391': 'NM_199454.3', 'ENST00000270722': 'NM_022114.4', 'ENST00000288774': 
'NM_001374425.1', - } + }, ) - diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index cde621c4f..7c79b00d6 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -19,6 +19,9 @@ class BaseLoadingRunParams(luigi.Task): run_id = luigi.Parameter() sample_type = luigi.EnumParameter(enum=SampleType) callset_path = luigi.Parameter() + project_guids = luigi.ListParameter(default=[]) + project_remap_paths = luigi.ListParameter(default=[]) + project_pedigree_paths = luigi.ListParameter(default=[]) ignore_missing_samples_when_remapping = luigi.BoolParameter( default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, diff --git a/v03_pipeline/lib/tasks/base/base_project_info_params.py b/v03_pipeline/lib/tasks/base/base_project_info_params.py deleted file mode 100644 index 3bb5f5873..000000000 --- a/v03_pipeline/lib/tasks/base/base_project_info_params.py +++ /dev/null @@ -1,11 +0,0 @@ -import luigi -import luigi.util - -from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams - - -@luigi.util.inherits(BaseLoadingRunParams) -class BaseLoadingRunWithProjectInfoParams(luigi.Task): - project_guids = luigi.ListParameter() - project_remap_paths = luigi.ListParameter() - project_pedigree_paths = luigi.ListParameter() diff --git a/v03_pipeline/lib/tasks/base/base_update_project_table.py b/v03_pipeline/lib/tasks/base/base_update_project_table.py deleted file mode 100644 index 473a31bc2..000000000 --- a/v03_pipeline/lib/tasks/base/base_update_project_table.py +++ /dev/null @@ -1,42 +0,0 @@ -import hail as hl -import luigi - -from v03_pipeline.lib.model import SampleType -from v03_pipeline.lib.paths import project_table_path -from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask -from v03_pipeline.lib.tasks.files import GCSorLocalTarget - - -class BaseUpdateProjectTableTask(BaseUpdateTask): - 
sample_type = luigi.EnumParameter(enum=SampleType) - project_guid = luigi.Parameter() - - def output(self) -> luigi.Target: - return GCSorLocalTarget( - project_table_path( - self.reference_genome, - self.dataset_type, - self.sample_type, - self.project_guid, - ), - ) - - def initialize_table(self) -> hl.Table: - key_type = self.dataset_type.table_key_type(self.reference_genome) - return hl.Table.parallelize( - [], - hl.tstruct( - **key_type, - filters=hl.tset(hl.tstr), - # NB: entries is missing here because it is untyped - # until we read the type off of the first callset aggregation. - ), - key=key_type.fields, - globals=hl.Struct( - family_guids=hl.empty_array(hl.tstr), - family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), - updates=hl.empty_set( - hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), - ), - ), - ) diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index 21b0253b3..31c718034 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -36,16 +36,12 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [ - UpdateCachedReferenceDatasetQueries( - reference_genome=self.reference_genome, - dataset_type=self.dataset_type, - ), + self.clone(UpdateCachedReferenceDatasetQueries), ] requirements.extend( - UpdatedReferenceDatasetCollectionTask( - self.reference_genome, - self.dataset_type, - rdc, + self.clone( + UpdatedReferenceDatasetCollectionTask, + reference_dataset_collection=rdc, ) for rdc in ReferenceDatasetCollection.for_reference_genome_dataset_type( self.reference_genome, diff --git a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py index 58e2cae18..dc9c2a17e 100644 --- 
a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py +++ b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries.py @@ -4,15 +4,15 @@ from v03_pipeline.lib.model import ( CachedReferenceDatasetQuery, ) -from v03_pipeline.lib.tasks.base.base_loading_pipeline_params import ( - BaseLoadingPipelineParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query import ( UpdatedCachedReferenceDatasetQuery, ) -@luigi.util.inherits(BaseLoadingPipelineParams) +@luigi.util.inherits(BaseLoadingRunParams) class UpdateCachedReferenceDatasetQueries(luigi.Task): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py index 794a77897..d6bf33d36 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_cached_reference_dataset_queries_test.py @@ -7,6 +7,7 @@ CachedReferenceDatasetQuery, DatasetType, ReferenceGenome, + SampleType, ) from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import ( UpdateCachedReferenceDatasetQueries, @@ -21,99 +22,100 @@ class UpdateCachedReferenceDatasetQueriesTest(unittest.TestCase): def test_37_snv_indel(self, mock_crdq_task): mock_crdq_task.return_value = MockCompleteTask() worker = luigi.worker.Worker() + kwargs = { + 'sample_type': SampleType.WGS, + 'callset_path': '', + 'project_guids': [], + 'project_remap_paths': [], + 'project_pedigree_paths': [], + 'skip_validation': True, + 'run_id': '1', + } task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, + **kwargs, ) worker.add(task) worker.run() 
self.assertTrue(task.complete()) - mock_crdq_task.assert_has_calls( - [ - mock.call( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.GNOMAD_QC, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh37, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.HIGH_AF_VARIANTS, - ), - ], + call_args_list = mock_crdq_task.call_args_list + self.assertEqual(len(call_args_list), 4) + self.assertEqual( + [x.kwargs['crdq'] for x in call_args_list], + list(CachedReferenceDatasetQuery), ) def test_38_snv_indel(self, mock_crdq_task): mock_crdq_task.return_value = MockCompleteTask() worker = luigi.worker.Worker() + kwargs = { + 'sample_type': SampleType.WGS, + 'callset_path': '', + 'project_guids': [], + 'project_remap_paths': [], + 'project_pedigree_paths': [], + 'skip_validation': True, + 'run_id': '2', + } task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, + **kwargs, ) worker.add(task) worker.run() self.assertTrue(task.complete()) - mock_crdq_task.assert_has_calls( - [ - mock.call( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.GNOMAD_CODING_AND_NONCODING_VARIANTS, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.GNOMAD_QC, - ), - mock.call( - reference_genome=ReferenceGenome.GRCh38, - 
dataset_type=DatasetType.SNV_INDEL, - crdq=CachedReferenceDatasetQuery.HIGH_AF_VARIANTS, - ), - ], + call_args_list = mock_crdq_task.call_args_list + self.assertEqual(len(call_args_list), 4) + self.assertEqual( + [x.kwargs['crdq'] for x in call_args_list], + list(CachedReferenceDatasetQuery), ) def test_38_mito(self, mock_crdq_task): mock_crdq_task.return_value = MockCompleteTask() worker = luigi.worker.Worker() + kwargs = { + 'sample_type': SampleType.WGS, + 'callset_path': '', + 'project_guids': [], + 'project_remap_paths': [], + 'project_pedigree_paths': [], + 'skip_validation': True, + 'run_id': '3', + } task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, + **kwargs, ) worker.add(task) worker.run() self.assertTrue(task.complete()) - mock_crdq_task.assert_has_calls( - [ - mock.call( - reference_genome=ReferenceGenome.GRCh38, - dataset_type=DatasetType.MITO, - crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, - ), - ], + call_args_list = mock_crdq_task.call_args_list + self.assertEqual(len(call_args_list), 1) + self.assertEqual( + next(x.kwargs['crdq'] for x in call_args_list), + CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, ) def test_38_sv(self, mock_crdq_task): mock_crdq_task.return_value = MockCompleteTask() worker = luigi.worker.Worker() + kwargs = { + 'sample_type': SampleType.WGS, + 'callset_path': '', + 'project_guids': [], + 'project_remap_paths': [], + 'project_pedigree_paths': [], + 'skip_validation': True, + 'run_id': '4', + } task = UpdateCachedReferenceDatasetQueries( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SV, + **kwargs, ) worker.add(task) worker.run() diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py index f03526c50..9a0aeca2d 100644 --- 
a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset.py @@ -1,13 +1,18 @@ import hail as hl +import luigi from v03_pipeline.lib.annotations.fields import get_fields from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.model import ReferenceDatasetCollection from v03_pipeline.lib.reference_data.compare_globals import ( Globals, + clinvar_versions_equal, get_datasets_to_update, ) from v03_pipeline.lib.reference_data.config import CONFIG +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( BaseUpdateVariantAnnotationsTableTask, ) @@ -15,6 +20,7 @@ logger = get_logger(__name__) +@luigi.util.inherits(BaseLoadingRunParams) class UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( BaseUpdateVariantAnnotationsTableTask, ): @@ -49,6 +55,17 @@ def complete(self) -> bool: for rdc in self.reference_dataset_collections for dataset in rdc.datasets(self.dataset_type) ] + + if any( + 'clinvar' in d for d in datasets_to_check + ) and not clinvar_versions_equal( + hl.read_table(self.output().path), + self.reference_genome, + self.dataset_type, + ): + datasets_to_check.remove('clinvar') + self._datasets_to_update.add('clinvar') + annotations_ht_globals = Globals.from_ht( hl.read_table(self.output().path), datasets_to_check, diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index b5a5ced2f..5c30630e9 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ 
b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -19,6 +19,7 @@ DatasetType, ReferenceDatasetCollection, ReferenceGenome, + SampleType, ) from v03_pipeline.lib.paths import valid_reference_dataset_collection_path from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS @@ -37,6 +38,8 @@ TEST_INTERVAL_MITO_1 = 'v03_pipeline/var/test/reference_data/test_interval_mito_1.ht' TEST_COMBINED_37 = 'v03_pipeline/var/test/reference_data/test_combined_37.ht' TEST_HGMD_37 = 'v03_pipeline/var/test/reference_data/test_hgmd_37.ht' +TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' +TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' MOCK_CADD_CONFIG = { @@ -58,7 +61,7 @@ } MOCK_CLINVAR_CONFIG = { **CONFIG['clinvar']['38'], - 'source_path': 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', 'custom_import': lambda *_: hl.Table.parallelize( [], hl.tstruct( @@ -483,7 +486,7 @@ 'clinvar_mito': { '38': { **CONFIG['clinvar_mito']['38'], - 'source_path': 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', + 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', 'custom_import': lambda *_: hl.Table.parallelize( [], hl.tstruct( @@ -719,12 +722,17 @@ def setUp(self) -> None: 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', MOCK_CONFIG, ) + @mock.patch( + 'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal', + ) def test_update_vat_with_updated_rdc_snv_indel_38( self, + mock_clinvar_versions_equal, mock_initialize_table, mock_update_crdqs_task, mock_update_rdc_task, ): + mock_clinvar_versions_equal.return_value = True mock_update_rdc_task.return_value = MockCompleteTask() mock_update_crdqs_task.return_value = MockCompleteTask() 
mock_initialize_table.return_value = hl.Table.parallelize( @@ -754,6 +762,13 @@ def test_update_vat_with_updated_rdc_snv_indel_38( task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='3', ) worker = luigi.worker.Worker() worker.add(task) @@ -830,7 +845,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38( hl.Struct( paths=hl.Struct( cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', @@ -929,12 +944,17 @@ def test_update_vat_with_updated_rdc_snv_indel_38( 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', MOCK_CONFIG_MITO, ) + @mock.patch( + 'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal', + ) def test_update_vat_with_updated_rdc_mito_38( self, + mock_clinvar_versions_equal, mock_initialize_table, mock_update_crdqs_task, mock_update_rdc_task, ): + mock_clinvar_versions_equal.return_value = (True,) mock_update_rdc_task.return_value = MockCompleteTask() mock_update_crdqs_task.return_value = MockCompleteTask() mock_initialize_table.return_value = hl.Table.parallelize( @@ -964,6 +984,13 @@ def test_update_vat_with_updated_rdc_mito_38( task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.MITO, + sample_type=SampleType.WGS, + 
callset_path=TEST_MITO_MT, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='1', ) worker = luigi.worker.Worker() worker.add(task) @@ -982,7 +1009,7 @@ def test_update_vat_with_updated_rdc_mito_38( hmtvar='gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht', mitomap='gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht', mitimpact='gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.0.7.ht', - clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', + clinvar_mito='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', @@ -1079,12 +1106,17 @@ def test_update_vat_with_updated_rdc_mito_38( 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', MOCK_CONFIG, ) + @mock.patch( + 'v03_pipeline.lib.tasks.reference_data.update_variant_annotations_table_with_updated_reference_dataset.clinvar_versions_equal', + ) def test_update_vat_with_updated_rdc_snv_indel_37( self, + mock_clinvar_versions_equal, mock_initialize_table, mock_update_crdqs_task, mock_update_rdc_task, ): + mock_clinvar_versions_equal.return_value = True mock_update_rdc_task.return_value = MockCompleteTask() mock_update_crdqs_task.return_value = MockCompleteTask() mock_initialize_table.return_value = hl.Table.parallelize( @@ -1114,6 +1146,13 @@ def test_update_vat_with_updated_rdc_snv_indel_37( task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( reference_genome=ReferenceGenome.GRCh37, dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + 
project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='2', ) worker = luigi.worker.Worker() worker.add(task) @@ -1128,7 +1167,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37( hl.Struct( paths=hl.Struct( cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py index 9aa4a3a74..57d163146 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query.py @@ -12,6 +12,7 @@ ) from v03_pipeline.lib.reference_data.compare_globals import ( Globals, + clinvar_versions_equal, get_datasets_to_update, ) from v03_pipeline.lib.reference_data.config import CONFIG @@ -19,15 +20,16 @@ get_ht_path, import_ht_from_config_path, ) +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, HailTableTask -from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import ( - UpdatedReferenceDatasetCollectionTask, -) logger = get_logger(__name__) +@luigi.util.inherits(BaseLoadingRunParams) class UpdatedCachedReferenceDatasetQuery(BaseWriteTask): crdq = luigi.EnumParameter(enum=CachedReferenceDatasetQuery) @@ -38,14 +40,21 @@ def complete(self) -> bool: ) return False - 
datasets_to_check = [self.crdq.dataset(self.dataset_type)] + dataset = self.crdq.dataset(self.dataset_type) + if 'clinvar' in dataset and not clinvar_versions_equal( + hl.read_table(self.output().path), + self.reference_genome, + self.dataset_type, + ): + return False + crdq_globals = Globals.from_ht( hl.read_table(self.output().path), - datasets_to_check, + [dataset], ) dataset_config_globals = Globals.from_dataset_configs( self.reference_genome, - datasets_to_check, + [dataset], ) return not get_datasets_to_update( crdq_globals, @@ -71,6 +80,16 @@ def requires(self) -> luigi.Task: ], ), ) + # Special nested import to avoid a circular dependency issue + # (ValidateCallset -> this file -> UpdatedReferenceDatasetCollection -> ValidateCallset) + # The specific CRDQ referenced in ValidateCallset will never reach + # this line due to it being a "query_raw_dataset". In theory this + # would be fixed by splitting the CRDQ into raw_dataset and non-raw_dataset + # queries. + from v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection import ( + UpdatedReferenceDatasetCollectionTask, + ) + return UpdatedReferenceDatasetCollectionTask( self.reference_genome, self.dataset_type, diff --git a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py index 210a8cc8a..566337f2e 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_cached_reference_dataset_query_test.py @@ -5,12 +5,14 @@ import hail as hl import luigi +import v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection from v03_pipeline.lib.annotations.enums import CLINVAR_PATHOGENICITIES from v03_pipeline.lib.model import ( CachedReferenceDatasetQuery, DatasetType, ReferenceDatasetCollection, ReferenceGenome, + SampleType, ) from v03_pipeline.lib.paths import ( 
cached_reference_dataset_query_path, @@ -28,6 +30,7 @@ CLINVAR_CRDQ_PATH = ( 'v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht' ) +TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' MOCK_CONFIG = { 'gnomad_qc': { @@ -57,7 +60,7 @@ 'clinvar': { '38': { **CONFIG['clinvar']['38'], - 'source_path': 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + 'source_path': 'https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', 'custom_import': lambda *_: hl.Table.parallelize( [], hl.tstruct( @@ -109,6 +112,13 @@ def test_gnomad_qc( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, crdq=CachedReferenceDatasetQuery.GNOMAD_QC, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='1', ) worker.add(task) worker.run() @@ -143,14 +153,19 @@ def test_gnomad_qc( 'v03_pipeline.lib.reference_data.compare_globals.CONFIG', MOCK_CONFIG, ) - @mock.patch( - 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.UpdatedReferenceDatasetCollectionTask', + @mock.patch.object( + v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection, + 'UpdatedReferenceDatasetCollectionTask', ) @mock.patch( 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.CachedReferenceDatasetQuery.query', ) + @mock.patch( + 'v03_pipeline.lib.tasks.reference_data.updated_cached_reference_dataset_query.clinvar_versions_equal', + ) def test_clinvar( self, + mock_clinvar_versions_equal, mock_crdq_query, mock_updated_rdc_task, ) -> None: @@ -158,6 +173,8 @@ def test_clinvar( Given a crdq task where there exists a clinvar crdq table and a clinvar rdc table, expect task to replace the clinvar crdq table with new version. 
""" + mock_clinvar_versions_equal.return_value = True + # rdc dependency exists mock_updated_rdc_task.return_value = MockCompleteTask() @@ -198,6 +215,13 @@ def _clinvar_path_variants(table, **_: Any): reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, crdq=CachedReferenceDatasetQuery.CLINVAR_PATH_VARIANTS, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='2', ) worker.add(task) worker.run() diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py index 253e2f526..af2144839 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection.py @@ -6,17 +6,23 @@ from v03_pipeline.lib.paths import valid_reference_dataset_collection_path from v03_pipeline.lib.reference_data.compare_globals import ( Globals, + clinvar_versions_equal, get_datasets_to_update, ) from v03_pipeline.lib.reference_data.dataset_table_operations import ( update_or_create_joined_ht, ) +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, +) from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget +from v03_pipeline.lib.tasks.validate_callset import ValidateCallsetTask logger = get_logger(__name__) +@luigi.util.inherits(BaseLoadingRunParams) class UpdatedReferenceDatasetCollectionTask(BaseUpdateTask): reference_dataset_collection = luigi.EnumParameter(enum=ReferenceDatasetCollection) @@ -24,6 +30,17 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._datasets_to_update = [] + def requires(self) -> luigi.Task: + # Though there is no explicit functional dependency between + # validing the callset and 
updating the reference data, it's + # a more user-friendly experience for the callset validation + # to fail/succeed prior to attempting any + # compute intensive work. + # + # Note that, if validation is disabled or skipped the task + # still runs but is a no-op. + return self.clone(ValidateCallsetTask) + def complete(self) -> bool: self._datasets_to_update = [] datasets = self.reference_dataset_collection.datasets(self.dataset_type) @@ -37,6 +54,14 @@ def complete(self) -> bool: ) return False + if any('clinvar' in d for d in datasets) and not clinvar_versions_equal( + hl.read_table(self.output().path), + self.reference_genome, + self.dataset_type, + ): + datasets.remove('clinvar') + self._datasets_to_update.append('clinvar') + joined_ht_globals = Globals.from_ht( hl.read_table(self.output().path), datasets, diff --git a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py index b3fdde4bb..bc19d39d5 100644 --- a/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py +++ b/v03_pipeline/lib/tasks/reference_data/updated_reference_dataset_collection_test.py @@ -10,6 +10,7 @@ DatasetType, ReferenceDatasetCollection, ReferenceGenome, + SampleType, ) from v03_pipeline.lib.paths import valid_reference_dataset_collection_path from v03_pipeline.lib.reference_data.clinvar import CLINVAR_ASSERTIONS @@ -20,6 +21,7 @@ from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase COMBINED_2_PATH = 'v03_pipeline/var/test/reference_data/test_combined_2.ht' +TEST_SNV_INDEL_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' MOCK_PRIMATE_AI_DATASET_HT = hl.Table.parallelize( [ @@ -156,20 +158,32 @@ class UpdatedReferenceDatasetCollectionTaskTest(MockedDatarootTestCase): MOCK_CONFIG, ) @mock.patch.object(ReferenceDatasetCollection, 'datasets') + @mock.patch( +
'v03_pipeline.lib.tasks.reference_data.updated_reference_dataset_collection.clinvar_versions_equal', + ) def test_update_task_with_empty_reference_data_table( self, + mock_clinvar_versions_equal, mock_rdc_datasets, ) -> None: """ Given a new task with no existing reference dataset collection table, expect the task to create a new reference dataset collection table for all datasets in the collection. """ + mock_clinvar_versions_equal.return_value = True mock_rdc_datasets.return_value = ['cadd', 'primate_ai', 'clinvar'] worker = luigi.worker.Worker() task = UpdatedReferenceDatasetCollectionTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, reference_dataset_collection=ReferenceDatasetCollection.COMBINED, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='2', ) worker.add(task) worker.run() @@ -279,6 +293,13 @@ def test_update_task_with_existing_reference_dataset_collection_table( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, reference_dataset_collection=ReferenceDatasetCollection.COMBINED, + sample_type=SampleType.WGS, + callset_path=TEST_SNV_INDEL_VCF, + project_guids=[], + project_remap_paths=[], + project_pedigree_paths=[], + skip_validation=True, + run_id='2', ) worker.add(task) worker.run() diff --git a/v03_pipeline/lib/tasks/trigger_hail_backend_reload.py b/v03_pipeline/lib/tasks/trigger_hail_backend_reload.py index 427ba23fd..f4e8d36fb 100644 --- a/v03_pipeline/lib/tasks/trigger_hail_backend_reload.py +++ b/v03_pipeline/lib/tasks/trigger_hail_backend_reload.py @@ -4,15 +4,15 @@ from v03_pipeline.lib.logger import get_logger from v03_pipeline.lib.model import Env -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from 
v03_pipeline.lib.tasks.write_success_file import WriteSuccessFileTask logger = get_logger(__name__) -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class TriggerHailBackendReload(luigi.Task): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py index 1dd0b746a..fad438f57 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table.py +++ b/v03_pipeline/lib/tasks/update_lookup_table.py @@ -10,8 +10,8 @@ ) from v03_pipeline.lib.model.constants import PROJECTS_EXCLUDED_FROM_LOOKUP from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.base.base_update_lookup_table import ( BaseUpdateLookupTableTask, @@ -21,7 +21,7 @@ ) -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class UpdateLookupTableTask(BaseUpdateLookupTableTask): def complete(self) -> bool: return super().complete() and hl.eval( diff --git a/v03_pipeline/lib/tasks/update_project_table.py b/v03_pipeline/lib/tasks/update_project_table.py index cd582009f..6c723ffde 100644 --- a/v03_pipeline/lib/tasks/update_project_table.py +++ b/v03_pipeline/lib/tasks/update_project_table.py @@ -9,19 +9,30 @@ remove_family_guids, ) from v03_pipeline.lib.misc.io import remap_pedigree_hash +from v03_pipeline.lib.paths import project_table_path from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams -from v03_pipeline.lib.tasks.base.base_update_project_table import ( - BaseUpdateProjectTableTask, +from v03_pipeline.lib.tasks.base.base_update import ( + BaseUpdateTask, ) +from v03_pipeline.lib.tasks.files import GCSorLocalTarget from 
v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( WriteRemappedAndSubsettedCallsetTask, ) @luigi.util.inherits(BaseLoadingRunParams) -class UpdateProjectTableTask(BaseUpdateProjectTableTask): - project_remap_path = luigi.Parameter() - project_pedigree_path = luigi.Parameter() +class UpdateProjectTableTask(BaseUpdateTask): + project_i = luigi.IntParameter() + + def output(self) -> luigi.Target: + return GCSorLocalTarget( + project_table_path( + self.reference_genome, + self.dataset_type, + self.sample_type, + self.project_guids[self.project_i], + ), + ) def complete(self) -> bool: return super().complete() and hl.eval( @@ -29,8 +40,8 @@ def complete(self) -> bool: hl.Struct( callset=self.callset_path, remap_pedigree_hash=remap_pedigree_hash( - self.project_remap_path, - self.project_pedigree_path, + self.project_remap_paths[self.project_i], + self.project_pedigree_paths[self.project_i], ), ), ), @@ -39,6 +50,26 @@ def complete(self) -> bool: def requires(self) -> luigi.Task: return self.clone(WriteRemappedAndSubsettedCallsetTask) + def initialize_table(self) -> hl.Table: + key_type = self.dataset_type.table_key_type(self.reference_genome) + return hl.Table.parallelize( + [], + hl.tstruct( + **key_type, + filters=hl.tset(hl.tstr), + # NB: entries is missing here because it is untyped + # until we read the type off of the first callset aggregation. 
+ ), + key=key_type.fields, + globals=hl.Struct( + family_guids=hl.empty_array(hl.tstr), + family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), + updates=hl.empty_set( + hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), + ), + ), + ) + def update_table(self, ht: hl.Table) -> hl.Table: callset_mt = hl.read_matrix_table(self.input().path) callset_ht = compute_callset_family_entries_ht( @@ -69,8 +100,8 @@ def update_table(self, ht: hl.Table) -> hl.Table: hl.Struct( callset=self.callset_path, remap_pedigree_hash=remap_pedigree_hash( - self.project_remap_path, - self.project_pedigree_path, + self.project_remap_paths[self.project_i], + self.project_pedigree_paths[self.project_i], ), ), ), diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index 7e6ab67f9..0daad72e0 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -25,9 +25,10 @@ def test_update_project_table_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_3, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + project_i=0, skip_validation=True, ) worker.add(upt_task) @@ -134,9 +135,10 @@ def test_update_project_table_task_different_pedigree(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_3, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + project_i=0, skip_validation=True, ) worker.add(upt_task) @@ -147,9 +149,10 @@ def test_update_project_table_task_different_pedigree(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, 
callset_path=TEST_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_3_DIFFERENT_FAMILIES, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3_DIFFERENT_FAMILIES], + project_i=0, skip_validation=True, ) worker.add(upt_task) diff --git a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py index 90f1937dc..56277f34b 100644 --- a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py +++ b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families.py @@ -2,14 +2,27 @@ import luigi from v03_pipeline.lib.misc.family_entries import remove_family_guids -from v03_pipeline.lib.tasks.base.base_update_project_table import ( - BaseUpdateProjectTableTask, -) +from v03_pipeline.lib.model import SampleType +from v03_pipeline.lib.paths import project_table_path +from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask +from v03_pipeline.lib.tasks.files import GCSorLocalTarget -class UpdateProjectTableWithDeletedFamiliesTask(BaseUpdateProjectTableTask): +class UpdateProjectTableWithDeletedFamiliesTask(BaseUpdateTask): + sample_type = luigi.EnumParameter(enum=SampleType) + project_guid = luigi.Parameter() family_guids = luigi.ListParameter() + def output(self) -> luigi.Target: + return GCSorLocalTarget( + project_table_path( + self.reference_genome, + self.dataset_type, + self.sample_type, + self.project_guid, + ), + ) + def complete(self) -> bool: return super().complete() and hl.eval( hl.bind( @@ -26,6 +39,26 @@ def complete(self) -> bool: ), ) + def initialize_table(self) -> hl.Table: + key_type = self.dataset_type.table_key_type(self.reference_genome) + return hl.Table.parallelize( + [], + hl.tstruct( + **key_type, + filters=hl.tset(hl.tstr), + # NB: entries is missing here because it is untyped + # until we read the type off of the 
first callset aggregation. + ), + key=key_type.fields, + globals=hl.Struct( + family_guids=hl.empty_array(hl.tstr), + family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), + updates=hl.empty_set( + hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), + ), + ), + ) + def update_table(self, ht: hl.Table) -> hl.Table: return remove_family_guids( ht, diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py index 96ded8491..739247770 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py @@ -9,8 +9,8 @@ lookup_table_path, new_variants_table_path, ) -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( BaseUpdateVariantAnnotationsTableTask, @@ -18,7 +18,7 @@ from v03_pipeline.lib.tasks.write_new_variants_table import WriteNewVariantsTableTask -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class UpdateVariantAnnotationsTableWithNewSamplesTask( BaseUpdateVariantAnnotationsTableTask, ): @@ -74,6 +74,12 @@ def update_table(self, ht: hl.Table) -> hl.Table: # and either present or not present in the existing annotations table. 
callset_variants_ht = ht.semi_join(callset_ht) ht = ht.anti_join(callset_ht) + lookup_ht = hl.read_table( + lookup_table_path( + self.reference_genome, + self.dataset_type, + ), + ) callset_variants_ht = callset_variants_ht.annotate( **get_fields( callset_variants_ht, @@ -89,6 +95,11 @@ def update_table(self, ht: hl.Table) -> hl.Table: ) ht = ht.union(callset_variants_ht, unify=True) + # Variants may have fallen out of the callset and + # have been removed from the lookup table during modification. + # Ensure we don't proceed with those variants. + ht = ht.semi_join(lookup_ht) + # Fix up the globals and mark the table as updated with these callset/project pairs. ht = self.annotate_globals(ht) return ht.annotate_globals( diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 0bc94f473..b5290e88c 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -585,7 +585,7 @@ def test_multiple_update_vat( }, paths=hl.Struct( cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', @@ -724,7 +724,7 @@ def test_update_vat_grch37( [ hl.Struct( cadd='gs://seqr-reference-data/GRCh37/CADD/CADD_snvs_and_indels.v1.6.ht', - clinvar='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', + clinvar='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz', 
dbnsfp='gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.ht', eigen='gs://seqr-reference-data/GRCh37/eigen/EIGEN_coding_noncoding.grch37.ht', exac='gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vep.ht', @@ -965,7 +965,7 @@ def test_mito_update_vat( hl.Struct( paths=hl.Struct( high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', - clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', + clinvar_mito='https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', gnomad_mito='gs://gcp-public-data--gnomad/release/3.1/ht/genomes/gnomad.genomes.v3.1.sites.chrM.ht', helix_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix/HelixMTdb_20200327.ht', diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py index f00e5f125..991412824 100644 --- a/v03_pipeline/lib/tasks/validate_callset_test.py +++ b/v03_pipeline/lib/tasks/validate_callset_test.py @@ -62,6 +62,7 @@ def test_validate_callset_multiple_exceptions( # a NON_REF allele type at position chr1: 902024, missing # all contigs but chr1, and contains non-coding variants. callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF, + project_guids=['project_a'], skip_validation=False, run_id=TEST_RUN_ID, ) @@ -74,6 +75,7 @@ def test_validate_callset_multiple_exceptions( dataset_type=DatasetType.SNV_INDEL, sample_type=SampleType.WES, callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF, + project_guids=['project_a'], skip_validation=False, run_id=TEST_RUN_ID, ) @@ -82,6 +84,7 @@ def test_validate_callset_multiple_exceptions( self.assertDictEqual( json.load(f), { + 'project_guids': ['project_a'], 'error_messages': [ 'Alleles with invalid allele are present in the callset. 
This appears to be a GVCF containing records for sites with no variants.', "Variants are present multiple times in the callset: ['1-902088-G-A']", diff --git a/v03_pipeline/lib/tasks/write_family_table.py b/v03_pipeline/lib/tasks/write_family_table.py index 42715aff9..9ffbc5482 100644 --- a/v03_pipeline/lib/tasks/write_family_table.py +++ b/v03_pipeline/lib/tasks/write_family_table.py @@ -13,9 +13,7 @@ @luigi.util.inherits(BaseLoadingRunParams) class WriteFamilyTableTask(BaseWriteTask): - project_guid = luigi.Parameter() - project_remap_path = luigi.Parameter() - project_pedigree_path = luigi.Parameter() + project_i = luigi.IntParameter() family_guid = luigi.Parameter() def output(self) -> luigi.Target: diff --git a/v03_pipeline/lib/tasks/write_family_table_test.py b/v03_pipeline/lib/tasks/write_family_table_test.py index 5c6995146..60d6f0e41 100644 --- a/v03_pipeline/lib/tasks/write_family_table_test.py +++ b/v03_pipeline/lib/tasks/write_family_table_test.py @@ -24,9 +24,10 @@ def test_snv_write_family_table_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_3, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + project_i=0, family_guid='abc_1', skip_validation=True, ) @@ -162,9 +163,10 @@ def test_sv_write_family_table_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_SV_VCF, - project_guid='R0115_test_project2', - project_remap_path='not_a_real_file', - project_pedigree_path=TEST_PEDIGREE_5, + project_guids=['R0115_test_project2'], + project_remap_paths=['not_a_real_file'], + project_pedigree_paths=[TEST_PEDIGREE_5], + project_i=0, family_guid='family_2_1', skip_validation=True, ) @@ -415,9 +417,10 @@ def test_gcnv_write_family_table_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WES, 
callset_path=TEST_GCNV_BED_FILE, - project_guid='R0115_test_project2', - project_remap_path='not_a_real_file', - project_pedigree_path=TEST_PEDIGREE_5, + project_guids=['R0115_test_project2'], + project_remap_paths=['not_a_real_file'], + project_pedigree_paths=[TEST_PEDIGREE_5], + project_i=0, family_guid='family_2_1', skip_validation=True, ) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run.py b/v03_pipeline/lib/tasks/write_metadata_for_run.py index 3432dd891..cc012a926 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run.py @@ -4,9 +4,12 @@ import luigi import luigi.util -from v03_pipeline.lib.paths import metadata_for_run_path -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.paths import ( + metadata_for_run_path, + relatedness_check_tsv_path, +) +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.files import GCSorLocalTarget from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( @@ -14,7 +17,7 @@ ) -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class WriteMetadataForRunTask(luigi.Task): def output(self) -> luigi.Target: return GCSorLocalTarget( @@ -29,16 +32,9 @@ def requires(self) -> list[luigi.Task]: return [ self.clone( WriteRemappedAndSubsettedCallsetTask, - project_guid=project_guid, - project_remap_path=project_remap_path, - project_pedigree_path=project_pedigree_path, - ) - for (project_guid, project_remap_path, project_pedigree_path) in zip( - self.project_guids, - self.project_remap_paths, - self.project_pedigree_paths, - strict=True, + project_i=i, ) + for i in range(len(self.project_guids)) ] def run(self) -> None: @@ -46,12 +42,18 @@ def run(self) -> None: 'callsets': [self.callset_path], 'run_id': self.run_id, 'sample_type': self.sample_type.value, + 
'project_guids': self.project_guids, 'family_samples': {}, 'failed_family_samples': { 'missing_samples': {}, 'relatedness_check': {}, 'sex_check': {}, }, + 'relatedness_check_file_path': relatedness_check_tsv_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), } for remapped_and_subsetted_callset in self.input(): callset_mt = hl.read_matrix_table(remapped_and_subsetted_callset.path) diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index f5d733a79..dc007fbcb 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -3,6 +3,7 @@ import luigi.worker from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.paths import relatedness_check_tsv_path from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase @@ -37,6 +38,7 @@ def test_write_metadata_for_run_task(self) -> None: json.load(f), { 'callsets': [TEST_VCF], + 'project_guids': ['R0113_test_project', 'R0114_project4'], 'failed_family_samples': { 'missing_samples': { 'efg_1': { @@ -70,5 +72,10 @@ def test_write_metadata_for_run_task(self) -> None: }, 'run_id': 'run_123456', 'sample_type': SampleType.WGS.value, + 'relatedness_check_file_path': relatedness_check_tsv_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_VCF, + ), }, ) diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index f8b1c570b..a312084b4 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -24,8 +24,8 @@ load_gencode_ensembl_to_refseq_id, load_gencode_gene_symbol_to_gene_id, ) -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from 
v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget @@ -45,7 +45,7 @@ GENCODE_FOR_VEP_RELEASE = 44 -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class WriteNewVariantsTableTask(BaseWriteTask): @property def annotation_dependencies(self) -> dict[str, hl.Table]: @@ -58,7 +58,7 @@ def annotation_dependencies(self) -> dict[str, hl.Table]: ) if self.dataset_type.has_gencode_gene_symbol_to_gene_id_mapping: deps['gencode_gene_symbol_to_gene_id_mapping'] = hl.literal( - load_gencode_gene_symbol_to_gene_id(GENCODE_RELEASE, ''), + load_gencode_gene_symbol_to_gene_id(GENCODE_RELEASE), ) deps[ 'grch37_to_grch38_liftover_ref_path' @@ -79,10 +79,7 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [ - UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( - self.reference_genome, - self.dataset_type, - ), + self.clone(UpdateVariantAnnotationsTableWithUpdatedReferenceDataset), ] if self.dataset_type.has_lookup_table: # NB: the lookup table task has remapped and subsetted callset tasks as dependencies. 
diff --git a/v03_pipeline/lib/tasks/write_project_family_tables.py b/v03_pipeline/lib/tasks/write_project_family_tables.py index f9b7df74f..7085a3aa1 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables.py @@ -2,19 +2,18 @@ import luigi import luigi.util -from v03_pipeline.lib.misc.io import import_pedigree -from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families +from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams -from v03_pipeline.lib.tasks.files import RawFileTask from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask from v03_pipeline.lib.tasks.write_family_table import WriteFamilyTableTask +from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( + WriteRemappedAndSubsettedCallsetTask, +) @luigi.util.inherits(BaseLoadingRunParams) class WriteProjectFamilyTablesTask(luigi.Task): - project_guid = luigi.Parameter() - project_remap_path = luigi.Parameter() - project_pedigree_path = luigi.Parameter() + project_i = luigi.IntParameter() def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -26,27 +25,26 @@ def complete(self) -> bool: for write_family_table_task in self.dynamic_write_family_table_tasks ) - def run(self): - # https://luigi.readthedocs.io/en/stable/tasks.html#dynamic-dependencies - # Fetch family guids from project table - update_project_table_task: luigi.Target = yield self.clone( - UpdateProjectTableTask, - ) - project_ht = hl.read_table(update_project_table_task.path) - family_guids_in_project_table = set(hl.eval(project_ht.globals.family_guids)) + def requires(self) -> list[luigi.Task]: + return [ + self.clone( + WriteRemappedAndSubsettedCallsetTask, + ), + self.clone( + UpdateProjectTableTask, + ), + ] - # Fetch family guids from pedigree - pedigree_ht_task: luigi.Target = yield 
RawFileTask(self.project_pedigree_path) - pedigree_ht = import_pedigree(pedigree_ht_task.path) - families_guids_in_pedigree = { - f.family_guid for f in parse_pedigree_ht_to_families(pedigree_ht) - } - - # Intersect them - family_guids_to_load = ( - family_guids_in_project_table & families_guids_in_pedigree + def run(self): + ht = hl.read_matrix_table( + remapped_and_subsetted_callset_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + self.project_guids[self.project_i], + ), ) - for family_guid in family_guids_to_load: + for family_guid in set(hl.eval(ht.globals.family_samples).keys()): self.dynamic_write_family_table_tasks.add( self.clone(WriteFamilyTableTask, family_guid=family_guid), ) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index 3d23e9b60..dd535f988 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -2,7 +2,10 @@ import luigi.worker from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType -from v03_pipeline.lib.paths import project_table_path +from v03_pipeline.lib.paths import ( + project_table_path, + remapped_and_subsetted_callset_path, +) from v03_pipeline.lib.tasks.write_project_family_tables import ( WriteProjectFamilyTablesTask, ) @@ -25,9 +28,10 @@ def test_snv_write_project_family_tables_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_4, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_4], + project_i=0, skip_validation=True, skip_check_sex_and_relatedness=True, ) @@ -38,6 +42,33 @@ def test_snv_write_project_family_tables_task(self) -> None: hl.read_table(write_family_table_task.output().path) for 
write_family_table_task in write_project_family_tables.dynamic_write_family_table_tasks ] + # Validate remapped and subsetted callset families + remapped_and_subsetted_callset = hl.read_matrix_table( + remapped_and_subsetted_callset_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_SNV_INDEL_VCF, + 'R0113_test_project', + ), + ) + self.assertCountEqual( + hl.eval(remapped_and_subsetted_callset.globals.family_samples.keys()), + { + '123_1', + '234_1', + '345_1', + '456_1', + '567_1', + '678_1', + '789_1', + '890_1', + '901_1', + 'bcd_1', + 'cde_1', + 'def_1', + 'efg_1', + }, + ) self.assertCountEqual( [ht.globals.sample_ids.collect() for ht in hts], [ @@ -63,9 +94,10 @@ def test_snv_write_project_family_tables_task(self) -> None: run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_4_SUBSET, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_4_SUBSET], + project_i=0, skip_validation=True, skip_check_sex_and_relatedness=True, ) @@ -73,13 +105,39 @@ def test_snv_write_project_family_tables_task(self) -> None: worker.run() self.assertTrue(write_project_family_tables_subset.complete()) hts = [ - hl.read_table(write_family_table_task.output().path) + write_family_table_task.output().path for write_family_table_task in write_project_family_tables_subset.dynamic_write_family_table_tasks ] - # Only one family table written - self.assertEqual( - len(hts), - 1, + self.assertTrue(len(hts)) + self.assertTrue( + '123_1' in hts[0], + ) + # Validate remapped and subsetted callset families + # (and that it was re-written) + remapped_and_subsetted_callset = hl.read_matrix_table( + remapped_and_subsetted_callset_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_SNV_INDEL_VCF, + 'R0113_test_project', + ), + ) + self.assertCountEqual( + 
hl.eval(remapped_and_subsetted_callset.globals.family_samples.keys()), + {'123_1'}, + ) + self.assertCountEqual( + hl.eval(remapped_and_subsetted_callset.globals.failed_family_samples), + hl.Struct( + missing_samples={ + '234_1': { + 'reasons': ["Missing samples: {'NA19678_999'}"], + 'samples': ['NA19678_1', 'NA19678_999'], + }, + }, + relatedness_check={}, + sex_check={}, + ), ) # Project table still contains all family guids self.assertCountEqual( diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_tsv.py b/v03_pipeline/lib/tasks/write_relatedness_check_tsv.py new file mode 100644 index 000000000..bfe303a4a --- /dev/null +++ b/v03_pipeline/lib/tasks/write_relatedness_check_tsv.py @@ -0,0 +1,29 @@ +import hail as hl +import luigi +import luigi.util + +from v03_pipeline.lib.paths import relatedness_check_tsv_path +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams +from v03_pipeline.lib.tasks.files import GCSorLocalTarget +from v03_pipeline.lib.tasks.write_relatedness_check_table import ( + WriteRelatednessCheckTableTask, +) + + +@luigi.util.inherits(BaseLoadingRunParams) +class WriteRelatednessCheckTsvTask(luigi.Task): + def output(self) -> luigi.Target: + return GCSorLocalTarget( + relatedness_check_tsv_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), + ) + + def requires(self): + return [self.clone(WriteRelatednessCheckTableTask)] + + def run(self): + ht = hl.read_table(self.input()[0].path) + ht.export(self.output().path) diff --git a/v03_pipeline/lib/tasks/write_relatedness_check_tsv_test.py b/v03_pipeline/lib/tasks/write_relatedness_check_tsv_test.py new file mode 100644 index 000000000..49f174340 --- /dev/null +++ b/v03_pipeline/lib/tasks/write_relatedness_check_tsv_test.py @@ -0,0 +1,53 @@ +import shutil + +import luigi.worker + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.paths import relatedness_check_table_path +from 
v03_pipeline.lib.tasks.write_relatedness_check_tsv import ( + WriteRelatednessCheckTsvTask, +) +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + +TEST_RELATEDNESS_CHECK_1 = ( + 'v03_pipeline/var/test/relatedness_check/test_relatedness_check_1.ht' +) +TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' +TEST_RUN_ID = 'manual__2024-04-03' + + +class WriteRelatednessCheckTsvTaskTest(MockedDatarootTestCase): + def setUp(self) -> None: + super().setUp() + shutil.copytree( + TEST_RELATEDNESS_CHECK_1, + relatedness_check_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + TEST_VCF, + ), + ) + + def test_write_relatedness_check_tsv_task( + self, + ) -> None: + worker = luigi.worker.Worker() + task = WriteRelatednessCheckTsvTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + callset_path=TEST_VCF, + run_id=TEST_RUN_ID, + sample_type=SampleType.WES, + ) + worker.add(task) + worker.run() + self.assertTrue(task.complete()) + with task.output().open('r') as f: + lines = f.readlines() + expected_lines = [ + 'i\tj\tibd0\tibd1\tibd2\tpi_hat\n', + 'HG00731_1\tHG00733_1\t0\t1\t0\t5.0000e-01\n', + 'HG00732_1\tHG00733_1\t0\t1\t0\t5.0000e-01\n', + ] + for expected_line, actual_line in zip(expected_lines, lines, strict=False): + self.assertEqual(expected_line, actual_line) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index e3e0a0e4f..f4c934662 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -17,13 +17,16 @@ from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples from v03_pipeline.lib.model.environment import Env -from v03_pipeline.lib.paths import remapped_and_subsetted_callset_path +from 
v03_pipeline.lib.paths import ( + relatedness_check_table_path, + remapped_and_subsetted_callset_path, +) from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_write import BaseWriteTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget, RawFileTask from v03_pipeline.lib.tasks.validate_callset import ValidateCallsetTask -from v03_pipeline.lib.tasks.write_relatedness_check_table import ( - WriteRelatednessCheckTableTask, +from v03_pipeline.lib.tasks.write_relatedness_check_tsv import ( + WriteRelatednessCheckTsvTask, ) from v03_pipeline.lib.tasks.write_sex_check_table import WriteSexCheckTableTask @@ -32,16 +35,14 @@ @luigi.util.inherits(BaseLoadingRunParams) class WriteRemappedAndSubsettedCallsetTask(BaseWriteTask): - project_guid = luigi.Parameter() - project_remap_path = luigi.Parameter() - project_pedigree_path = luigi.Parameter() + project_i = luigi.IntParameter() def complete(self) -> luigi.Target: return super().complete() and hl.eval( hl.read_matrix_table(self.output().path).globals.remap_pedigree_hash == remap_pedigree_hash( - self.project_remap_path, - self.project_pedigree_path, + self.project_remap_paths[self.project_i], + self.project_pedigree_paths[self.project_i], ), ) @@ -51,14 +52,14 @@ def output(self) -> luigi.Target: self.reference_genome, self.dataset_type, self.callset_path, - self.project_guid, + self.project_guids[self.project_i], ), ) def requires(self) -> list[luigi.Task]: requirements = [ self.clone(ValidateCallsetTask), - RawFileTask(self.project_pedigree_path), + RawFileTask(self.project_pedigree_paths[self.project_i]), ] if ( Env.CHECK_SEX_AND_RELATEDNESS @@ -67,7 +68,7 @@ def requires(self) -> list[luigi.Task]: ): requirements = [ *requirements, - self.clone(WriteRelatednessCheckTableTask), + self.clone(WriteRelatednessCheckTsvTask), self.clone(WriteSexCheckTableTask), ] return requirements @@ -78,8 +79,8 @@ def create_table(self) -> hl.MatrixTable: # Remap, 
but only if the remap file is present! remap_lookup = hl.empty_dict(hl.tstr, hl.tstr) - if does_file_exist(self.project_remap_path): - project_remap_ht = import_remap(self.project_remap_path) + if does_file_exist(self.project_remap_paths[self.project_i]): + project_remap_ht = import_remap(self.project_remap_paths[self.project_i]) callset_mt = remap_sample_ids( callset_mt, project_remap_ht, @@ -101,7 +102,13 @@ def create_table(self) -> hl.MatrixTable: and self.dataset_type.check_sex_and_relatedness and not self.skip_check_sex_and_relatedness ): - relatedness_check_ht = hl.read_table(self.input()[2].path) + relatedness_check_ht = hl.read_table( + relatedness_check_table_path( + self.reference_genome, + self.dataset_type, + self.callset_path, + ), + ) sex_check_ht = hl.read_table(self.input()[3].path) families_failed_relatedness_check = get_families_failed_relatedness_check( families - families_failed_missing_samples.keys(), @@ -153,8 +160,8 @@ def create_table(self) -> hl.MatrixTable: mt = mt.drop(field) return mt.select_globals( remap_pedigree_hash=remap_pedigree_hash( - self.project_remap_path, - self.project_pedigree_path, + self.project_remap_paths[self.project_i], + self.project_pedigree_paths[self.project_i], ), family_samples=( { diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 1ed7550a6..4a0c84660 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -84,9 +84,10 @@ def test_write_remapped_and_subsetted_callset_task( run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_VCF, - project_guid='R0113_test_project', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_3, + project_guids=['R0113_test_project'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_3], + project_i=0, skip_validation=True, 
) worker.add(wrsc_task) @@ -127,9 +128,10 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( run_id=TEST_RUN_ID, sample_type=SampleType.WGS, callset_path=TEST_VCF, - project_guid='R0114_project4', - project_remap_path=TEST_REMAP, - project_pedigree_path=TEST_PEDIGREE_4, + project_guids=['R0114_project4'], + project_remap_paths=[TEST_REMAP], + project_pedigree_paths=[TEST_PEDIGREE_4], + project_i=0, skip_validation=True, ) worker.add(wrsc_task) diff --git a/v03_pipeline/lib/tasks/write_success_file.py b/v03_pipeline/lib/tasks/write_success_file.py index 3576a8d33..3dc471063 100644 --- a/v03_pipeline/lib/tasks/write_success_file.py +++ b/v03_pipeline/lib/tasks/write_success_file.py @@ -3,8 +3,8 @@ from v03_pipeline.lib.paths import pipeline_run_success_file_path from v03_pipeline.lib.tasks import WriteProjectFamilyTablesTask -from v03_pipeline.lib.tasks.base.base_project_info_params import ( - BaseLoadingRunWithProjectInfoParams, +from v03_pipeline.lib.tasks.base.base_loading_run_params import ( + BaseLoadingRunParams, ) from v03_pipeline.lib.tasks.files import GCSorLocalTarget from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import ( @@ -12,7 +12,7 @@ ) -@luigi.util.inherits(BaseLoadingRunWithProjectInfoParams) +@luigi.util.inherits(BaseLoadingRunParams) class WriteSuccessFileTask(luigi.Task): def output(self) -> luigi.Target: return GCSorLocalTarget( @@ -32,9 +32,7 @@ def requires(self): *[ self.clone( WriteProjectFamilyTablesTask, - project_guid=self.project_guids[i], - project_remap_path=self.project_remap_paths[i], - project_pedigree_path=self.project_pedigree_paths[i], + project_i=i, ) for i in range(len(self.project_guids)) ], diff --git a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py index eaefb0e8c..9149f6158 100644 --- a/v03_pipeline/lib/tasks/write_validation_errors_for_run.py +++ 
b/v03_pipeline/lib/tasks/write_validation_errors_for_run.py @@ -10,6 +10,7 @@ @luigi.util.inherits(BaseLoadingRunParams) class WriteValidationErrorsForRunTask(luigi.Task): + project_guids = luigi.ListParameter() error_messages = luigi.ListParameter(default=[]) def to_single_error_message(self) -> str: @@ -30,6 +31,7 @@ def output(self) -> luigi.Target: def run(self) -> None: validation_errors_json = { + 'project_guids': self.project_guids, 'error_messages': self.error_messages, } with self.output().open('w') as f: diff --git a/v03_pipeline/var/test/pedigrees/test_pedigree_4_subset.tsv b/v03_pipeline/var/test/pedigrees/test_pedigree_4_subset.tsv index 63e2addd8..dc022f159 100644 --- a/v03_pipeline/var/test/pedigrees/test_pedigree_4_subset.tsv +++ b/v03_pipeline/var/test/pedigrees/test_pedigree_4_subset.tsv @@ -1,2 +1,4 @@ Project_GUID Family_GUID Family_ID Individual_ID Paternal_ID Maternal_ID Sex R0114_project4 123_1 123 NA19675_1 F +R0114_project4 234_1 234 NA19678_1 M +R0114_project4 234_1 234 NA19678_999 F diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.README.txt.crc index 22d3757fc..add5a1942 100644 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.README.txt.crc and b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.metadata.json.gz.crc index 800ce4b09..3a7d8101c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/.metadata.json.gz.crc differ diff --git 
a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt index 7ed0c7ae4..9aea8fa4b 100644 --- a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.120-f00f916faf78 - Created at 2024/03/15 15:45:48 \ No newline at end of file + Written with version 0.2.133-4c60fddb171a + Created at 2024/11/02 13:12:12 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/.part-0.crc index 7e41d08ed..c96ad70c9 100644 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/.part-0.crc and b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/part-0 index 67904bd84..bb1d53943 100644 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/part-0 and b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.index.crc similarity index 100% rename from 
v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/.index.crc rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/index b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/index rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.idx/metadata.json.gz 
rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/index/part-0-9e75273d-7113-40e4-a327-453f3451dc8c.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/metadata.json.gz index 213fc997c..5aed747bc 100644 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/.metadata.json.gz.crc index d581eec45..682fea6e7 100644 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/metadata.json.gz index 0b6e1772c..d37774da9 100644 Binary files a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.crc b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-9e75273d-7113-40e4-a327-453f3451dc8c.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a.crc rename to 
v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/.part-0-9e75273d-7113-40e4-a327-453f3451dc8c.crc diff --git a/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a b/v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-9e75273d-7113-40e4-a327-453f3451dc8c similarity index 100% rename from v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-d5c50933-cc72-4716-b930-1e885aa0ba7a rename to v03_pipeline/var/test/reference_data/test_clinvar_path_variants_crdq.ht/rows/parts/part-0-9e75273d-7113-40e4-a327-453f3451dc8c diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.README.txt.crc new file mode 100644 index 000000000..e175e8da4 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/._SUCCESS.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/._SUCCESS.crc new file mode 100644 index 000000000..3b7b04493 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/._SUCCESS.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.metadata.json.gz.crc new file mode 100644 index 000000000..5def68f7f Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/README.txt new file mode 100644 index 000000000..1b764aef2 --- /dev/null +++ b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/README.txt @@ -0,0 +1,3 @@ +This 
folder comprises a Hail (www.hail.is) native Table or MatrixTable. + Written with version 0.2.133-4c60fddb171a + Created at 2024/11/02 13:13:26 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/_SUCCESS b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/_SUCCESS new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/.metadata.json.gz.crc new file mode 100644 index 000000000..92c2ee4f3 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/metadata.json.gz new file mode 100644 index 000000000..26e678a01 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/.part-0.crc new file mode 100644 index 000000000..66c495184 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/part-0 new file mode 100644 index 000000000..31232639d Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.index.crc 
b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.index.crc rename to v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/index b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/index rename to v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.idx/metadata.json.gz rename to 
v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/index/part-0-3569201c-d630-43c4-9056-cbace806fe8d.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/metadata.json.gz new file mode 100644 index 000000000..351b9c8a1 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/.metadata.json.gz.crc new file mode 100644 index 000000000..edeb97082 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/metadata.json.gz new file mode 100644 index 000000000..8ab2a9563 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/.part-0-3569201c-d630-43c4-9056-cbace806fe8d.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad.crc rename to v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/.part-0-3569201c-d630-43c4-9056-cbace806fe8d.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad b/v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/part-0-3569201c-d630-43c4-9056-cbace806fe8d similarity index 100% rename from 
v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-6a5a9d6a-4ded-424b-9735-922a5346e7ad rename to v03_pipeline/var/test/reference_data/test_combined_1.ht.ht/rows/parts/part-0-3569201c-d630-43c4-9056-cbace806fe8d diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc index 1c47b9a3c..2796480e9 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc index db7a7824c..5def68f7f 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt index e46de4296..9b284affa 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_1.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
- Written with version 0.2.130-bea04d9c79b5 - Created at 2024/05/20 13:48:16 \ No newline at end of file + Written with version 0.2.133-4c60fddb171a + Created at 2024/11/02 15:22:20 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/.metadata.json.gz.crc index b47637bf1..92c2ee4f3 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/metadata.json.gz index 534d126c7..26e678a01 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/.part-0.crc index 808712b8f..66c495184 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/.part-0.crc and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/part-0 index ef03f366e..31232639d 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/part-0 and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.index.crc 
b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.index.crc new file mode 100644 index 000000000..7cb9c5aaf Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.index.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.metadata.json.gz.crc new file mode 100644 index 000000000..9af5fa925 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/index b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/index new file mode 100644 index 000000000..a979d82bf Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/index differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/metadata.json.gz new file mode 100644 index 000000000..051d3e03d Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/index/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.idx/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz index d00565756..351b9c8a1 100644 Binary files 
a/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc index ddb5e7f25..e7c96acca 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz index 19968eb85..d2c7ccb1c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.crc b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.crc new file mode 100644 index 000000000..dd555f553 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/.part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2 b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2 new file mode 100644 index 000000000..446fb5491 Binary files /dev/null and b/v03_pipeline/var/test/reference_data/test_combined_1.ht/rows/parts/part-0-1d126232-414b-4ffa-aa43-9ed52895fbf2 differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc 
index 1b96b5393..394adb99d 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.README.txt.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc index 82e0d4035..6b72fb1f0 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt index e38d73d71..f5927612a 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_37.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. 
- Written with version 0.2.130-bea04d9c79b5 - Created at 2024/05/20 15:38:26 \ No newline at end of file + Written with version 0.2.133-4c60fddb171a + Created at 2024/11/02 13:18:45 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/.part-0.crc index f3ed5e11b..3181e5991 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/.part-0.crc and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/part-0 index 259a7345f..92eda86fb 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/part-0 and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/globals/parts/part-0 differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.index.crc rename to v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/.metadata.json.gz.crc similarity index 100% rename from 
v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/index b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/index rename to v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/metadata.json.gz similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_data/test_combined_37.ht/index/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz index 00685b5ad..91f89d511 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc index 8474f90b1..580630336 100644 Binary files 
a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz index b83d7239a..c22d07b9f 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.crc b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff.crc rename to v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/.part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff b/v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2 similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-ac85fcb0-1e7c-453f-be81-9cd356dc49ff rename to v03_pipeline/var/test/reference_data/test_combined_37.ht/rows/parts/part-0-6353b1d7-bc23-4f3a-9fa2-dd9321ab97a2 diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc index e08d4d12b..b76813439 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc differ diff 
--git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc index d328f484c..02e51be97 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt index 704275b10..f7cb50ea3 100644 --- a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt +++ b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/README.txt @@ -1,3 +1,3 @@ This folder comprises a Hail (www.hail.is) native Table or MatrixTable. - Written with version 0.2.130-bea04d9c79b5 - Created at 2024/07/24 14:11:11 \ No newline at end of file + Written with version 0.2.133-4c60fddb171a + Created at 2024/11/02 15:10:48 \ No newline at end of file diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc index 7b3d99c48..21abd8af5 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/.part-0.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 index 2493dddf9..6e7d4be57 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 differ diff --git 
a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/.index.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/.index.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/.metadata.json.gz.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.metadata.json.gz.crc rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/.metadata.json.gz.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/index b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/index similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/index rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/index diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/metadata.json.gz similarity index 
100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/metadata.json.gz rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-3c042736-0e6c-4911-9b80-b9356af9df25.idx/metadata.json.gz diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz index 95672cd45..a43d1f48c 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc index a927fa9da..9a55fd307 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/.metadata.json.gz.crc differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz index 213bdb7aa..dc89f5aa6 100644 Binary files a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz and b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz differ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-3c042736-0e6c-4911-9b80-b9356af9df25.crc similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc rename to 
v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-3c042736-0e6c-4911-9b80-b9356af9df25.crc diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-3c042736-0e6c-4911-9b80-b9356af9df25 similarity index 100% rename from v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 rename to v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-3c042736-0e6c-4911-9b80-b9356af9df25