From f7992ebdc0563fb8667cb3513d2f9865b891fc6d Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 24 Jul 2024 11:03:45 -0400 Subject: [PATCH 1/9] Move vep files (#844) --- v03_pipeline/bin/vep-110-GRCh38.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/v03_pipeline/bin/vep-110-GRCh38.sh b/v03_pipeline/bin/vep-110-GRCh38.sh index cd649b2ad..dead36366 100644 --- a/v03_pipeline/bin/vep-110-GRCh38.sh +++ b/v03_pipeline/bin/vep-110-GRCh38.sh @@ -12,7 +12,7 @@ export PROJECT="$(gcloud config get-value project)" export VEP_CONFIG_PATH="$(/usr/share/google/get_metadata_value attributes/VEP_CONFIG_PATH)" export VEP_REPLICATE="$(/usr/share/google/get_metadata_value attributes/VEP_REPLICATE)" export ASSEMBLY=GRCh38 -export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:110 +export VEP_DOCKER_IMAGE=gcr.io/seqr-project/vep-docker-image:GRCh38 mkdir -p /vep_data @@ -36,26 +36,26 @@ sleep 60 sudo service docker restart # Copied from the repo at v03_pipeline/var/vep_config -gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH +gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/vep-${ASSEMBLY}.json $VEP_CONFIG_PATH # Copied from the UTRAnnotator repo (https://github.com/ImperialCardioGenetics/UTRannotator/tree/master) -gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/110/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ & +gcloud storage cp --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/uORF_5UTR_${ASSEMBLY}_PUBLIC.txt /vep_data/ & # Raw data files copied from the bucket (https://console.cloud.google.com/storage/browser/dm_alphamissense;tab=objects?prefix=&forceOnObjectsSortingFiltering=false) # tabix -s 1 -b 2 -e 2 -f -S 1 AlphaMissense_hg38.tsv.gz -gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/110/AlphaMissense_hg38.tsv.*' /vep_data/ & +gcloud storage cp --billing-project $PROJECT 'gs://seqr-reference-data/vep/GRCh38/AlphaMissense_hg38.tsv.*' /vep_data/ & gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep_data/loftee-beta/${ASSEMBLY}.tar | tar -xf - -C /vep_data/ & # Copied from ftp://ftp.ensembl.org/pub/release-110/variation/indexed_vep_cache/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz -gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/110/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ & +gcloud storage cat --billing-project $PROJECT gs://seqr-reference-data/vep/GRCh38/homo_sapiens_vep_110_${ASSEMBLY}.tar.gz | tar -xzf - -C /vep_data/ & # Generated with: # curl -O ftp://ftp.ensembl.org/pub/release-110/fasta/homo_sapiens/dna/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz > Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz # gzip -d Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz # bgzip Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa # samtools faidx Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.gz -gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/110/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ & +gcloud storage cp --billing-project $PROJECT "gs://seqr-reference-data/vep/GRCh38/Homo_sapiens.${ASSEMBLY}.dna.primary_assembly.fa.*" /vep_data/ & docker pull ${VEP_DOCKER_IMAGE} & wait From 63191c37035deef8d3a3e59a33924c9e4b51578e Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 24 Jul 2024 15:29:28 -0400 Subject: [PATCH 2/9] Add mito local constraint (#845) * Add 
mito local constraint * Fix tests * lint --- .../lib/model/reference_dataset_collection.py | 1 + v03_pipeline/lib/reference_data/config.py | 13 ++++++++++++ v03_pipeline/lib/reference_data/mito.py | 16 +++++++++++++++ ...ble_with_updated_reference_dataset_test.py | 19 ++++++++++++++++++ ...annotations_table_with_new_samples_test.py | 8 ++++++++ .../test_combined_mito_1.ht/.README.txt.crc | Bin 12 -> 12 bytes .../.metadata.json.gz.crc | Bin 16 -> 16 bytes .../test_combined_mito_1.ht/README.txt | 2 +- .../globals/.metadata.json.gz.crc | Bin 12 -> 12 bytes .../globals/metadata.json.gz | Bin 448 -> 474 bytes .../globals/parts/.part-0.crc | Bin 16 -> 16 bytes .../globals/parts/part-0 | Bin 670 -> 712 bytes .../.index.crc | Bin 12 -> 0 bytes .../.index.crc | Bin 0 -> 12 bytes .../.metadata.json.gz.crc | Bin .../index | Bin 130 -> 130 bytes .../metadata.json.gz | Bin .../test_combined_mito_1.ht/metadata.json.gz | Bin 564 -> 586 bytes .../rows/.metadata.json.gz.crc | Bin 16 -> 16 bytes .../rows/metadata.json.gz | Bin 855 -> 877 bytes ...0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc | Bin 12 -> 0 bytes ...0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc | Bin 0 -> 12 bytes ...art-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 | Bin 224 -> 0 bytes ...art-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 | Bin 0 -> 222 bytes 24 files changed, 58 insertions(+), 1 deletion(-) create mode 100644 v03_pipeline/lib/reference_data/mito.py delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx/.index.crc rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/{part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx => part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx}/.metadata.json.gz.crc (100%) rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/{part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx => part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx}/index (61%) rename v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/{part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx => part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.idx}/metadata.json.gz (100%) delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.crc create mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/.part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90.crc delete mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 create mode 100644 v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 diff --git a/v03_pipeline/lib/model/reference_dataset_collection.py b/v03_pipeline/lib/model/reference_dataset_collection.py index 7f87ff154..f784b105d 100644 --- a/v03_pipeline/lib/model/reference_dataset_collection.py +++ b/v03_pipeline/lib/model/reference_dataset_collection.py @@ -45,6 +45,7 @@ def datasets(self, dataset_type: DatasetType) -> list[str]: 'hmtvar', 'mitomap', 'mitimpact', + 'local_constraint_mito', ], (ReferenceDatasetCollection.HGMD, DatasetType.SNV_INDEL): ['hgmd'], (ReferenceDatasetCollection.INTERVAL, DatasetType.SNV_INDEL): [ diff --git a/v03_pipeline/lib/reference_data/config.py b/v03_pipeline/lib/reference_data/config.py index f38d2773f..714ee9a57 100644 --- 
a/v03_pipeline/lib/reference_data/config.py +++ b/v03_pipeline/lib/reference_data/config.py @@ -16,6 +16,9 @@ parsed_clnsig, ) from v03_pipeline.lib.reference_data.hgmd import download_and_import_hgmd_vcf +from v03_pipeline.lib.reference_data.mito import ( + download_and_import_local_constraint_tsv, +) def import_locus_intervals( @@ -533,4 +536,14 @@ def custom_mpc_select(ht): 'custom_import': import_locus_intervals, }, }, + 'local_constraint_mito': { + '38': { + 'version': '2024-07-24', + # Originally sourced from https://www.biorxiv.org/content/10.1101/2022.12.16.520778v2.supplementary-material + # Supplementary Table 7. + 'source_path': 'gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', + 'custom_import': download_and_import_local_constraint_tsv, + 'select': {'score': 'MLC_score'}, + }, + }, } diff --git a/v03_pipeline/lib/reference_data/mito.py b/v03_pipeline/lib/reference_data/mito.py new file mode 100644 index 000000000..7df647324 --- /dev/null +++ b/v03_pipeline/lib/reference_data/mito.py @@ -0,0 +1,16 @@ +import hail as hl + +from v03_pipeline.lib.model.definitions import ReferenceGenome + + +def download_and_import_local_constraint_tsv( + url: str, + reference_genome: ReferenceGenome, +) -> hl.Table: + ht = hl.import_table(url, types={'Position': hl.tint32, 'MLC_score': hl.tfloat32}) + ht = ht.select( + locus=hl.locus('chrM', ht.Position, reference_genome.value), + alleles=[ht.Reference, ht.Alternate], + MLC_score=ht.MLC_score, + ) + return ht.key_by('locus', 'alleles') diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index fe68eebe3..33ab87ed0 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -628,6 +628,21 @@ ), }, }, + 'local_constraint_mito': { + '38': { + **CONFIG['local_constraint_mito']['38'], + 'custom_import': lambda *_: hl.Table.parallelize( + [], + hl.tstruct( + locus=hl.tlocus('GRCh38'), + alleles=hl.tarray(hl.tstr), + MLC_score=hl.tfloat32, + ), + key=['locus', 'alleles'], + globals=hl.Struct(), + ), + }, + }, } @@ -960,6 +975,7 @@ def test_update_vat_with_updated_rdc_mito_38( clinvar_mito='ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz', dbnsfp_mito='gs://seqr-reference-data/GRCh38/dbNSFP/v4.2/dbNSFP4.2a_variant.with_new_scores.ht', high_constraint_region_mito='gs://seqr-reference-data/GRCh38/mitochondrial/Helix high constraint intervals Feb-15-2022.tsv', + local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', ), versions=hl.Struct( gnomad_mito='v3.1', @@ -970,6 +986,7 @@ def test_update_vat_with_updated_rdc_mito_38( clinvar_mito='2023-07-22', dbnsfp_mito='4.2', high_constraint_region_mito='Feb-15-2022', + local_constraint_mito='2024-07-24', ), enums=hl.Struct( gnomad_mito=hl.Struct(), @@ -985,6 +1002,7 @@ def test_update_vat_with_updated_rdc_mito_38( MutationTaster_pred=['D', 'A', 'N', 'P'], ), high_constraint_region_mito=hl.Struct(), + local_constraint_mito=hl.Struct(), sorted_transcript_consequences=hl.Struct( biotype=BIOTYPES, consequence_term=TRANSCRIPT_CONSEQUENCE_TERMS, @@ -1041,6 +1059,7 @@ def test_update_vat_with_updated_rdc_mito_38( mitomap=None, mitimpact=hl.Struct(score=0.5199999809265137), 
high_constraint_region_mito=True, + local_constraint_mito=hl.Struct(score=0.5), ), ], ) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index d22dae749..3d23a1278 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -842,6 +842,7 @@ def test_mito_update_vat( hmtvar='gs://seqr-reference-data/GRCh38/mitochondrial/HmtVar/HmtVar%20Jan.%2010%202022.ht', mitomap='gs://seqr-reference-data/GRCh38/mitochondrial/MITOMAP/mitomap-confirmed-mutations-2022-02-04.ht', mitimpact='gs://seqr-reference-data/GRCh38/mitochondrial/MitImpact/MitImpact_db_3.0.7.ht', + local_constraint_mito='gs://seqr-reference-data/GRCh38/mitochondrial/local_constraint.tsv', ), versions=hl.Struct( high_constraint_region_mito='Feb-15-2022', @@ -852,9 +853,11 @@ def test_mito_update_vat( hmtvar='Jan. 10 2022', mitomap='Feb. 04 2022', mitimpact='3.0.7', + local_constraint_mito='2024-07-24', ), enums=hl.Struct( high_constraint_region_mito=hl.Struct(), + local_constraint_mito=hl.Struct(), clinvar_mito=hl.Struct( assertion=CLINVAR_ASSERTIONS, pathogenicity=CLINVAR_PATHOGENICITIES, @@ -920,6 +923,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -955,6 +959,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -990,6 +995,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -1025,6 +1031,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), hl.Struct( locus=hl.Locus( @@ -1060,6 +1067,7 @@ def test_mito_update_vat( AF_hom=0.0, AN=4, ), + local_constraint_mito=None, ), ], ) diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/.README.txt.crc index 436531ab22aa29ccf30ce13b907c450e9fba8bf5..e08d4d12b482e6c65d7edabd267cd91cdc06caf5 100644 GIT binary patch literal 12 TcmYc;N@ieSU}88p-EuDg6LvA3d5uO7# diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/metadata.json.gz index 9e842012f5afdaa83c672ec2a46246aaf1a81eb7..d7c36221c8829177d8514b09785ffc87237cd55f 100644 GIT binary patch literal 474 zcmV<00VVz)iwFP!000000M%8&j?*v@{TE*e5-i#k%8fudfsnQ<4y;zy)Sk9x6DP7K z6`{(%Gv1_)?POQt206v=&6~_Ty(A}LphA(0Z&sm2b^HC(xBYPUQCwd?#y# z(&HjOodtJYgX-gj;41VGGp|+!I9!u$C7l|egUcgmZ|VxvrqWbR}Dy-AX#d|qNJ1B8n;0W!e;SA=h57t6=Kd$o7&uod~RqxSgsi{ka z890a5cb~U>*1llszW1>{j#Y?a(Hl*)OZ& z?bZ8YqB0>5x>P|)!Ob7}F;UUhgKPbp3vRBj-`&h*KOCti>0j!2XoM8{vN;KX1Y8v8Y?4JarO|nB=xBLZgNoZDiRhe*c~*HU}@*%GY?L_uYWV QI_IrrS0dw13Yku(7kk~hUc0iV1-O*3Ev5|HH%3KGs2_{LHW%;F?G zoCLEX1O9j>unH`M%+)xe8m30&%p5XmwmNj6jZ72Dmx(Z`C4{7^2(}Q5FCRD2)Y$ diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/globals/parts/part-0 index f4252dc82dc9242247d62d3e5c3c47d95270ee03..2493dddf92151c3aa051dd855305b602a99d846b 100644 GIT binary patch literal 712 zcmV;(0yq7$0ssJk1pojjwJ-f(fdnlT0EXy$C7|xO0b~u66xrla%rYykH`!`0yj>oX z-H_OV7|f?~Y)UKbAQE!`Z2)%wpOlu8PkilBRK?VpsEX&(WTGmpeECh~wUW^QIG8Ar5vmds_S3sK2(RKkhLIH@L?* 
z-t;`ty5djCZ6)8I*IqSA)FesQYrp)^CSAPQ#=K66lN+cbv7$gcK%VYdUkq>gmZ!To z(0pSx&c^y)NoUNwe>TOueqby>Knev3$doCS5$3jy7i+hvCI5!NiiwM$2%y4`R^h&Z+OZ&bW`%Gm6lMd2m+VuksXDp94+f}O`k+kI*)-jQ?fIs4liXma*r*PRuv5Z00000001bpFa00@0RR9&Yg9r2 literal 670 zcmV;P0%84$0ssIr1pojjwJ-f(GX&ie02U{BB%t2t0dy95)g-5r{!MqT7G&Z$sK@9- z03Y3x$6!3N9$-5WZEgT=0CE8EF-K7rQ)i+qoX3-ivam7cGnU7k2IZM1YNB9ey?|<( za>=8yt>qPy^*ehT9t@;~2D$F^~Uk#Zl( zeYUqU;*wHXHRA`Kpliwt6|N-DT8WSJIvK^F0pjXNQ}d2UZq*8MVyh z8}yj_OcFIo()HROKa@!pXSSzTDQPiV@c^m3n1BMc-}1c?)|3c{$I|`ko8cYbvUHdS zo9|hn?AiB5D&gh*vgz~+lED!1Di$UlQ>HXRncEgxjNPVH{44T48nmG-(1P7lGX z+6gZP)64rp#va-_Vf71JU&eXBTfgM5Rx_}SeHF0S>aAK0!9F`{kfu@vzFxJ<7G5E1 zCD?p#{e7J;yp|g*57i03I(%qZ?~ModCC}+W5JdbbrI=j@TM{Uf zd8x}=mue~bjr}VC)h$-F<`)Izh=3v3fI11t%WKcxEn}_@+5~OMkVXmkvV(t#HUpZ5 z9>_ZoDr<=bmsPW1I4?*X_?t9wA_HOX0m?+7oS9pWXaXc0p+RO1x2GlV&OA|vnH=C( zy$p248jg%dr?ZV3+I_VZm)93vou+Y}&Di0T42|XPTEGMp4FCWD00000D77#BAOHaX E0CY1nwEzGB diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/index/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2.idx/.index.crc deleted file mode 100644 index 12ec58de2343ec7796852d9dcc1c8da821fc24d9..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12 TcmYc;N@ieSU}E?wB+H7{chdozeM|`(`F)_tz{y0cS?Xxy^X488Vq?oc63?^68tLxe1 zQw(@=lRS{Shxl`V0ttJNzzL{G0*i92Q$uc1L(w=k?e#GSliAqdbYLRJEL>|@5Ph;m z$_EE7GMq#i)xtuQA5aR6zV(qzzhLPjo5^tA32ByNqCG$u<%Zttki=Z=^B%+m!7L(z zD=jhQx|=LfJ&FHbC-5S>B+^3VKxU0KLaVqVwE%vd4jWw@gHun(bPz0A zYb;fry-?Il#z1}fN*das6%>9Ef&u0u z1{t&%am#wO+(bPZ8Zl6_;W6jV7f}lJ8dSXg zeVrqlq8uBOIh#3`b{jnU)WT(k%{+g5J=xhtQsmaK&}-3vby1&{P-V^((d_sgj*tYQ Y84}pT?d#qD#FdtR0EcpXCddW=0HNIwsQ>@~ literal 564 zcmV-40?Yj$iwFP!000000L@g(Zrd;r{1=^CfDIfw%|kcWg$tt(1hLVJVF(npwun-s zKvGErL;qfq67{lDw6|Ua?dqB`UY9wT%+3bq6B99J;a1Cn=(8Jib%ZecFYxG7E0-CI zdH()-cCbQH>|3uJd^SC5Z)n diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/metadata.json.gz index 29e15e9d80d58b8e943112439dbdfeaf3e22c342..213bdb7aaaaac35564612c520537f123f0367231 100644 GIT binary patch literal 877 zcmV-z1Csn7iwFP!000000NqzxZ`v>v{x5mjw6K+1NAm^@CZx8S3fjXe6q(o|UX2sk zrh=jT_Z=sY#EH9>YJF)RP<`&-@ptU_;Bd4dj_1rcCOd-fetw%LBK;>jlc5l3e}6Pwx&fp75ea`oRqPpAa*rAI@(?cM2pJ6& z~1@Mh#${-fRn@m}ZvexO=Y*6Ca(dnkeF(Zg)mUNS#K>afw zUU3;FmX6P7(}izZyEj{fbnOcy0iB-+f^|=deT@jCJ4>wL*pEojXLmR~E>CfXw}ozl zc7DXll-f1k7_Ql}0I=rJ5BwjKUiO|(3ZzF2riM!;5ar5}K5t#=lj;e2JWh}?JJlTn|EXGfxlWu1lUC^QG*i8-*1L7t{L1T`q-1Q?2f{zd~E z#BXxm!gza#D%YDVe}?2uikmz$SXSYf!LS;u4SHD_{n{KQiS0S;1h%#H8fT8r&2F+Vm%J_G@*IM|VS?=yok_w1zCpYu>^kT=ofTWCWisSoxQ?bS+6W0;H7CNz(K$E#LQayA1#U D4gR4P literal 855 zcmV-d1E~BTiwFP!000000Ns~SZ`v>r$G=OSHZ5$00%Cas1`|?SO$F^?6^e}Q5U+_H z*`|V3`R+R>ki<#4mTG-zA5e35zPtbB7dv@$IEty`ISWqco|4<2-Evk6jkspSsq$1I&!wP3+D z+jSlXM7r9_9jW8m7w5#tX?`Z3eOv#R7P7R1X_xKJaA z7}kt$e;^%-E#(YDl`^TdJgLMSd{#-brF)5!F`LJHLRiKN8Lq`0pkjo7{=*LdoOw3xFyo*yqydB-<} zFGhPmjLMQ`sfewm*<~SM%V8emKNcPKpDmi2r{UrFG+58UmHGYf{dO8&jy@Kg=QkPx z0oNMkb-tg)yU5kbbrtI-)K4(ohIC%033N?LNweiBP`0RRK-}cFh2HiQRc@QaJ_BqM zSDP7&mo93JBo$n=oFU-x-sZqu&b_&z|xjR+SRimd`H}8Z#*djQJZUCCs(UjGbl(+Mf;pzu0qJHgWHz~|5 zZ%bTWLsHl^yPAJaA|PY8LBYP+mhjCxRWXo0cCbPW#J literal 0 HcmV?d00001 diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 
b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-2e25c2cc-84c0-4816-b8dc-5e8c19c4f1d2 deleted file mode 100644 index f78553dbff70cac10c26a3a847bcbbc696081582..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 224 zcmX@jz`&r(#K54jRsU~-F5_EPh9w+~94yHhMZSzo42+J9&h-rPt=jesJLFjH8JQRv z`hIgWxYsin8ynj*yb0W47m~E#xF#bv0~1hn2%}>?LrlMuJwyK?Q+wy6{RbJD_!t{JmO*44JJ7`bLtkwf&Ys~3Nm?{(rlcLi z^~DAZA`BNS9!j|#*bx@NaelL!NQ{nFUYNJQx+?Z#yexYdm=6f=a28=y%RHKUX4?S| Tu>vMupgkY}@{R%nBLf2fOqV>) diff --git a/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 b/v03_pipeline/var/test/reference_data/test_combined_mito_1.ht/rows/parts/part-0-4fe48beb-19ef-445d-82f1-325a3c7c0b90 new file mode 100644 index 0000000000000000000000000000000000000000..bf1bf60c2cbd563437b266452542d45310317f67 GIT binary patch literal 222 zcmV<403rX$000100ssIgwJ-f(RRP@v0OH6YDsXK&d=c=~EqAnz3PC(Rplw{wVf1@T zSpKZhiOE#oRQyfkKQe~W;!Y`Ma7fk=n9OE&1JTHtDXgYa;4wI>-o@RKs;C6qgdYS= zd2ra?jMA|S@!)52+X$=oQ9aY`-5sN?Um?xxv@cTfVi;yB*S4+G#gE@W;`1&n3RCXe z9S^BYBvkbj0Dy4Ng^fUTuZ9qSr;RCS*{Y Date: Thu, 25 Jul 2024 16:21:31 -0400 Subject: [PATCH 3/9] Benb/migration task (#834) * split import and validate * lint and share function * ruff * change dep * tweak update * lint * wrong method * correct method * mocks * change sample type annotation on test * hack on migration * sort return list * move the migration * still hacking * better! * getting there * Cleaner * ruff * Finish it off * migration * rename var * add migrations to annotations table * fix test import * actually fix the test * add migrations * not used here * use globals * missed one * a hilarious typo * Update migrate_variant_annotations_table.py * correct sign * add lookup migration * Add lookup table migration * adjust migration * ruff * Add to tasks * ensure a migration cannot run before a previous migration! 
* ruff * fix bug * lint * add referencegenomedatasetype * Annoying but fixed --- pyproject.toml | 3 + v03_pipeline/lib/migration/__init__.py | 0 v03_pipeline/lib/migration/base_migration.py | 16 ++ v03_pipeline/lib/migration/misc.py | 33 ++++ v03_pipeline/lib/migration/misc_test.py | 47 ++++++ v03_pipeline/lib/tasks/__init__.py | 6 + v03_pipeline/lib/tasks/base/base_migrate.py | 49 ++++++ .../base_update_variant_annotations_table.py | 2 + .../lib/tasks/migrate_all_lookup_tables.py | 37 +++++ .../migrate_all_variant_annotations_tables.py | 36 ++++ .../lib/tasks/migrate_lookup_table.py | 48 ++++++ .../migrate_variant_annotations_table.py | 38 +++++ .../migrate_variant_annotations_table_test.py | 154 ++++++++++++++++++ ...ble_with_updated_reference_dataset_test.py | 6 + ...annotations_table_with_new_samples_test.py | 4 + v03_pipeline/migrations/__init__.py | 0 .../annotations/0001_add_migrations_global.py | 24 +++ .../migrations/annotations/__init__.py | 0 .../lookup/0001_remove_null_families.py | 27 +++ v03_pipeline/migrations/lookup/__init__.py | 0 20 files changed, 530 insertions(+) create mode 100644 v03_pipeline/lib/migration/__init__.py create mode 100644 v03_pipeline/lib/migration/base_migration.py create mode 100644 v03_pipeline/lib/migration/misc.py create mode 100644 v03_pipeline/lib/migration/misc_test.py create mode 100644 v03_pipeline/lib/tasks/base/base_migrate.py create mode 100644 v03_pipeline/lib/tasks/migrate_all_lookup_tables.py create mode 100644 v03_pipeline/lib/tasks/migrate_all_variant_annotations_tables.py create mode 100644 v03_pipeline/lib/tasks/migrate_lookup_table.py create mode 100644 v03_pipeline/lib/tasks/migrate_variant_annotations_table.py create mode 100644 v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py create mode 100644 v03_pipeline/migrations/__init__.py create mode 100644 v03_pipeline/migrations/annotations/0001_add_migrations_global.py create mode 100644 v03_pipeline/migrations/annotations/__init__.py create mode 100644 v03_pipeline/migrations/lookup/0001_remove_null_families.py create mode 100644 v03_pipeline/migrations/lookup/__init__.py diff --git a/pyproject.toml b/pyproject.toml index adc5d947a..73826b005 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -64,6 +64,9 @@ inline-quotes = "single" 'SLF001', # allow private access 'PLR0913', # allow high arity functions ] +'*migration*' = [ + 'N999', # allow invalid module names +] [tool.ruff.pylint] max-args = 6 diff --git a/v03_pipeline/lib/migration/__init__.py b/v03_pipeline/lib/migration/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/v03_pipeline/lib/migration/base_migration.py b/v03_pipeline/lib/migration/base_migration.py new file mode 100644 index 000000000..0259a50f4 --- /dev/null +++ b/v03_pipeline/lib/migration/base_migration.py @@ -0,0 +1,16 @@ +from abc import ABC, abstractmethod + +import hail as hl + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome + + +class BaseMigration(ABC): + reference_genome_dataset_types: frozenset[ + tuple[ReferenceGenome, DatasetType] + ] = None + + @staticmethod + @abstractmethod + def migrate(ht: hl.Table) -> hl.Table: + pass diff --git a/v03_pipeline/lib/migration/misc.py b/v03_pipeline/lib/migration/misc.py new file mode 100644 index 000000000..5fe8ea911 --- /dev/null +++ b/v03_pipeline/lib/migration/misc.py @@ -0,0 +1,33 @@ +import inspect +import pkgutil +import re + +from v03_pipeline.lib.migration.base_migration import BaseMigration + +MIGRATION_NAME_PATTERN = r'(\d\d\d\d_.*)' + + +def 
list_migrations( + path: str, +) -> list[tuple[str, BaseMigration]]: + migrations = [] + for loader, name, _ in pkgutil.iter_modules([path]): + match = re.search(MIGRATION_NAME_PATTERN, name) + if match: + module = loader.find_module(name).load_module(name) + implemented_migration = next( + ( + m + for m in module.__dict__.values() + # Return objects that are + # classes, subclasses of the BaseMigration + # and also NOT the BaseMigration class. + if inspect.isclass(m) + and issubclass(m, BaseMigration) + and m != BaseMigration + ), + None, + ) + if implemented_migration: + migrations.append((match.group(1), implemented_migration)) + return sorted(migrations, key=lambda x: x[0]) diff --git a/v03_pipeline/lib/migration/misc_test.py b/v03_pipeline/lib/migration/misc_test.py new file mode 100644 index 000000000..8f8f16fdd --- /dev/null +++ b/v03_pipeline/lib/migration/misc_test.py @@ -0,0 +1,47 @@ +import os +import shutil +import tempfile +import unittest + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.migration.misc import list_migrations + + +class TestListMigrations(unittest.TestCase): + def setUp(self): + self.tmpdir = tempfile.TemporaryDirectory() + for migration in [ + '__init__.py', + '1111_a_migration.py', + '0000_migration.py', + '001_test.py', + 'abcd_test.py', + '0000_migration.txt', + ]: + with open(os.path.join(self.tmpdir.name, migration), 'w') as f: + f.write( + """ +from v03_pipeline.lib.migration.base_migration import BaseMigration +class ImplementedMigration(BaseMigration): + pass + """, + ) + + def tearDown(self): + if os.path.isdir(self.tmpdir.name): + shutil.rmtree(self.tmpdir.name) + + def test_list_migrations(self): + self.assertEqual( + [ + (x, issubclass(y, BaseMigration)) + for (x, y) in list_migrations(self.tmpdir.name) + ], + [ + ('0000_migration', True), + ('1111_a_migration', True), + ], + ) + self.assertTrue( + all(hasattr(x[1], 'migrate') for x in list_migrations(self.tmpdir.name)), + ) diff --git a/v03_pipeline/lib/tasks/__init__.py b/v03_pipeline/lib/tasks/__init__.py index 4223135cc..624173f72 100644 --- a/v03_pipeline/lib/tasks/__init__.py +++ b/v03_pipeline/lib/tasks/__init__.py @@ -4,6 +4,10 @@ DeleteProjectFamilyTablesTask, ) from v03_pipeline.lib.tasks.delete_project_table import DeleteProjectTableTask +from v03_pipeline.lib.tasks.migrate_all_lookup_tables import MigrateAllLookupTablesTask +from v03_pipeline.lib.tasks.migrate_all_variant_annotations_tables import ( + MigrateAllVariantAnnotationsTablesTask, +) from v03_pipeline.lib.tasks.reference_data.update_cached_reference_dataset_queries import ( UpdateCachedReferenceDatasetQueries, ) @@ -39,6 +43,8 @@ 'DeleteFamilyTablesTask', 'DeleteProjectFamilyTablesTask', 'DeleteProjectTableTask', + 'MigrateAllLookupTablesTask', + 'MigrateAllVariantAnnotationsTablesTask', 'UpdateProjectTableTask', 'UpdateProjectTableWithDeletedFamiliesTask', 'UpdateLookupTableTask', diff --git a/v03_pipeline/lib/tasks/base/base_migrate.py b/v03_pipeline/lib/tasks/base/base_migrate.py new file mode 100644 index 000000000..4611d97c9 --- /dev/null +++ b/v03_pipeline/lib/tasks/base/base_migrate.py @@ -0,0 +1,49 @@ +import hail as hl +import luigi + +from v03_pipeline.lib.migration.misc import list_migrations +from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask + + +class BaseMigrateTask(BaseUpdateTask): + migration_name = luigi.Parameter() + + @property + def migrations_path(self): + raise NotImplementedError + + def requires(self) -> luigi.Task | None: + # Require the 
previous migration
+        defined_migrations = [x[0] for x in list_migrations(self.migrations_path)]
+        for i, migration in enumerate(defined_migrations):
+            if i > 0 and migration == self.migration_name:
+                return self.clone(
+                    self.__class__,
+                    migration_name=defined_migrations[i - 1],
+                )
+        return None
+
+    def complete(self) -> luigi.Target:
+        if super().complete():
+            migration = dict(
+                list_migrations(self.migrations_path),
+            )[self.migration_name]
+            if (
+                self.reference_genome,
+                self.dataset_type,
+            ) not in migration.reference_genome_dataset_types:
+                return True
+            mt = hl.read_table(self.output().path)
+            return hl.eval(mt.globals.migrations.index(self.migration_name) >= 0)
+        return False
+
+    def update_table(self, ht: hl.Table) -> hl.Table:
+        migration = dict(list_migrations(self.migrations_path))[self.migration_name]
+        if (
+            (self.reference_genome, self.dataset_type)
+        ) in migration.reference_genome_dataset_types:
+            ht = migration.migrate(ht)
+            return ht.annotate_globals(
+                migrations=ht.globals.migrations.append(self.migration_name),
+            )
+        return ht
diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py
index 56dc00ea8..2fcce0f45 100644
--- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py
+++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py
@@ -67,6 +67,7 @@ def initialize_table(self) -> hl.Table:
                 versions=hl.Struct(),
                 enums=hl.Struct(),
                 updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)),
+                migrations=hl.empty_array(hl.tstr),
             ),
         )
@@ -102,5 +103,6 @@ def annotate_globals(
                 **rdc_globals.enums,
             ),
             updates=ht.globals.updates,
+            migrations=ht.globals.migrations,
         )
         return annotate_enums(ht, self.reference_genome, self.dataset_type)
diff --git a/v03_pipeline/lib/tasks/migrate_all_lookup_tables.py b/v03_pipeline/lib/tasks/migrate_all_lookup_tables.py
new file mode 100644
index 000000000..0f8978234
--- /dev/null
+++ b/v03_pipeline/lib/tasks/migrate_all_lookup_tables.py
@@ -0,0 +1,37 @@
+import luigi
+
+import v03_pipeline.migrations.lookup
+from v03_pipeline.lib.migration.misc import list_migrations
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome
+from v03_pipeline.lib.tasks.migrate_lookup_table import (
+    MigrateLookupTableTask,
+)
+
+
+class MigrateAllLookupTablesTask(luigi.Task):
+    reference_genome = luigi.EnumParameter(enum=ReferenceGenome)
+    dataset_type = luigi.EnumParameter(enum=DatasetType)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.dynamic_migration_tasks = []
+
+    def complete(self) -> bool:
+        return len(self.dynamic_migration_tasks) >= 1 and all(
+            migration_task.complete() for migration_task in self.dynamic_migration_tasks
+        )
+
+    def run(self):
+        for migration in list_migrations(v03_pipeline.migrations.lookup.__path__):
+            if (
+                (self.reference_genome, self.dataset_type)
+                in migration.reference_genome_dataset_types
+                and self.dataset_type.has_lookup_table
+            ):
+                self.dynamic_migration_tasks.append(
+                    MigrateLookupTableTask(
+                        self.reference_genome,
+                        self.dataset_type,
+                    ),
+                )
+        yield self.dynamic_migration_tasks
diff --git a/v03_pipeline/lib/tasks/migrate_all_variant_annotations_tables.py b/v03_pipeline/lib/tasks/migrate_all_variant_annotations_tables.py
new file mode 100644
index 000000000..dff1a251c
--- /dev/null
+++ b/v03_pipeline/lib/tasks/migrate_all_variant_annotations_tables.py
@@ -0,0 +1,36 @@
+import luigi
+
+import v03_pipeline.migrations.annotations
+from v03_pipeline.lib.migration.misc import list_migrations
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome
+from v03_pipeline.lib.tasks.migrate_variant_annotations_table import (
+    MigrateVariantAnnotationsTableTask,
+)
+
+
+class MigrateAllVariantAnnotationsTablesTask(luigi.Task):
+    reference_genome = luigi.EnumParameter(enum=ReferenceGenome)
+    dataset_type = luigi.EnumParameter(enum=DatasetType)
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.dynamic_migration_tasks = []
+
+    def complete(self) -> bool:
+        return len(self.dynamic_migration_tasks) >= 1 and all(
+            migration_task.complete() for migration_task in self.dynamic_migration_tasks
+        )
+
+    def run(self):
+        for migration in list_migrations(v03_pipeline.migrations.annotations.__path__):
+            if (
+                self.reference_genome,
+                self.dataset_type,
+            ) in migration.reference_genome_dataset_types:
+                self.dynamic_migration_tasks.append(
+                    MigrateVariantAnnotationsTableTask(
+                        self.reference_genome,
+                        self.dataset_type,
+                    ),
+                )
+        yield self.dynamic_migration_tasks
diff --git a/v03_pipeline/lib/tasks/migrate_lookup_table.py b/v03_pipeline/lib/tasks/migrate_lookup_table.py
new file mode 100644
index 000000000..bd7ce0d40
--- /dev/null
+++ b/v03_pipeline/lib/tasks/migrate_lookup_table.py
@@ -0,0 +1,48 @@
+import hail as hl
+import luigi
+
+import v03_pipeline.migrations.lookup
+from v03_pipeline.lib.paths import (
+    lookup_table_path,
+)
+from v03_pipeline.lib.tasks.base.base_migrate import BaseMigrateTask
+from v03_pipeline.lib.tasks.files import GCSorLocalTarget
+
+
+class MigrateLookupTableTask(BaseMigrateTask):
+    @property
+    def migrations_path(self):
+        return v03_pipeline.migrations.lookup.__path__[0]
+
+    def output(self) -> luigi.Target:
+        return GCSorLocalTarget(
+            lookup_table_path(
+                self.reference_genome,
+                self.dataset_type,
+            ),
+        )
+
+    def initialize_table(self) -> hl.Table:
+        key_type = self.dataset_type.table_key_type(self.reference_genome)
+        return hl.Table.parallelize(
+            [],
+            hl.tstruct(
+                **key_type,
+                project_stats=hl.tarray(
+                    hl.tarray(
+                        hl.tstruct(
+                            **{
+                                field: hl.tint32
+                                for field in self.dataset_type.lookup_table_fields_and_genotype_filter_fns
+                            },
+                        ),
+                    ),
+                ),
+            ),
+            key=key_type.fields,
+            globals=hl.Struct(
+                project_guids=hl.empty_array(hl.tstr),
+                project_families=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)),
+                updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)),
+            ),
+        )
diff --git a/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py b/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py
new file mode 100644
index 000000000..fd8bec321
--- /dev/null
+++ b/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py
@@ -0,0 +1,38 @@
+import hail as hl
+import luigi
+
+import v03_pipeline.migrations.annotations
+from v03_pipeline.lib.paths import (
+    variant_annotations_table_path,
+)
+from v03_pipeline.lib.tasks.base.base_migrate import BaseMigrateTask
+from v03_pipeline.lib.tasks.files import GCSorLocalTarget
+
+
+class MigrateVariantAnnotationsTableTask(BaseMigrateTask):
+    @property
+    def migrations_path(self):
+        return v03_pipeline.migrations.annotations.__path__[0]
+
+    def output(self) -> luigi.Target:
+        return GCSorLocalTarget(
+            variant_annotations_table_path(
+                self.reference_genome,
+                self.dataset_type,
+            ),
+        )
+
+    def initialize_table(self) -> hl.Table:
+        key_type = self.dataset_type.table_key_type(self.reference_genome)
+        return hl.Table.parallelize(
+            [],
+            key_type,
+            key=key_type.fields,
+            globals=hl.Struct(
+                paths=hl.Struct(),
+                
versions=hl.Struct(), + enums=hl.Struct(), + updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + migrations=hl.empty_array(hl.tstr), + ), + ) diff --git a/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py b/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py new file mode 100644 index 000000000..550a8a330 --- /dev/null +++ b/v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py @@ -0,0 +1,154 @@ +from unittest import mock + +import hail as hl +import luigi.worker + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.tasks.migrate_variant_annotations_table import ( + MigrateVariantAnnotationsTableTask, +) +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + + +class MockMigration(BaseMigration): + reference_genome_dataset_types: frozenset[ + tuple[ReferenceGenome, DatasetType] + ] = frozenset( + ((ReferenceGenome.GRCh38, DatasetType.GCNV),), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + ht = ht.annotate( + variant_id_id=hl.format('%s_id', ht.variant_id), + ) + return ht.annotate_globals(mock_migration='a mock migration') + + +class MockMigration2(BaseMigration): + reference_genome_dataset_types: frozenset[ + tuple[ReferenceGenome, DatasetType] + ] = frozenset( + ((ReferenceGenome.GRCh38, DatasetType.GCNV),), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + return ht.annotate_globals(mock_migration2='a second mock migration') + + +class MigrateVariantAnnotationsTableTaskTest(MockedDatarootTestCase): + @mock.patch( + 'v03_pipeline.lib.tasks.base.base_migrate.list_migrations', + ) + def test_mock_migration( + self, + mock_list_migrations: mock.Mock, + ) -> None: + mock_list_migrations.return_value = [ + ('0012_mock_migration', MockMigration), + ] + worker = luigi.worker.Worker() + task = MigrateVariantAnnotationsTableTask( + dataset_type=DatasetType.GCNV, + reference_genome=ReferenceGenome.GRCh38, + migration_name='0012_mock_migration', + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) + ht = hl.read_table(task.output().path) + self.assertEqual( + ht.globals.collect(), + [ + hl.Struct( + paths=hl.Struct(), + versions=hl.Struct(), + enums=hl.Struct(), + updates=set(), + migrations=['0012_mock_migration'], + mock_migration='a mock migration', + ), + ], + ) + self.assertEqual( + ht.collect(), + [], + ) + + @mock.patch( + 'v03_pipeline.lib.tasks.base.base_migrate.list_migrations', + ) + def test_migration_is_noop_for_other_dataset_types( + self, + mock_list_migrations: mock.Mock, + ) -> None: + mock_list_migrations.return_value = [ + ('0012_mock_migration', MockMigration), + ] + worker = luigi.worker.Worker() + task = MigrateVariantAnnotationsTableTask( + dataset_type=DatasetType.SV, + reference_genome=ReferenceGenome.GRCh38, + migration_name='0012_mock_migration', + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) + ht = hl.read_table(task.output().path) + self.assertEqual( + ht.globals.collect(), + [ + hl.Struct( + paths=hl.Struct(), + versions=hl.Struct(), + enums=hl.Struct(), + updates=set(), + migrations=[], + ), + ], + ) + + @mock.patch( + 'v03_pipeline.lib.tasks.base.base_migrate.list_migrations', + ) + def test_migration_dependency( + self, + mock_list_migrations: mock.Mock, + ) -> None: + mock_list_migrations.return_value = [ + 
('0012_mock_migration', MockMigration), + ('0013_mock_migration2', MockMigration2), + ] + worker = luigi.worker.Worker() + task = MigrateVariantAnnotationsTableTask( + dataset_type=DatasetType.GCNV, + reference_genome=ReferenceGenome.GRCh38, + migration_name='0013_mock_migration2', + ) + worker.add(task) + worker.run() + self.assertTrue(task.output().exists()) + self.assertTrue(task.complete()) + ht = hl.read_table(task.output().path) + self.assertEqual( + ht.globals.collect(), + [ + hl.Struct( + paths=hl.Struct(), + versions=hl.Struct(), + enums=hl.Struct(), + updates=set(), + migrations=['0012_mock_migration', '0013_mock_migration2'], + mock_migration='a mock migration', + mock_migration2='a second mock migration', + ), + ], + ) + self.assertEqual( + ht.collect(), + [], + ) diff --git a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py index 33ab87ed0..147dfe9e0 100644 --- a/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py +++ b/v03_pipeline/lib/tasks/reference_data/update_variant_annotations_table_with_updated_reference_dataset_test.py @@ -743,6 +743,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38( versions=hl.Struct(), enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + migrations=hl.empty_array(hl.tstr), ), ) task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( @@ -913,6 +914,7 @@ def test_update_vat_with_updated_rdc_snv_indel_38( ), ), ), + migrations=[], updates=set(), ), ], @@ -949,6 +951,7 @@ def test_update_vat_with_updated_rdc_mito_38( versions=hl.Struct(), enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + migrations=hl.empty_array(hl.tstr), ), ) task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( @@ -1012,6 +1015,7 @@ def test_update_vat_with_updated_rdc_mito_38( trna_prediction=MITOTIP_PATHOGENICITIES, ), ), + migrations=[], updates=set(), ), ], @@ -1095,6 +1099,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37( versions=hl.Struct(), enums=hl.Struct(), updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + migrations=hl.empty_array(hl.tstr), ), ) task = UpdateVariantAnnotationsTableWithUpdatedReferenceDataset( @@ -1174,6 +1179,7 @@ def test_update_vat_with_updated_rdc_snv_indel_37( lof_filter=LOF_FILTERS, ), ), + migrations=[], updates=set(), ), ], diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 3d23a1278..7bc1e8c85 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -588,6 +588,7 @@ def test_multiple_update_vat( screen=None, hgmd='HGMD_Pro_2023', ), + migrations=[], enums=hl.Struct( cadd=hl.Struct(), clinvar=hl.Struct( @@ -877,6 +878,7 @@ def test_mito_update_vat( ), mitotip=hl.Struct(trna_prediction=MITOTIP_PATHOGENICITIES), ), + migrations=[], updates={ hl.Struct( callset='v03_pipeline/var/test/callsets/mito_1.mt', @@ -1119,6 +1121,7 @@ def test_sv_update_vat( major_consequence=SV_CONSEQUENCE_RANKS, ), ), + migrations=[], updates={ hl.Struct( callset=TEST_SV_VCF, @@ -1679,6 +1682,7 @@ def test_gcnv_update_vat( 
major_consequence=SV_CONSEQUENCE_RANKS,
                     ),
                 ),
+                migrations=[],
                 updates={
                     hl.Struct(
                         callset=TEST_GCNV_BED_FILE,
diff --git a/v03_pipeline/migrations/__init__.py b/v03_pipeline/migrations/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/v03_pipeline/migrations/annotations/0001_add_migrations_global.py b/v03_pipeline/migrations/annotations/0001_add_migrations_global.py
new file mode 100644
index 000000000..c255ff215
--- /dev/null
+++ b/v03_pipeline/migrations/annotations/0001_add_migrations_global.py
@@ -0,0 +1,24 @@
+import hail as hl
+
+from v03_pipeline.lib.migration.base_migration import BaseMigration
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome
+
+
+class AddMigrationsGlobals(BaseMigration):
+    @property
+    def reference_genome_dataset_types() -> (
+        frozenset[tuple[ReferenceGenome, DatasetType]]
+    ):
+        return frozenset(
+            (
+                (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL),
+                (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL),
+                (ReferenceGenome.GRCh38, DatasetType.MITO),
+                (ReferenceGenome.GRCh38, DatasetType.GCNV),
+                (ReferenceGenome.GRCh38, DatasetType.SV),
+            ),
+        )
+
+    @staticmethod
+    def migrate(ht: hl.Table) -> hl.Table:
+        return ht.annotate_globals(migrations=hl.empty_array(hl.tstr))
diff --git a/v03_pipeline/migrations/annotations/__init__.py b/v03_pipeline/migrations/annotations/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/v03_pipeline/migrations/lookup/0001_remove_null_families.py b/v03_pipeline/migrations/lookup/0001_remove_null_families.py
new file mode 100644
index 000000000..59123ea70
--- /dev/null
+++ b/v03_pipeline/migrations/lookup/0001_remove_null_families.py
@@ -0,0 +1,27 @@
+import hail as hl
+
+from v03_pipeline.lib.migration.base_migration import BaseMigration
+from v03_pipeline.lib.model import DatasetType, ReferenceGenome
+
+
+class RemoveNullFamilies(BaseMigration):
+    @property
+    def reference_genome_dataset_types() -> (
+        frozenset[tuple[ReferenceGenome, DatasetType]]
+    ):
+        return frozenset(
+            (
+                (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL),
+                (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL),
+                (ReferenceGenome.GRCh38, DatasetType.MITO),
+            ),
+        )
+
+    @staticmethod
+    def migrate(ht: hl.Table) -> hl.Table:
+        ht = ht.annotate(
+            project_stats=ht.project_stats.map(
+                lambda ps: hl.or_missing(hl.all(ps.map(hl.is_defined)), ps),
+            ),
+        )
+        return ht.annotate_globals(migrations=hl.empty_array(hl.tstr))
diff --git a/v03_pipeline/migrations/lookup/__init__.py b/v03_pipeline/migrations/lookup/__init__.py
new file mode 100644
index 000000000..e69de29bb

From d1519f09d0a69380243e0e7406054c22eda420ea Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister
Date: Wed, 31 Jul 2024 11:47:38 -0400
Subject: [PATCH 4/9] Add new SV annotations for VCF export.
(#857) * Add SV annotations * ruff * push * ruff * Update update_variant_annotations_table_with_new_samples_test.py --- v03_pipeline/lib/annotations/sv.py | 39 ++++ v03_pipeline/lib/annotations/sv_test.py | 171 ++++++++++++++++++ v03_pipeline/lib/model/dataset_type.py | 12 ++ ...annotations_table_with_new_samples_test.py | 11 ++ 4 files changed, 233 insertions(+) create mode 100644 v03_pipeline/lib/annotations/sv_test.py diff --git a/v03_pipeline/lib/annotations/sv.py b/v03_pipeline/lib/annotations/sv.py index 582474df4..5c75b0c94 100644 --- a/v03_pipeline/lib/annotations/sv.py +++ b/v03_pipeline/lib/annotations/sv.py @@ -81,6 +81,41 @@ def _sv_types(ht: hl.Table) -> hl.ArrayExpression: return ht.alleles[1].replace('[<>]', '').split(':', 2) +def alleles(ht: hl.Table, **_: Any) -> hl.ArrayExpression: + return hl.array( + [ + 'N', + hl.if_else( + ( + hl.is_defined(ht.sv_type_detail_id) + & (hl.array(SV_TYPES)[ht.sv_type_id] != 'CPX') + ), + hl.format( + '<%s:%s>', + hl.array(SV_TYPES)[ht.sv_type_id], + hl.array(SV_TYPE_DETAILS)[ht.sv_type_detail_id], + ), + hl.format('<%s>', hl.array(SV_TYPES)[ht.sv_type_id]), + ), + ], + ) + + +def info(ht: hl.Table, **_: Any) -> hl.StructExpression: + return hl.Struct( + ALGORITHMS=ht.algorithms, + END=ht.start_locus.position, + CHR2=ht.end_locus.contig, + END2=ht.end_locus.position, + SVTYPE=hl.array(SV_TYPES)[ht.sv_type_id], + SVLEN=ht.sv_len, + ) + + +def locus(ht: hl.Table, **_: Any) -> hl.LocusExpression: + return ht.start_locus + + def algorithms(ht: hl.Table, **_: Any) -> hl.Expression: return hl.str(',').join(ht['info.ALGORITHMS']) @@ -205,6 +240,10 @@ def strvctvre(ht: hl.Table, **_: Any) -> hl.Expression: return hl.struct(score=hl.parse_float32(ht['info.StrVCTVRE'])) +def sv_len(ht: hl.Table, **_: Any) -> hl.Expression: + return ht['info.SVLEN'] + + def sv_type_id(ht: hl.Table, **_: Any) -> hl.Expression: return SV_TYPES_LOOKUP[_sv_types(ht)[0]] diff --git a/v03_pipeline/lib/annotations/sv_test.py b/v03_pipeline/lib/annotations/sv_test.py new file mode 100644 index 000000000..53e098069 --- /dev/null +++ b/v03_pipeline/lib/annotations/sv_test.py @@ -0,0 +1,171 @@ +import unittest + +import hail as hl + +from v03_pipeline.lib.annotations.fields import get_fields +from v03_pipeline.lib.model import DatasetType + + +class SVTest(unittest.TestCase): + def test_sv_export_annotations(self) -> None: + ht = hl.Table.parallelize( + [ + hl.Struct( + id=0, + algorithms='manta', + end_locus=hl.Locus( + contig='chr5', + position=20404, + reference_genome='GRCh38', + ), + start_locus=hl.Locus( + contig='chr1', + position=180928, + reference_genome='GRCh38', + ), + sv_len=123, + sv_type_id=2, + sv_type_detail_id=None, + ), + hl.Struct( + id=1, + algorithms='manta', + end_locus=hl.Locus( + contig='chr1', + position=789481, + reference_genome='GRCh38', + ), + start_locus=hl.Locus( + contig='chr1', + position=789481, + reference_genome='GRCh38', + ), + sv_len=245, + sv_type_id=2, + sv_type_detail_id=None, + ), + hl.Struct( + id=2, + algorithms='manta', + end_locus=hl.Locus( + contig='chr1', + position=6559723, + reference_genome='GRCh38', + ), + start_locus=hl.Locus( + contig='chr1', + position=6558902, + reference_genome='GRCh38', + ), + sv_len=245, + sv_type_id=3, + sv_type_detail_id=2, + ), + hl.Struct( + id=3, + algorithms='manta', + end_locus=hl.Locus( + contig='chr1', + position=6559723, + reference_genome='GRCh38', + ), + start_locus=hl.Locus( + contig='chr1', + position=6558902, + reference_genome='GRCh38', + ), + sv_len=245, + sv_type_id=7, + 
sv_type_detail_id=6, + ), + ], + hl.tstruct( + id=hl.tint32, + algorithms=hl.tstr, + end_locus=hl.tlocus('GRCh38'), + start_locus=hl.tlocus('GRCh38'), + sv_len=hl.tint32, + sv_type_id=hl.tint32, + sv_type_detail_id=hl.tint32, + ), + key='id', + ) + ht = ht.select( + **get_fields( + ht, + DatasetType.SV.export_vcf_annotation_fns, + ), + ) + self.assertEqual( + ht.collect(), + [ + hl.Struct( + id=0, + locus=hl.Locus( + contig='chr1', + position=180928, + reference_genome='GRCh38', + ), + alleles=['N', ''], + info=hl.Struct( + ALGORITHMS='manta', + END=180928, + CHR2='chr5', + END2=20404, + SVTYPE='BND', + SVLEN=123, + ), + ), + hl.Struct( + id=1, + locus=hl.Locus( + contig='chr1', + position=789481, + reference_genome='GRCh38', + ), + alleles=['N', ''], + info=hl.Struct( + ALGORITHMS='manta', + END=789481, + CHR2='chr1', + END2=789481, + SVTYPE='BND', + SVLEN=245, + ), + ), + hl.Struct( + id=2, + locus=hl.Locus( + contig='chr1', + position=6558902, + reference_genome='GRCh38', + ), + alleles=['N', ''], + info=hl.Struct( + ALGORITHMS='manta', + END=6558902, + CHR2='chr1', + END2=6559723, + SVTYPE='CPX', + SVLEN=245, + ), + ), + hl.Struct( + id=3, + locus=hl.Locus( + contig='chr1', + position=6558902, + reference_genome='GRCh38', + ), + alleles=['N', ''], + info=hl.Struct( + ALGORITHMS='manta', + END=6558902, + CHR2='chr1', + END2=6559723, + SVTYPE='INS', + SVLEN=245, + ), + ), + ], + ) diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index e7af6983f..704a1eff7 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -116,6 +116,7 @@ def row_fields( 'info.N_HET': hl.tint32, 'info.N_HOMALT': hl.tint32, 'info.StrVCTVRE': hl.tstr, + 'info.SVLEN': hl.tint32, **sv.CONSEQ_PREDICTED_GENE_COLS, }, DatasetType.GCNV: { @@ -239,6 +240,7 @@ def formatting_annotation_fns( sv.strvctvre, sv.sv_type_id, sv.sv_type_detail_id, + sv.sv_len, shared.xpos, ], DatasetType.GCNV: [ @@ -335,3 +337,13 @@ def lookup_table_annotation_fns(self) -> list[Callable[..., hl.Expression]]: @property def should_send_to_allele_registry(self): return self == DatasetType.SNV_INDEL + + @property + def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]: + return { + DatasetType.SV: [ + sv.locus, + sv.alleles, + sv.info, + ], + }[self] diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 7bc1e8c85..39ccdc799 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -1172,6 +1172,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=-1, sv_type_id=2, sv_type_detail_id=None, xpos=1000180928, @@ -1214,6 +1215,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=223225007, sv_type_id=2, sv_type_detail_id=None, xpos=1000789481, @@ -1266,6 +1268,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=821, sv_type_id=3, sv_type_detail_id=2, xpos=1006558902, @@ -1321,6 +1324,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=534718, sv_type_id=3, sv_type_detail_id=9, xpos=1180540234, @@ -1373,6 +1377,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + 
sv_len=841, sv_type_id=3, sv_type_detail_id=12, xpos=1016088760, @@ -1430,6 +1435,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=52921, sv_type_id=3, sv_type_detail_id=13, xpos=1021427498, @@ -1471,6 +1477,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=14532, sv_type_id=5, sv_type_detail_id=None, xpos=1000413968, @@ -1508,6 +1515,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=6000, sv_type_id=6, sv_type_detail_id=None, xpos=1000257666, @@ -1549,6 +1557,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=955, sv_type_id=7, sv_type_detail_id=6, xpos=1017465707, @@ -1593,6 +1602,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=hl.eval(hl.float32(0.1255))), + sv_len=298, sv_type_id=7, sv_type_detail_id=4, xpos=1004228405, @@ -1634,6 +1644,7 @@ def test_sv_update_vat( reference_genome='GRCh38', ), strvctvre=hl.Struct(score=None), + sv_len=5520, sv_type_id=7, sv_type_detail_id=5, xpos=1048963084, From 5f1d2839bfc16e120ede926f7fec9ab92730ba1f Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 1 Aug 2024 14:14:06 -0400 Subject: [PATCH 5/9] Add a task to export the SV annotations table to VCF. (#858) * Export VCF task * Fix test * lint --- v03_pipeline/lib/model/dataset_type.py | 4 + v03_pipeline/lib/paths.py | 14 ++++ .../tasks/write_variant_annotations_vcf.py | 40 ++++++++++ .../write_variant_annotations_vcf_test.py | 76 +++++++++++++++++++ 4 files changed, 134 insertions(+) create mode 100644 v03_pipeline/lib/tasks/write_variant_annotations_vcf.py create mode 100644 v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py diff --git a/v03_pipeline/lib/model/dataset_type.py b/v03_pipeline/lib/model/dataset_type.py index 704a1eff7..51ed76b08 100644 --- a/v03_pipeline/lib/model/dataset_type.py +++ b/v03_pipeline/lib/model/dataset_type.py @@ -338,6 +338,10 @@ def lookup_table_annotation_fns(self) -> list[Callable[..., hl.Expression]]: def should_send_to_allele_registry(self): return self == DatasetType.SNV_INDEL + @property + def should_export_to_vcf(self): + return self == DatasetType.SV + @property def export_vcf_annotation_fns(self) -> list[Callable[..., hl.Expression]]: return { diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 628470212..3dc0ae179 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -269,6 +269,20 @@ def variant_annotations_table_path( ) +def variant_annotations_vcf_path( + reference_genome: ReferenceGenome, + dataset_type: DatasetType, +) -> str: + return os.path.join( + _v03_pipeline_prefix( + Env.HAIL_SEARCH_DATA, + reference_genome, + dataset_type, + ), + 'annotations.vcf.bgz', + ) + + def new_variants_table_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, diff --git a/v03_pipeline/lib/tasks/write_variant_annotations_vcf.py b/v03_pipeline/lib/tasks/write_variant_annotations_vcf.py new file mode 100644 index 000000000..8c5d0e4f4 --- /dev/null +++ b/v03_pipeline/lib/tasks/write_variant_annotations_vcf.py @@ -0,0 +1,40 @@ +import hail as hl +import luigi + +from v03_pipeline.lib.annotations.fields import get_fields +from v03_pipeline.lib.paths import variant_annotations_vcf_path +from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask +from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams +from 
v03_pipeline.lib.tasks.base.base_update_variant_annotations_table import ( + BaseUpdateVariantAnnotationsTableTask, +) +from v03_pipeline.lib.tasks.files import GCSorLocalTarget + + +@luigi.util.inherits(BaseLoadingRunParams) +class WriteVariantAnnotationsVCF(BaseHailTableTask): + def output(self) -> luigi.Target: + return GCSorLocalTarget( + variant_annotations_vcf_path( + self.reference_genome, + self.dataset_type, + ), + ) + + def complete(self) -> bool: + return not self.dataset_type.should_export_to_vcf + + def requires(self) -> luigi.Task: + return self.clone(BaseUpdateVariantAnnotationsTableTask, force=False) + + def run(self) -> None: + ht = hl.read_table(self.input().path) + ht = ht.annotate( + **get_fields( + ht, + self.dataset_type.export_vcf_annotation_fns, + **self.param_kwargs, + ), + ) + ht = ht.key_by('locus', 'alleles') + hl.export_vcf(ht, self.output().path, tabix=True) diff --git a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py new file mode 100644 index 000000000..d96096be0 --- /dev/null +++ b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py @@ -0,0 +1,76 @@ +from unittest.mock import Mock, patch + +import hailtop.fs as hfs +import luigi.worker + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import ( + UpdateVariantAnnotationsTableWithNewSamplesTask, +) +from v03_pipeline.lib.tasks.write_variant_annotations_vcf import ( + WriteVariantAnnotationsVCF, +) +from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase + +TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' +TEST_PEDIGREE_5 = 'v03_pipeline/var/test/pedigrees/test_pedigree_5.tsv' + +GENE_ID_MAPPING = { + 'OR4F5': 'ENSG00000186092', + 'PLEKHG4B': 'ENSG00000153404', + 'OR4F16': 'ENSG00000186192', + 'OR4F29': 'ENSG00000284733', + 'FBXO28': 'ENSG00000143756', + 'SAMD11': 'ENSG00000187634', + 'C1orf174': 'ENSG00000198912', + 'TAS1R1': 'ENSG00000173662', + 'FAM131C': 'ENSG00000185519', + 'RCC2': 'ENSG00000179051', + 'NBPF3': 'ENSG00000142794', + 'AGBL4': 'ENSG00000186094', + 'KIAA1614': 'ENSG00000135835', + 'MR1': 'ENSG00000153029', + 'STX6': 'ENSG00000135823', + 'XPR1': 'ENSG00000143324', +} + + +class WriteVariantAnnotationsVCFTest(MockedDatarootTestCase): + @patch( + 'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_gene_symbol_to_gene_id', + ) + def test_sv_export_vcf( + self, + mock_load_gencode: Mock, + ) -> None: + mock_load_gencode.return_value = GENE_ID_MAPPING + worker = luigi.worker.Worker() + update_variant_annotations_task = ( + UpdateVariantAnnotationsTableWithNewSamplesTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SV, + sample_type=SampleType.WGS, + callset_path=TEST_SV_VCF, + project_guids=['R0115_test_project2'], + project_remap_paths=['not_a_real_file'], + project_pedigree_paths=[TEST_PEDIGREE_5], + skip_validation=True, + run_id='run_id1', + ) + ) + worker.add(update_variant_annotations_task) + worker.run() + self.assertTrue(update_variant_annotations_task.complete()) + write_variant_annotations_vcf_task = WriteVariantAnnotationsVCF( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SV, + sample_type=SampleType.WGS, + callset_path=TEST_SV_VCF, + ) + worker.add(write_variant_annotations_vcf_task) + worker.run() + self.assertTrue( + hfs.exists( + write_variant_annotations_vcf_task.output().path, + ), + ) From 
bf4b62c6ccbfebf32e2b86d53d722658dc1c3867 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 2 Aug 2024 14:46:50 -0400 Subject: [PATCH 6/9] Resolve the assumption in the pipeline that remap/pedigree files are immutable. (#856) * add remap_pedigree hash * add func * all the imports * ruff * Fix it * support missing remap * ruff * ruff * ruff * tweak the type * tweak the type * Fix test * ruff * add remap pedigree hash * Explicit int32 * lint * Update io.py * ruff * lint * hash * Flappy test * wrong pedigree * bad colon * finish tests * add a test * add pedigree * Fix test --- v03_pipeline/lib/misc/io.py | 13 +++ v03_pipeline/lib/misc/io_test.py | 14 +++ .../tasks/base/base_update_lookup_table.py | 8 +- .../tasks/base/base_update_project_table.py | 4 +- .../base_update_variant_annotations_table.py | 8 +- .../delete_project_family_tables_test.py | 7 +- .../lib/tasks/migrate_lookup_table.py | 8 +- .../migrate_variant_annotations_table.py | 8 +- v03_pipeline/lib/tasks/update_lookup_table.py | 15 ++- .../lib/tasks/update_lookup_table_test.py | 17 ++- ...lookup_table_with_deleted_families_test.py | 24 +++- ..._lookup_table_with_deleted_project_test.py | 20 +++- .../lib/tasks/update_project_table.py | 19 ++- .../lib/tasks/update_project_table_test.py | 108 +++++++++++++++++- ...ations_table_with_deleted_families_test.py | 36 +++++- ...tations_table_with_deleted_project_test.py | 34 +++++- ...iant_annotations_table_with_new_samples.py | 18 ++- ...annotations_table_with_new_samples_test.py | 34 ++++++ .../lib/tasks/write_new_variants_table.py | 18 ++- .../tasks/write_project_family_tables_test.py | 30 +++-- .../write_remapped_and_subsetted_callset.py | 17 ++- ...ite_remapped_and_subsetted_callset_test.py | 13 +++ .../0002_add_remap_pedigree_hash.py | 28 +++++ .../lookup/0002_add_remap_pedigree_hash.py | 26 +++++ .../test_pedigree_3_different_families.tsv | 3 + 25 files changed, 477 insertions(+), 53 deletions(-) create mode 100644 v03_pipeline/migrations/annotations/0002_add_remap_pedigree_hash.py create mode 100644 v03_pipeline/migrations/lookup/0002_add_remap_pedigree_hash.py create mode 100644 v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py index c420c84e9..599e791b8 100644 --- a/v03_pipeline/lib/misc/io.py +++ b/v03_pipeline/lib/misc/io.py @@ -1,8 +1,10 @@ +import hashlib import math import os import uuid import hail as hl +import hailtop.fs as hfs from v03_pipeline.lib.misc.gcnv import parse_gcnv_genes from v03_pipeline.lib.misc.nested_field import parse_nested_field @@ -200,6 +202,17 @@ def import_pedigree(pedigree_path: str) -> hl.Table: ) +def remap_pedigree_hash(remap_path: str, pedigree_path: str) -> hl.Int32Expression: + sha256 = hashlib.sha256() + if hfs.exists(remap_path): + with hfs.open(remap_path) as f1: + sha256.update(f1.read().encode('utf8')) + with hfs.open(pedigree_path) as f2: + sha256.update(f2.read().encode('utf8')) + # maximum 4 byte int + return hl.int32(int(sha256.hexdigest()[:8], 16)) + + def checkpoint(t: hl.Table | hl.MatrixTable) -> tuple[hl.Table | hl.MatrixTable, str]: suffix = 'mt' if isinstance(t, hl.MatrixTable) else 'ht' read_fn = hl.read_matrix_table if isinstance(t, hl.MatrixTable) else hl.read_table diff --git a/v03_pipeline/lib/misc/io_test.py b/v03_pipeline/lib/misc/io_test.py index 2955557c1..ab0638d8c 100644 --- a/v03_pipeline/lib/misc/io_test.py +++ b/v03_pipeline/lib/misc/io_test.py @@ -6,13 +6,16 @@ compute_hail_n_partitions, file_size_bytes, 
import_imputed_sex, + remap_pedigree_hash, ) TEST_IMPUTED_SEX = 'v03_pipeline/var/test/sex_check/test_imputed_sex.tsv' TEST_IMPUTED_SEX_UNEXPECTED_VALUE = ( 'v03_pipeline/var/test/sex_check/test_imputed_sex_unexpected_value.tsv' ) +TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' TEST_MITO_MT = 'v03_pipeline/var/test/callsets/mito_1.mt' +TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_SV_VCF = 'v03_pipeline/var/test/callsets/sv_1.vcf' @@ -46,3 +49,14 @@ def test_import_imputed_sex_unexpected_value(self) -> None: 'Found unexpected value Unknown in imputed sex file', ht.collect, ) + + def test_remap_pedigree_hash(self) -> None: + self.assertEqual( + hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), + -560434714, + ) diff --git a/v03_pipeline/lib/tasks/base/base_update_lookup_table.py b/v03_pipeline/lib/tasks/base/base_update_lookup_table.py index 83f13be38..2c32eb507 100644 --- a/v03_pipeline/lib/tasks/base/base_update_lookup_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_lookup_table.py @@ -36,6 +36,12 @@ def initialize_table(self) -> hl.Table: globals=hl.Struct( project_guids=hl.empty_array(hl.tstr), project_families=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), - updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + updates=hl.empty_set( + hl.tstruct( + callset=hl.tstr, + project_guid=hl.tstr, + remap_pedigree_hash=hl.tint32, + ), + ), ), ) diff --git a/v03_pipeline/lib/tasks/base/base_update_project_table.py b/v03_pipeline/lib/tasks/base/base_update_project_table.py index 0040b6ad9..ad7ee0d1e 100644 --- a/v03_pipeline/lib/tasks/base/base_update_project_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_project_table.py @@ -32,6 +32,8 @@ def initialize_table(self) -> hl.Table: globals=hl.Struct( family_guids=hl.empty_array(hl.tstr), family_samples=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), - updates=hl.empty_set(hl.tstr), + updates=hl.empty_set( + hl.tstruct(callset=hl.tstr, remap_pedigree_hash=hl.tint32), + ), ), ) diff --git a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py index 2fcce0f45..7505378d1 100644 --- a/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_variant_annotations_table.py @@ -66,7 +66,13 @@ def initialize_table(self) -> hl.Table: paths=hl.Struct(), versions=hl.Struct(), enums=hl.Struct(), - updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + updates=hl.empty_set( + hl.tstruct( + callset=hl.tstr, + project_guid=hl.tstr, + remap_pedigree_hash=hl.tint32, + ), + ), migrations=hl.empty_array(hl.tstr), ), ) diff --git a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py index 3cb56f1c8..75f11987a 100644 --- a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py @@ -115,7 +115,12 @@ def setUp(self) -> None: family_guids=['family_a', 'family_b'], family_samples={'family_a': ['1', '2', '3'], 'family_b': ['4']}, sample_type='WGS', - updates={'v03_pipeline/var/test/callsets/1kg_30variants.vcf'}, + updates={ + hl.Struct( + callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + remap_pedigree_hash=123, + ), + }, ), ) ht.write( diff --git a/v03_pipeline/lib/tasks/migrate_lookup_table.py b/v03_pipeline/lib/tasks/migrate_lookup_table.py index 
bd7ce0d40..4fe9d7ac4 100644 --- a/v03_pipeline/lib/tasks/migrate_lookup_table.py +++ b/v03_pipeline/lib/tasks/migrate_lookup_table.py @@ -43,6 +43,12 @@ def initialize_table(self) -> hl.Table: globals=hl.Struct( project_guids=hl.empty_array(hl.tstr), project_families=hl.empty_dict(hl.tstr, hl.tarray(hl.tstr)), - updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + updates=hl.empty_set( + hl.tstruct( + callset=hl.tstr, + project_guid=hl.tstr, + remap_pedigree_hash=hl.tint32, + ), + ), ), ) diff --git a/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py b/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py index fd8bec321..44e3debde 100644 --- a/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py +++ b/v03_pipeline/lib/tasks/migrate_variant_annotations_table.py @@ -32,7 +32,13 @@ def initialize_table(self) -> hl.Table: paths=hl.Struct(), versions=hl.Struct(), enums=hl.Struct(), - updates=hl.empty_set(hl.tstruct(callset=hl.tstr, project_guid=hl.tstr)), + updates=hl.empty_set( + hl.tstruct( + callset=hl.tstr, + project_guid=hl.tstr, + remap_pedigree_hash=hl.tint32, + ), + ), migrations=hl.empty_array(hl.tstr), ), ) diff --git a/v03_pipeline/lib/tasks/update_lookup_table.py b/v03_pipeline/lib/tasks/update_lookup_table.py index eb04d1d48..3f9fc4b46 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table.py +++ b/v03_pipeline/lib/tasks/update_lookup_table.py @@ -2,6 +2,7 @@ import luigi import luigi.util +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.misc.lookup import ( compute_callset_lookup_ht, join_lookup_hts, @@ -35,9 +36,13 @@ def complete(self) -> bool: hl.Struct( callset=self.callset_path, project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), ), ) - for project_guid in self.project_guids + for i, project_guid in enumerate(self.project_guids) ], ), hl.read_table(self.output().path).updates, @@ -76,6 +81,10 @@ def update_table(self, ht: hl.Table) -> hl.Table: hl.Struct( callset=self.callset_path, project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), ), ), ) @@ -102,6 +111,10 @@ def update_table(self, ht: hl.Table) -> hl.Table: hl.Struct( callset=self.callset_path, project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), ), ), ) diff --git a/v03_pipeline/lib/tasks/update_lookup_table_test.py b/v03_pipeline/lib/tasks/update_lookup_table_test.py index 8551d873e..073627fa6 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_test.py @@ -1,6 +1,7 @@ import hail as hl import luigi.worker +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.tasks.update_lookup_table import ( UpdateLookupTableTask, @@ -39,7 +40,13 @@ def test_skip_update_lookup_table_task(self) -> None: project_guids=[], project_families={}, updates={ - hl.Struct(callset=TEST_VCF, project_guid='R0555_seqr_demo'), + hl.Struct( + callset=TEST_VCF, + project_guid='R0555_seqr_demo', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3), + ), + ), }, ), ], @@ -70,7 +77,13 @@ def test_update_lookup_table_task(self) -> None: project_guids=['R0113_test_project'], project_families={'R0113_test_project': ['abc_1']}, updates={ 
- hl.Struct(callset=TEST_VCF, project_guid='R0113_test_project'), + hl.Struct( + callset=TEST_VCF, + project_guid='R0113_test_project', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3), + ), + ), }, ), ], diff --git a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py index 70915ef9d..5b103d453 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_families_test.py @@ -123,8 +123,16 @@ def test_delete_project( project_guids=['project_a', 'project_b'], project_families={'project_a': ['1', '2', '3'], 'project_b': ['4']}, updates={ - hl.Struct(project_guid='project_a', callset='abc'), - hl.Struct(project_guid='project_b', callset='abc'), + hl.Struct( + project_guid='project_a', + callset='abc', + remap_pedigree_hash=123, + ), + hl.Struct( + project_guid='project_b', + callset='abc', + remap_pedigree_hash=123, + ), }, ), ) @@ -147,8 +155,16 @@ def test_delete_project( project_guids=['project_a', 'project_b'], project_families={'project_a': ['2'], 'project_b': ['4']}, updates={ - hl.Struct(project_guid='project_a', callset='abc'), - hl.Struct(project_guid='project_b', callset='abc'), + hl.Struct( + project_guid='project_a', + callset='abc', + remap_pedigree_hash=123, + ), + hl.Struct( + project_guid='project_b', + callset='abc', + remap_pedigree_hash=123, + ), }, ), ], diff --git a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py index e40e034ec..c1f112631 100644 --- a/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py +++ b/v03_pipeline/lib/tasks/update_lookup_table_with_deleted_project_test.py @@ -122,8 +122,16 @@ def test_delete_project( project_guids=['project_a', 'project_b'], project_families={'project_a': ['1', '2', '3'], 'project_b': ['4']}, updates={ - hl.Struct(project_guid='project_a', callset='abc'), - hl.Struct(project_guid='project_b', callset='abc'), + hl.Struct( + project_guid='project_a', + callset='abc', + remap_pedigree_hash=123, + ), + hl.Struct( + project_guid='project_b', + callset='abc', + remap_pedigree_hash=123, + ), }, ), ) @@ -144,7 +152,13 @@ def test_delete_project( hl.Struct( project_guids=['project_b'], project_families={'project_b': ['4']}, - updates={hl.Struct(project_guid='project_b', callset='abc')}, + updates={ + hl.Struct( + project_guid='project_b', + callset='abc', + remap_pedigree_hash=123, + ), + }, ), ], ) diff --git a/v03_pipeline/lib/tasks/update_project_table.py b/v03_pipeline/lib/tasks/update_project_table.py index c7ea539e4..a2ca742ba 100644 --- a/v03_pipeline/lib/tasks/update_project_table.py +++ b/v03_pipeline/lib/tasks/update_project_table.py @@ -8,6 +8,7 @@ join_family_entries_hts, remove_family_guids, ) +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.tasks.base.base_loading_run_params import BaseLoadingRunParams from v03_pipeline.lib.tasks.base.base_update_project_table import ( BaseUpdateProjectTableTask, @@ -28,7 +29,13 @@ def complete(self) -> bool: and super().complete() and hl.eval( hl.read_table(self.output().path).updates.contains( - self.callset_path, + hl.Struct( + callset=self.callset_path, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_path, + self.project_pedigree_path, + ), + ), ), ) ) @@ -62,5 +69,13 @@ def update_table(self, ht: hl.Table) -> 
hl.Table: family_guids=ht.family_guids, family_samples=ht.family_samples, sample_type=self.sample_type.value, - updates=ht.updates.add(self.callset_path), + updates=ht.updates.add( + hl.Struct( + callset=self.callset_path, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_path, + self.project_pedigree_path, + ), + ), + ), ) diff --git a/v03_pipeline/lib/tasks/update_project_table_test.py b/v03_pipeline/lib/tasks/update_project_table_test.py index c0a5a4e57..4dba40590 100644 --- a/v03_pipeline/lib/tasks/update_project_table_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_test.py @@ -1,6 +1,7 @@ import hail as hl import luigi.worker +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.tasks.update_project_table import UpdateProjectTableTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase @@ -8,6 +9,9 @@ TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' TEST_REMAP = 'v03_pipeline/var/test/remaps/test_remap_1.tsv' TEST_PEDIGREE_3 = 'v03_pipeline/var/test/pedigrees/test_pedigree_3.tsv' +TEST_PEDIGREE_3_DIFFERENT_FAMILIES = ( + 'v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv' +) class UpdateProjectTableTaskTest(MockedDatarootTestCase): @@ -36,7 +40,17 @@ def test_update_project_table_task(self) -> None: 'abc_1': ['HG00731_1', 'HG00732_1', 'HG00733_1'], }, sample_type=SampleType.WGS.value, - updates={'v03_pipeline/var/test/callsets/1kg_30variants.vcf'}, + updates={ + hl.Struct( + callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), + ), + }, ), ], ) @@ -108,3 +122,95 @@ def test_update_project_table_task(self) -> None: ), ], ) + + def test_update_project_table_task_different_pedigree(self) -> None: + worker = luigi.worker.Worker() + upt_task = UpdateProjectTableTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_VCF, + project_guid='R0113_test_project', + project_remap_path=TEST_REMAP, + project_pedigree_path=TEST_PEDIGREE_3, + skip_validation=True, + ) + worker.add(upt_task) + worker.run() + upt_task = UpdateProjectTableTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WGS, + callset_path=TEST_VCF, + project_guid='R0113_test_project', + project_remap_path=TEST_REMAP, + project_pedigree_path=TEST_PEDIGREE_3_DIFFERENT_FAMILIES, + skip_validation=True, + ) + worker.add(upt_task) + worker.run() + self.assertTrue(upt_task.complete()) + worker.add(upt_task) + worker.run() + ht = hl.read_table(upt_task.output().path) + self.assertCountEqual( + ht.globals.collect(), + [ + hl.Struct( + family_guids=['abc_1'], + family_samples={ + 'abc_1': ['HG00731_1', 'HG00733_1'], + }, + sample_type=SampleType.WGS.value, + updates={ + hl.Struct( + callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), + ), + hl.Struct( + callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3_DIFFERENT_FAMILIES, + ), + ), + ), + }, + ), + ], + ) + + self.assertCountEqual( + ht.collect()[0], + hl.Struct( + locus=hl.Locus( + contig='chr1', + position=876499, + reference_genome='GRCh38', + ), + 
alleles=['A', 'G'], + filters=set(), + family_entries=[ + [ + hl.Struct( + GQ=21, + AB=1.0, + DP=7, + GT=hl.Call(alleles=[1, 1], phased=False), + ), + hl.Struct( + GQ=12, + AB=1.0, + DP=4, + GT=hl.Call(alleles=[1, 1], phased=False), + ), + ], + ], + ), + ) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py index 67410ef18..3d0c9df8b 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_families_test.py @@ -85,8 +85,16 @@ def setUp(self) -> None: project_guids=['project_a', 'project_b'], project_families={'project_a': ['1', '2', '3'], 'project_b': ['4']}, updates={ - hl.Struct(callset='abc', project_guid='project_a'), - hl.Struct(callset='123', project_guid='project_b'), + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + hl.Struct( + callset='123', + project_guid='project_b', + remap_pedigree_hash=123, + ), }, ), ) @@ -123,8 +131,16 @@ def setUp(self) -> None: key='id', globals=hl.Struct( updates={ - hl.Struct(callset='abc', project_guid='project_a'), - hl.Struct(callset='123', project_guid='project_b'), + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + hl.Struct( + callset='123', + project_guid='project_b', + remap_pedigree_hash=123, + ), }, ), ) @@ -151,8 +167,16 @@ def test_update_annotations_with_deleted_project(self) -> None: [ hl.Struct( updates={ - hl.Struct(callset='abc', project_guid='project_a'), - hl.Struct(callset='123', project_guid='project_b'), + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + hl.Struct( + callset='123', + project_guid='project_b', + remap_pedigree_hash=123, + ), }, ), ], diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py index 295a9577b..c144bffec 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_deleted_project_test.py @@ -93,8 +93,16 @@ def setUp(self) -> None: project_guids=['project_a', 'project_b'], project_families={'project_a': ['1', '2', '3'], 'project_b': ['4']}, updates={ - hl.Struct(callset='abc', project_guid='project_a'), - hl.Struct(callset='123', project_guid='project_b'), + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + hl.Struct( + callset='123', + project_guid='project_b', + remap_pedigree_hash=123, + ), }, ), ) @@ -131,8 +139,16 @@ def setUp(self) -> None: key='id', globals=hl.Struct( updates={ - hl.Struct(callset='abc', project_guid='project_a'), - hl.Struct(callset='123', project_guid='project_b'), + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + hl.Struct( + callset='123', + project_guid='project_b', + remap_pedigree_hash=123, + ), }, ), ) @@ -156,7 +172,15 @@ def test_update_annotations_with_deleted_project(self) -> None: self.assertEqual( ht.globals.collect(), [ - hl.Struct(updates={hl.Struct(callset='abc', project_guid='project_a')}), + hl.Struct( + updates={ + hl.Struct( + callset='abc', + project_guid='project_a', + remap_pedigree_hash=123, + ), + }, + ), ], ) self.assertEqual( diff --git 
a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py index 5ba1448c9..ef6442f22 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples.py @@ -4,6 +4,7 @@ from v03_pipeline.lib.annotations.fields import get_fields from v03_pipeline.lib.misc.callsets import get_callset_ht +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.paths import ( lookup_table_path, new_variants_table_path, @@ -42,9 +43,13 @@ def complete(self) -> bool: hl.Struct( callset=self.callset_path, project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), ), ) - for project_guid in self.project_guids + for i, project_guid in enumerate(self.project_guids) ], ), hl.read_table(self.output().path).updates, @@ -96,8 +101,15 @@ def update_table(self, ht: hl.Table) -> hl.Table: return ht.annotate_globals( updates=ht.updates.union( { - hl.Struct(callset=self.callset_path, project_guid=project_guid) - for project_guid in self.project_guids + hl.Struct( + callset=self.callset_path, + project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), + ) + for i, project_guid in enumerate(self.project_guids) }, ), ) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 39ccdc799..b8d53b983 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -19,6 +19,7 @@ SV_TYPES, TRANSCRIPT_CONSEQUENCE_TERMS, ) +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.misc.validation import validate_expected_contig_frequency from v03_pipeline.lib.model import ( CachedReferenceDatasetQuery, @@ -400,6 +401,9 @@ def test_multiple_update_vat( hl.Struct( callset=TEST_SNV_INDEL_VCF, project_guid='R0113_test_project', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash(TEST_REMAP, TEST_PEDIGREE_3), + ), ), }, ], @@ -550,10 +554,22 @@ def test_multiple_update_vat( hl.Struct( callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', project_guid='R0113_test_project', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), ), hl.Struct( callset='v03_pipeline/var/test/callsets/1kg_30variants.vcf', project_guid='R0114_project4', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_4, + ), + ), ), }, paths=hl.Struct( @@ -883,6 +899,12 @@ def test_mito_update_vat( hl.Struct( callset='v03_pipeline/var/test/callsets/mito_1.mt', project_guid='R0115_test_project2', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + 'not_a_real_file', + TEST_PEDIGREE_5, + ), + ), ), }, ), @@ -1126,6 +1148,12 @@ def test_sv_update_vat( hl.Struct( callset=TEST_SV_VCF, project_guid='R0115_test_project2', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + 'not_a_real_file', + TEST_PEDIGREE_5, + ), + ), ), }, ), @@ -1698,6 +1726,12 @@ def test_gcnv_update_vat( hl.Struct( callset=TEST_GCNV_BED_FILE, project_guid='R0115_test_project2', + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + 'not_a_real_file', + TEST_PEDIGREE_5, + ), + ), ), 
}, ), diff --git a/v03_pipeline/lib/tasks/write_new_variants_table.py b/v03_pipeline/lib/tasks/write_new_variants_table.py index b70dc2a6f..9f99d1549 100644 --- a/v03_pipeline/lib/tasks/write_new_variants_table.py +++ b/v03_pipeline/lib/tasks/write_new_variants_table.py @@ -10,6 +10,7 @@ ) from v03_pipeline.lib.misc.allele_registry import register_alleles_in_chunks from v03_pipeline.lib.misc.callsets import get_callset_ht +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.misc.math import constrain from v03_pipeline.lib.model import Env, ReferenceDatasetCollection from v03_pipeline.lib.paths import ( @@ -128,9 +129,13 @@ def complete(self) -> bool: hl.Struct( callset=self.callset_path, project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), ), ) - for project_guid in self.project_guids + for i, project_guid in enumerate(self.project_guids) ], ), hl.read_table(self.output().path).updates, @@ -218,7 +223,14 @@ def create_table(self) -> hl.Table: return new_variants_ht.select_globals( updates={ - hl.Struct(callset=self.callset_path, project_guid=project_guid) - for project_guid in self.project_guids + hl.Struct( + callset=self.callset_path, + project_guid=project_guid, + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_paths[i], + self.project_pedigree_paths[i], + ), + ) + for i, project_guid in enumerate(self.project_guids) }, ) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index 074d0a294..09ac0b030 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -87,22 +87,20 @@ def test_snv_write_project_family_tables_task(self) -> None: DatasetType.SNV_INDEL, 'R0113_test_project', ), - ).family_guids.collect(), + ).family_guids.collect()[0], [ - [ - '123_1', - '234_1', - '345_1', - '456_1', - '567_1', - '678_1', - '789_1', - '890_1', - '901_1', - 'bcd_1', - 'cde_1', - 'def_1', - 'efg_1', - ], + '123_1', + '234_1', + '345_1', + '456_1', + '567_1', + '678_1', + '789_1', + '890_1', + '901_1', + 'bcd_1', + 'cde_1', + 'def_1', + 'efg_1', ], ) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py index 609c3b014..b1e4bc547 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset.py @@ -12,6 +12,7 @@ does_file_exist, import_pedigree, import_remap, + remap_pedigree_hash, ) from v03_pipeline.lib.misc.pedigree import parse_pedigree_ht_to_families from v03_pipeline.lib.misc.sample_ids import remap_sample_ids, subset_samples @@ -36,7 +37,17 @@ class WriteRemappedAndSubsettedCallsetTask(BaseWriteTask): project_pedigree_path = luigi.Parameter() def complete(self) -> luigi.Target: - return not self.force and super().complete() + return ( + not self.force + and super().complete() + and hl.eval( + hl.read_matrix_table(self.output().path).globals.remap_pedigree_hash + == remap_pedigree_hash( + self.project_remap_path, + self.project_pedigree_path, + ), + ) + ) def output(self) -> luigi.Target: return GCSorLocalTarget( @@ -145,6 +156,10 @@ def create_table(self) -> hl.MatrixTable: if field not in self.dataset_type.row_fields: mt = mt.drop(field) return mt.select_globals( + remap_pedigree_hash=remap_pedigree_hash( + self.project_remap_path, + 
self.project_pedigree_path, + ), family_samples=( { f.family_guid: sorted(f.samples.keys()) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 48f2b481a..ad4cd3640 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -4,6 +4,7 @@ import hail as hl import luigi.worker +from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.paths import relatedness_check_table_path, sex_check_table_path from v03_pipeline.lib.tasks.write_remapped_and_subsetted_callset import ( @@ -94,6 +95,12 @@ def test_write_remapped_and_subsetted_callset_task( mt.globals.collect(), [ hl.Struct( + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_3, + ), + ), failed_family_samples=hl.Struct( missing_samples={}, relatedness_check={}, @@ -131,6 +138,12 @@ def test_write_remapped_and_subsetted_callset_task_failed_sex_check_family( mt.globals.collect(), [ hl.Struct( + remap_pedigree_hash=hl.eval( + remap_pedigree_hash( + TEST_REMAP, + TEST_PEDIGREE_4, + ), + ), family_samples={ '123_1': ['NA19675_1'], '234_1': ['NA19678_1'], diff --git a/v03_pipeline/migrations/annotations/0002_add_remap_pedigree_hash.py b/v03_pipeline/migrations/annotations/0002_add_remap_pedigree_hash.py new file mode 100644 index 000000000..89d900542 --- /dev/null +++ b/v03_pipeline/migrations/annotations/0002_add_remap_pedigree_hash.py @@ -0,0 +1,28 @@ +import hail as hl + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.model import DatasetType, ReferenceGenome + + +class AddRemapPedigreeHash(BaseMigration): + @property + def reference_genome_dataset_types() -> ( + frozenset[tuple[ReferenceGenome, DatasetType]] + ): + return frozenset( + ( + (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.MITO), + (ReferenceGenome.GRCh38, DatasetType.GCNV), + (ReferenceGenome.GRCh38, DatasetType.SV), + ), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + return ht.annotate_globals( + updates=ht.globals.updates.map( + lambda u: u.annotate(remap_pedigree_hash=hl.missing(hl.tint32)), + ), + ) diff --git a/v03_pipeline/migrations/lookup/0002_add_remap_pedigree_hash.py b/v03_pipeline/migrations/lookup/0002_add_remap_pedigree_hash.py new file mode 100644 index 000000000..a0815249b --- /dev/null +++ b/v03_pipeline/migrations/lookup/0002_add_remap_pedigree_hash.py @@ -0,0 +1,26 @@ +import hail as hl + +from v03_pipeline.lib.migration.base_migration import BaseMigration +from v03_pipeline.lib.model import DatasetType, ReferenceGenome + + +class AddRemapPedigreeHash(BaseMigration): + @property + def reference_genome_dataset_types() -> ( + frozenset[tuple[ReferenceGenome, DatasetType]] + ): + return frozenset( + ( + (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL), + (ReferenceGenome.GRCh38, DatasetType.MITO), + ), + ) + + @staticmethod + def migrate(ht: hl.Table) -> hl.Table: + return ht.annotate_globals( + updates=ht.globals.updates.map( + lambda u: u.annotate(remap_pedigree_hash=hl.missing(hl.tint32)), + ), + ) diff --git a/v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv 
b/v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv new file mode 100644 index 000000000..00e73ff2e --- /dev/null +++ b/v03_pipeline/var/test/pedigrees/test_pedigree_3_different_families.tsv @@ -0,0 +1,3 @@ +Project_GUID Family_GUID Family_ID Individual_ID Paternal_ID Maternal_ID Sex +R0113_test_project abc_1 abc HG00731_1 F +R0113_test_project abc_1 abc HG00733_1 HG00732_1 HG00731_1 F From 416eb5e962c0b0dbe51c72f396c60614c83c7aa7 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 7 Aug 2024 14:31:30 -0400 Subject: [PATCH 7/9] migration --- migration.py | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) create mode 100644 migration.py diff --git a/migration.py b/migration.py new file mode 100644 index 000000000..9f348c591 --- /dev/null +++ b/migration.py @@ -0,0 +1,96 @@ +import subprocess + +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType + +MIGRATIONS = [ + ( + DatasetType.GCNV, + ReferenceGenome.GRCh38, + ), + ( + DatasetType.SV, + ReferenceGenome.GRCh38, + ), + ( + DatasetType.MITO, + ReferenceGenome.GRCh38, + ), + ( + DatasetType.SNV_INDEL, + ReferenceGenome.GRCh37, + ), + ( + DatasetType.SNV_INDEL, + ReferenceGenome.GRCh38, + ), +] + +PROJECT_LOOKUP = dict(x.split('\t') for x in open('project_sample_types.txt').read().split('\n') if x) +FAMILY_LOOKUP = dict(x.split('\t') for x in open('family_sample_types.txt').read().split('\n') if x) + +for dataset_type, reference_genome in MIGRATIONS: + if dataset_type == DatasetType.MITO or dataset_type == DatasetType.SV: + sample_type = SampleType.WGS + elif dataset_type == DatasetType.GCNV: + sample_type = SampleType.WES + else: + sample_type = None + + # Projects + project_guids = [] + path = f'gs://seqr-hail-search-data/v03/{reference_genome.value}/{dataset_type.value}/projects/' + r = subprocess.run( + ['gsutil', 'ls', path], + capture_output=True, + text=True, + check=True, + ) + if 'matched no objects' in r.stderr: + continue + for line in r.stdout.strip().split('\n'): + if line.endswith('projects/'): + continue + project_guids.append(line.split('/')[-2].replace('.ht','')) + for project_guid in project_guids: + try: + project_sample_type = sample_type if sample_type else SampleType(PROJECT_LOOKUP[project_guid]) + except KeyError: + print('Skipping Project Guid', project_guid) + continue + #r = subprocess.run( + # ['gsutil', 'cp', '-r' , f'{path}{project_guid}.ht', f'gs://seqr-hail-search-data/v3.1/{reference_genome.value}/{dataset_type.value}/projects/{project_sample_type.value}/{project_guid}.ht'], + # capture_output=True, + # text=True, + # check=True, + #) + print(['gsutil', 'cp', '-r' , f'{path}{project_guid}.ht', f'gs://seqr-hail-search-data/v3.1/{reference_genome.value}/{dataset_type.value}/projects/{project_sample_type.value}/{project_guid}.ht']) + + + # Families + family_guids = [] + path = f'gs://seqr-hail-search-data/v03/{reference_genome.value}/{dataset_type.value}/families/' + r = subprocess.run( + ['gsutil', 'ls', path], + capture_output=True, + text=True, + check=True, + ) + if 'matched no objects' in r.stderr: + continue + for line in r.stdout.strip().split('\n'): + if line.endswith('families/'): + continue + family_guids.append(line.split('/')[-2].replace('.ht','')) + for family_guid in family_guids: + try: + family_sample_type = sample_type if sample_type else SampleType(FAMILY_LOOKUP[family_guid]) + except KeyError: + print('Skipping Family Guid', family_guid) + continue + #r = subprocess.run( + # ['gsutil', 'cp', '-r' 
, f'{path}{family_guid}.ht', f'gs://seqr-hail-search-data/v3.1/{reference_genome.value}/{dataset_type.value}/families/{family_sample_type.value}/{family_guid}.ht'], + # capture_output=True, + # text=True, + # check=True, + #) + print(['gsutil', 'cp', '-r' , f'{path}{family_guid}.ht', f'gs://seqr-hail-search-data/v3.1/{reference_genome.value}/{dataset_type.value}/families/{family_sample_type.value}/{family_guid}.ht']) From 32a342dba0e3665ceea9cad24a68fdd41c502fdf Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 7 Aug 2024 14:33:25 -0400 Subject: [PATCH 8/9] Revert "migration" This reverts commit 416eb5e962c0b0dbe51c72f396c60614c83c7aa7. --- migration.py | 96 ---------------------------------------------------- 1 file changed, 96 deletions(-) delete mode 100644 migration.py diff --git a/migration.py b/migration.py deleted file mode 100644 index 9f348c591..000000000 --- a/migration.py +++ /dev/null @@ -1,96 +0,0 @@ -import subprocess - -from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType - -MIGRATIONS = [ - ( - DatasetType.GCNV, - ReferenceGenome.GRCh38, - ), - ( - DatasetType.SV, - ReferenceGenome.GRCh38, - ), - ( - DatasetType.MITO, - ReferenceGenome.GRCh38, - ), - ( - DatasetType.SNV_INDEL, - ReferenceGenome.GRCh37, - ), - ( - DatasetType.SNV_INDEL, - ReferenceGenome.GRCh38, - ), -] - -PROJECT_LOOKUP = dict(x.split('\t') for x in open('project_sample_types.txt').read().split('\n') if x) -FAMILY_LOOKUP = dict(x.split('\t') for x in open('family_sample_types.txt').read().split('\n') if x) - -for dataset_type, reference_genome in MIGRATIONS: - if dataset_type == DatasetType.MITO or dataset_type == DatasetType.SV: - sample_type = SampleType.WGS - elif dataset_type == DatasetType.GCNV: - sample_type = SampleType.WES - else: - sample_type = None - - # Projects - project_guids = [] - path = f'gs://seqr-hail-search-data/v03/{reference_genome.value}/{dataset_type.value}/projects/' - r = subprocess.run( - ['gsutil', 'ls', path], - capture_output=True, - text=True, - check=True, - ) - if 'matched no objects' in r.stderr: - continue - for line in r.stdout.strip().split('\n'): - if line.endswith('projects/'): - continue - project_guids.append(line.split('/')[-2].replace('.ht','')) - for project_guid in project_guids: - try: - project_sample_type = sample_type if sample_type else SampleType(PROJECT_LOOKUP[project_guid]) - except KeyError: - print('Skipping Project Guid', project_guid) - continue - #r = subprocess.run( - # ['gsutil', 'cp', '-r' , f'{path}{project_guid}.ht', f'gs://seqr-hail-search-data/v3.1/{reference_genome.value}/{dataset_type.value}/projects/{project_sample_type.value}/{project_guid}.ht'], - # capture_output=True, - # text=True, - # check=True, - #) - print(['gsutil', 'cp', '-r' , f'{path}{project_guid}.ht', f'gs://seqr-hail-search-data/v3.1/{reference_genome.value}/{dataset_type.value}/projects/{project_sample_type.value}/{project_guid}.ht']) - - - # Families - family_guids = [] - path = f'gs://seqr-hail-search-data/v03/{reference_genome.value}/{dataset_type.value}/families/' - r = subprocess.run( - ['gsutil', 'ls', path], - capture_output=True, - text=True, - check=True, - ) - if 'matched no objects' in r.stderr: - continue - for line in r.stdout.strip().split('\n'): - if line.endswith('families/'): - continue - family_guids.append(line.split('/')[-2].replace('.ht','')) - for family_guid in family_guids: - try: - family_sample_type = sample_type if sample_type else SampleType(FAMILY_LOOKUP[family_guid]) - except KeyError: - print('Skipping 
Family Guid', family_guid) - continue - #r = subprocess.run( - # ['gsutil', 'cp', '-r' , f'{path}{family_guid}.ht', f'gs://seqr-hail-search-data/v3.1/{reference_genome.value}/{dataset_type.value}/families/{family_sample_type.value}/{family_guid}.ht'], - # capture_output=True, - # text=True, - # check=True, - #) - print(['gsutil', 'cp', '-r' , f'{path}{family_guid}.ht', f'gs://seqr-hail-search-data/v3.1/{reference_genome.value}/{dataset_type.value}/families/{family_sample_type.value}/{family_guid}.ht']) From d00c6f7d375093eb788d76fd028ee9d537e5b8cb Mon Sep 17 00:00:00 2001 From: Julia Klugherz Date: Mon, 12 Aug 2024 09:51:57 -0400 Subject: [PATCH 9/9] add sample_type to family and project table paths (#842) * add sampletype to family table file path * all sample types * missed 1 * project tables * explicit in family table delete * use enum * optional parameter but second DeleteProjectFamilyTablesTask test fails * oops * should work * cleaner * v3.1 and handle no project tables * missed one * paths test --- v03_pipeline/lib/model/definitions.py | 1 + v03_pipeline/lib/paths.py | 30 ++++++----- v03_pipeline/lib/paths_test.py | 29 +++++----- .../tasks/base/base_update_project_table.py | 3 ++ v03_pipeline/lib/tasks/delete_family_table.py | 3 ++ .../lib/tasks/delete_family_table_test.py | 5 +- .../lib/tasks/delete_family_tables.py | 19 ++++--- .../lib/tasks/delete_family_tables_test.py | 15 +++++- .../lib/tasks/delete_project_family_tables.py | 39 +++++++++----- .../delete_project_family_tables_test.py | 53 +++++++++++++++++-- .../lib/tasks/delete_project_table.py | 4 ++ ...roject_table_with_deleted_families_test.py | 4 +- v03_pipeline/lib/tasks/write_family_table.py | 1 + .../tasks/write_project_family_tables_test.py | 1 + 14 files changed, 153 insertions(+), 54 deletions(-) diff --git a/v03_pipeline/lib/model/definitions.py b/v03_pipeline/lib/model/definitions.py index bd6adf90e..b460ef458 100644 --- a/v03_pipeline/lib/model/definitions.py +++ b/v03_pipeline/lib/model/definitions.py @@ -17,6 +17,7 @@ class Sex(Enum): class PipelineVersion(Enum): V02 = 'v02' V03 = 'v03' + V3_1 = 'v3.1' class ReferenceGenome(Enum): diff --git a/v03_pipeline/lib/paths.py b/v03_pipeline/lib/paths.py index 3dc0ae179..92d877746 100644 --- a/v03_pipeline/lib/paths.py +++ b/v03_pipeline/lib/paths.py @@ -14,14 +14,14 @@ ) -def _v03_pipeline_prefix( +def _pipeline_prefix( root: str, reference_genome: ReferenceGenome, dataset_type: DatasetType, ) -> str: return os.path.join( root, - PipelineVersion.V03.value, + PipelineVersion.V3_1.value, reference_genome.value, dataset_type.value, ) @@ -62,15 +62,17 @@ def cached_reference_dataset_query_path( def family_table_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, + sample_type: SampleType, family_guid: str, ) -> str: return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.HAIL_SEARCH_DATA, reference_genome, dataset_type, ), 'families', + sample_type.value, f'{family_guid}.ht', ) @@ -81,7 +83,7 @@ def imputed_sex_path( callset_path: str, ) -> str: return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.LOADING_DATASETS, reference_genome, dataset_type, @@ -97,7 +99,7 @@ def imported_callset_path( callset_path: str, ) -> str: return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.LOADING_DATASETS, reference_genome, dataset_type, @@ -125,15 +127,17 @@ def metadata_for_run_path( def project_table_path( reference_genome: ReferenceGenome, dataset_type: DatasetType, + sample_type: SampleType, project_guid: str, ) -> str: 
return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.HAIL_SEARCH_DATA, reference_genome, dataset_type, ), 'projects', + sample_type.value, f'{project_guid}.ht', ) @@ -144,7 +148,7 @@ def relatedness_check_table_path( callset_path: str, ) -> str: return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.LOADING_DATASETS, reference_genome, dataset_type, @@ -161,7 +165,7 @@ def remapped_and_subsetted_callset_path( project_guid: str, ) -> str: return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.LOADING_DATASETS, reference_genome, dataset_type, @@ -177,7 +181,7 @@ def lookup_table_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.HAIL_SEARCH_DATA, reference_genome, dataset_type, @@ -191,7 +195,7 @@ def runs_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.HAIL_SEARCH_DATA, reference_genome, dataset_type, @@ -206,7 +210,7 @@ def sex_check_table_path( callset_path: str, ) -> str: return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.LOADING_DATASETS, reference_genome, dataset_type, @@ -260,7 +264,7 @@ def variant_annotations_table_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.HAIL_SEARCH_DATA, reference_genome, dataset_type, @@ -274,7 +278,7 @@ def variant_annotations_vcf_path( dataset_type: DatasetType, ) -> str: return os.path.join( - _v03_pipeline_prefix( + _pipeline_prefix( Env.HAIL_SEARCH_DATA, reference_genome, dataset_type, diff --git a/v03_pipeline/lib/paths_test.py b/v03_pipeline/lib/paths_test.py index ff437cf45..4ba4a270c 100644 --- a/v03_pipeline/lib/paths_test.py +++ b/v03_pipeline/lib/paths_test.py @@ -42,9 +42,10 @@ def test_family_table_path(self) -> None: family_table_path( ReferenceGenome.GRCh37, DatasetType.SNV_INDEL, + SampleType.WES, 'franklin', ), - '/hail-search-data/v03/GRCh37/SNV_INDEL/families/franklin.ht', + '/hail-search-data/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht', ) with patch('v03_pipeline.lib.paths.Env') as mock_env: mock_env.HAIL_SEARCH_DATA = 'gs://seqr-datasets/' @@ -52,9 +53,10 @@ def test_family_table_path(self) -> None: family_table_path( ReferenceGenome.GRCh37, DatasetType.SNV_INDEL, + SampleType.WES, 'franklin', ), - 'gs://seqr-datasets/v03/GRCh37/SNV_INDEL/families/franklin.ht', + 'gs://seqr-datasets/v3.1/GRCh37/SNV_INDEL/families/WES/franklin.ht', ) def test_valid_filters_path(self) -> None: @@ -82,9 +84,10 @@ def test_project_table_path(self) -> None: project_table_path( ReferenceGenome.GRCh38, DatasetType.MITO, + SampleType.WES, 'R0652_pipeline_test', ), - '/hail-search-data/v03/GRCh38/MITO/projects/R0652_pipeline_test.ht', + '/hail-search-data/v3.1/GRCh38/MITO/projects/WES/R0652_pipeline_test.ht', ) def test_valid_reference_dataset_collection_path(self) -> None: @@ -113,7 +116,7 @@ def test_lookup_table_path(self) -> None: ReferenceGenome.GRCh37, DatasetType.SV, ), - '/hail-search-data/v03/GRCh37/SV/lookup.ht', + '/hail-search-data/v3.1/GRCh37/SV/lookup.ht', ) def test_sex_check_table_path(self) -> None: @@ -123,7 +126,7 @@ def test_sex_check_table_path(self) -> None: DatasetType.SNV_INDEL, 'gs://abc.efg/callset.vcf.gz', ), - '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/sex_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht', + '/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/sex_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht', ) def 
test_relatedness_check_table_path(self) -> None: @@ -133,7 +136,7 @@ def test_relatedness_check_table_path(self) -> None: DatasetType.SNV_INDEL, 'gs://abc.efg/callset.vcf.gz', ), - '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht', + '/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/relatedness_check/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.ht', ) def test_metadata_for_run_path(self) -> None: @@ -143,7 +146,7 @@ def test_metadata_for_run_path(self) -> None: DatasetType.SNV_INDEL, 'manual__2023-06-26T18:30:09.349671+00:00', ), - '/hail-search-data/v03/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/metadata.json', + '/hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/metadata.json', ) def test_variant_annotations_table_path(self) -> None: @@ -152,7 +155,7 @@ def test_variant_annotations_table_path(self) -> None: ReferenceGenome.GRCh38, DatasetType.GCNV, ), - '/hail-search-data/v03/GRCh38/GCNV/annotations.ht', + '/hail-search-data/v3.1/GRCh38/GCNV/annotations.ht', ) def test_remapped_and_subsetted_callset_path(self) -> None: @@ -163,7 +166,7 @@ def test_remapped_and_subsetted_callset_path(self) -> None: 'gs://abc.efg/callset.vcf.gz', 'R0111_tgg_bblanken_wes', ), - '/seqr-loading-temp/v03/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt', + '/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt', ) self.assertEqual( remapped_and_subsetted_callset_path( @@ -172,7 +175,7 @@ def test_remapped_and_subsetted_callset_path(self) -> None: 'gs://abc.efg/callset/*.vcf.gz', 'R0111_tgg_bblanken_wes', ), - '/seqr-loading-temp/v03/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/bce53ccdb49a5ed2513044e1d0c6224e3ffcc323f770dc807d9175fd3c70a050.mt', + '/seqr-loading-temp/v3.1/GRCh38/GCNV/remapped_and_subsetted_callsets/R0111_tgg_bblanken_wes/bce53ccdb49a5ed2513044e1d0c6224e3ffcc323f770dc807d9175fd3c70a050.mt', ) def test_imported_callset_path(self) -> None: @@ -182,7 +185,7 @@ def test_imported_callset_path(self) -> None: DatasetType.SNV_INDEL, 'gs://abc.efg/callset.vcf.gz', ), - '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt', + '/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imported_callsets/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.mt', ) def test_imputed_sex_path(self) -> None: @@ -192,7 +195,7 @@ def test_imputed_sex_path(self) -> None: DatasetType.SNV_INDEL, 'gs://abc.efg/callset.vcf.gz', ), - '/seqr-loading-temp/v03/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv', + '/seqr-loading-temp/v3.1/GRCh38/SNV_INDEL/imputed_sex/ead56bb177a5de24178e1e622ce1d8beb3f8892bdae1c925d22ca0af4013d6dd.tsv', ) def test_new_variants_table_path(self) -> None: @@ -202,5 +205,5 @@ def test_new_variants_table_path(self) -> None: DatasetType.SNV_INDEL, 'manual__2023-06-26T18:30:09.349671+00:00', ), - '/hail-search-data/v03/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/new_variants.ht', + '/hail-search-data/v3.1/GRCh38/SNV_INDEL/runs/manual__2023-06-26T18:30:09.349671+00:00/new_variants.ht', ) diff --git a/v03_pipeline/lib/tasks/base/base_update_project_table.py 
b/v03_pipeline/lib/tasks/base/base_update_project_table.py index ad7ee0d1e..473a31bc2 100644 --- a/v03_pipeline/lib/tasks/base/base_update_project_table.py +++ b/v03_pipeline/lib/tasks/base/base_update_project_table.py @@ -1,12 +1,14 @@ import hail as hl import luigi +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import project_table_path from v03_pipeline.lib.tasks.base.base_update import BaseUpdateTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget class BaseUpdateProjectTableTask(BaseUpdateTask): + sample_type = luigi.EnumParameter(enum=SampleType) project_guid = luigi.Parameter() def output(self) -> luigi.Target: @@ -14,6 +16,7 @@ def output(self) -> luigi.Target: project_table_path( self.reference_genome, self.dataset_type, + self.sample_type, self.project_guid, ), ) diff --git a/v03_pipeline/lib/tasks/delete_family_table.py b/v03_pipeline/lib/tasks/delete_family_table.py index cbc929fd2..c7946035d 100644 --- a/v03_pipeline/lib/tasks/delete_family_table.py +++ b/v03_pipeline/lib/tasks/delete_family_table.py @@ -1,11 +1,13 @@ import luigi +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import family_table_path from v03_pipeline.lib.tasks.base.base_delete_table import BaseDeleteTableTask from v03_pipeline.lib.tasks.files import GCSorLocalTarget class DeleteFamilyTableTask(BaseDeleteTableTask): + sample_type = luigi.EnumParameter(enum=SampleType) family_guid = luigi.Parameter() def output(self) -> luigi.Target: @@ -13,6 +15,7 @@ def output(self) -> luigi.Target: family_table_path( self.reference_genome, self.dataset_type, + self.sample_type, self.family_guid, ), ) diff --git a/v03_pipeline/lib/tasks/delete_family_table_test.py b/v03_pipeline/lib/tasks/delete_family_table_test.py index 43e92bb6b..e4d364abb 100644 --- a/v03_pipeline/lib/tasks/delete_family_table_test.py +++ b/v03_pipeline/lib/tasks/delete_family_table_test.py @@ -3,7 +3,7 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.paths import family_table_path from v03_pipeline.lib.tasks.delete_family_table import DeleteFamilyTableTask from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase @@ -41,6 +41,7 @@ def setUp(self) -> None: family_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WES, 'abc_1', ), ) @@ -50,6 +51,7 @@ def test_delete_family_table_task(self) -> None: task = DeleteFamilyTableTask( reference_genome=ReferenceGenome.GRCh38, dataset_type=DatasetType.SNV_INDEL, + sample_type=SampleType.WES, family_guid='abc_1', ) worker.add(task) @@ -60,6 +62,7 @@ def test_delete_family_table_task(self) -> None: family_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WES, 'abc_1', ), ).exists(), diff --git a/v03_pipeline/lib/tasks/delete_family_tables.py b/v03_pipeline/lib/tasks/delete_family_tables.py index a68f4dc28..c09e9fca2 100644 --- a/v03_pipeline/lib/tasks/delete_family_tables.py +++ b/v03_pipeline/lib/tasks/delete_family_tables.py @@ -1,5 +1,6 @@ import luigi +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask from v03_pipeline.lib.tasks.delete_family_table import DeleteFamilyTableTask @@ -18,12 +19,14 @@ def complete(self) -> bool: ) def run(self): - for family_guid in self.family_guids: - self.dynamic_delete_family_table_tasks.add( - DeleteFamilyTableTask( - 
reference_genome=self.reference_genome, - dataset_type=self.dataset_type, - family_guid=family_guid, - ), - ) + for sample_type in SampleType: + for family_guid in self.family_guids: + self.dynamic_delete_family_table_tasks.add( + DeleteFamilyTableTask( + reference_genome=self.reference_genome, + dataset_type=self.dataset_type, + sample_type=sample_type, + family_guid=family_guid, + ), + ) yield self.dynamic_delete_family_table_tasks diff --git a/v03_pipeline/lib/tasks/delete_family_tables_test.py b/v03_pipeline/lib/tasks/delete_family_tables_test.py index 535299602..21d0e73cd 100644 --- a/v03_pipeline/lib/tasks/delete_family_tables_test.py +++ b/v03_pipeline/lib/tasks/delete_family_tables_test.py @@ -3,7 +3,7 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.paths import family_table_path from v03_pipeline.lib.tasks.delete_family_tables import ( DeleteFamilyTablesTask, @@ -20,6 +20,7 @@ def setUp(self) -> None: family_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WGS, family_guid, ), ) @@ -30,6 +31,17 @@ def test_delete_project_family_tables_task(self) -> None: family_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WGS, + 'family_a', + ), + ).exists(), + ) + self.assertFalse( + pathlib.Path( + family_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + SampleType.WES, 'family_a', ), ).exists(), @@ -49,6 +61,7 @@ def test_delete_project_family_tables_task(self) -> None: family_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WGS, family_guid, ), ).exists(), diff --git a/v03_pipeline/lib/tasks/delete_project_family_tables.py b/v03_pipeline/lib/tasks/delete_project_family_tables.py index befca9a45..5ebca0f03 100644 --- a/v03_pipeline/lib/tasks/delete_project_family_tables.py +++ b/v03_pipeline/lib/tasks/delete_project_family_tables.py @@ -1,6 +1,8 @@ import hail as hl +import hailtop.fs as hfs import luigi +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import project_table_path from v03_pipeline.lib.tasks.base.base_hail_table import BaseHailTableTask from v03_pipeline.lib.tasks.delete_family_table import DeleteFamilyTableTask @@ -21,21 +23,32 @@ def complete(self) -> bool: ) def run(self): - project_table_task: luigi.Target = yield HailTableTask( - project_table_path( + project_tables = set() + for sample_type in SampleType: + project_ht_path = project_table_path( self.reference_genome, self.dataset_type, + sample_type, self.project_guid, - ), - ) - project_ht = hl.read_table(project_table_task.path) - family_guids = hl.eval(project_ht.globals.family_guids) - for family_guid in family_guids: - self.dynamic_delete_family_table_tasks.add( - DeleteFamilyTableTask( - reference_genome=self.reference_genome, - dataset_type=self.dataset_type, - family_guid=family_guid, - ), ) + if hfs.exists(project_ht_path): + project_table_task: luigi.Target = yield HailTableTask(project_ht_path) + project_ht = hl.read_table(project_table_task.path) + project_tables.add((project_ht, sample_type)) + + if len(project_tables) == 0: + msg = f'No project tables found for {self.project_guid}' + raise RuntimeError(msg) + + for project_ht, sample_type in project_tables: + family_guids = hl.eval(project_ht.globals.family_guids) + for family_guid in family_guids: + self.dynamic_delete_family_table_tasks.add( + DeleteFamilyTableTask( + 
reference_genome=self.reference_genome, + dataset_type=self.dataset_type, + sample_type=sample_type, + family_guid=family_guid, + ), + ) yield self.dynamic_delete_family_table_tasks diff --git a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py index 75f11987a..e6f10edb2 100644 --- a/v03_pipeline/lib/tasks/delete_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/delete_project_family_tables_test.py @@ -3,7 +3,7 @@ import hail as hl import luigi.worker -from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.paths import family_table_path, project_table_path from v03_pipeline.lib.tasks.delete_project_family_tables import ( DeleteProjectFamilyTablesTask, @@ -14,7 +14,7 @@ class DeleteTableTaskTest(MockedDatarootTestCase): def setUp(self) -> None: super().setUp() - ht = hl.Table.parallelize( + project_ht = hl.Table.parallelize( [ { 'locus': hl.Locus( @@ -123,33 +123,77 @@ def setUp(self) -> None: }, ), ) - ht.write( + project_ht.write( project_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WGS, 'project_a', ), ) - for family_guid in hl.eval(ht.globals.family_guids): + for family_guid in hl.eval(project_ht.globals.family_guids): family_ht = hl.utils.range_table(100) family_ht.write( family_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WGS, family_guid, ), ) + def test_no_project_tables_for_project(self) -> None: + worker = luigi.worker.Worker() + task = DeleteProjectFamilyTablesTask( + reference_genome=ReferenceGenome.GRCh38, + dataset_type=DatasetType.SNV_INDEL, + project_guid='project_b', + ) + worker.add(task) + worker.run() + self.assertFalse(task.complete()) + def test_delete_project_family_tables_task(self) -> None: self.assertTrue( pathlib.Path( family_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WGS, 'family_a', ), ).exists(), ) + self.assertFalse( + pathlib.Path( + family_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + SampleType.WES, + 'family_a', + ), + ).exists(), + ) + self.assertTrue( + pathlib.Path( + project_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + SampleType.WGS, + 'project_a', + ), + ).exists(), + ) + self.assertFalse( + pathlib.Path( + project_table_path( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + SampleType.WES, + 'project_a', + ), + ).exists(), + ) worker = luigi.worker.Worker() task = DeleteProjectFamilyTablesTask( reference_genome=ReferenceGenome.GRCh38, @@ -165,6 +209,7 @@ def test_delete_project_family_tables_task(self) -> None: family_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WGS, family_guid, ), ).exists(), diff --git a/v03_pipeline/lib/tasks/delete_project_table.py b/v03_pipeline/lib/tasks/delete_project_table.py index 0a403ea3b..53ae98a97 100644 --- a/v03_pipeline/lib/tasks/delete_project_table.py +++ b/v03_pipeline/lib/tasks/delete_project_table.py @@ -1,5 +1,6 @@ import luigi +from v03_pipeline.lib.model import SampleType from v03_pipeline.lib.paths import project_table_path from v03_pipeline.lib.tasks.base.base_delete_table import BaseDeleteTableTask from v03_pipeline.lib.tasks.delete_project_family_tables import ( @@ -9,12 +10,14 @@ class DeleteProjectTableTask(BaseDeleteTableTask): + sample_type = luigi.EnumParameter(enum=SampleType) project_guid = luigi.Parameter() def requires(self) -> luigi.Task: return 
DeleteProjectFamilyTablesTask( self.reference_genome, self.dataset_type, + self.sample_type, self.project_guid, ) @@ -23,6 +26,7 @@ def output(self) -> luigi.Target: project_table_path( self.reference_genome, self.dataset_type, + self.sample_type, self.project_guid, ), ) diff --git a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py index b1f5cc5af..72c475d3f 100644 --- a/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py +++ b/v03_pipeline/lib/tasks/update_project_table_with_deleted_families_test.py @@ -1,7 +1,7 @@ import hail as hl import luigi -from v03_pipeline.lib.model import DatasetType, ReferenceGenome +from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.paths import project_table_path from v03_pipeline.lib.tasks.update_project_table_with_deleted_families import ( UpdateProjectTableWithDeletedFamiliesTask, @@ -120,6 +120,7 @@ def setUp(self) -> None: project_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WGS, 'project_a', ), ) @@ -129,6 +130,7 @@ def test_update_project_with_deleted_families(self): task = UpdateProjectTableWithDeletedFamiliesTask( dataset_type=DatasetType.SNV_INDEL, reference_genome=ReferenceGenome.GRCh38, + sample_type=SampleType.WGS, project_guid='project_a', family_guids=['family_b'], ) diff --git a/v03_pipeline/lib/tasks/write_family_table.py b/v03_pipeline/lib/tasks/write_family_table.py index ce4c8679c..4d6683664 100644 --- a/v03_pipeline/lib/tasks/write_family_table.py +++ b/v03_pipeline/lib/tasks/write_family_table.py @@ -23,6 +23,7 @@ def output(self) -> luigi.Target: family_table_path( self.reference_genome, self.dataset_type, + self.sample_type, self.family_guid, ), ) diff --git a/v03_pipeline/lib/tasks/write_project_family_tables_test.py b/v03_pipeline/lib/tasks/write_project_family_tables_test.py index 09ac0b030..27680d8a1 100644 --- a/v03_pipeline/lib/tasks/write_project_family_tables_test.py +++ b/v03_pipeline/lib/tasks/write_project_family_tables_test.py @@ -85,6 +85,7 @@ def test_snv_write_project_family_tables_task(self) -> None: project_table_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, + SampleType.WGS, 'R0113_test_project', ), ).family_guids.collect()[0],
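
Note on the remap/pedigree hash added in PR #856 above: a (callset, project) pair is now considered loaded only if the stored remap_pedigree_hash matches a fresh hash of the current remap and pedigree files, so editing either file forces the affected tables to be rebuilt. The standalone sketch below reproduces that hashing logic with plain hashlib and local files instead of hailtop.fs; the example file names are hypothetical, and the final wraparound step mimics the signed int32 cast that hl.int32 applies (io_test.py expects -560434714 for the pipeline's own fixtures).

import hashlib
import os


def remap_pedigree_hash_local(remap_path: str, pedigree_path: str) -> int:
    # sha256 over the remap file (when it exists) followed by the pedigree file,
    # mirroring remap_pedigree_hash() in v03_pipeline/lib/misc/io.py.
    sha256 = hashlib.sha256()
    if os.path.exists(remap_path):
        with open(remap_path, 'rb') as f:
            sha256.update(f.read())
    with open(pedigree_path, 'rb') as f:
        sha256.update(f.read())
    # Keep only the first 8 hex characters (4 bytes), then wrap the unsigned
    # value into a signed 32-bit integer the way hl.int32() does.
    value = int(sha256.hexdigest()[:8], 16)
    return value - 2**32 if value >= 2**31 else value


# Hypothetical local copies of the test fixtures:
# remap_pedigree_hash_local('test_remap_1.tsv', 'test_pedigree_3.tsv')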