-
Notifications
You must be signed in to change notification settings - Fork 605
Half-bad hetvars: one bad allele fails the genotype [VS-1615] [VS-1649] #9149
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 16 commits
f75c299
1d30881
e21f0ac
3ee277a
756d363
4e18503
0171302
d5b88b6
d428e13
182019c
815c1a3
bda44b7
45b65b2
ed09ccc
ace768f
852e96f
7b47b30
7c9e753
eaf3b08
f919d24
1a21244
23d45a6
f444f30
84eaa02
a040dc3
124d917
c533916
eb82f9f
2c00049
a545738
fdb1988
2311f27
7dc7c85
ab94c99
b3bb025
218dae4
41f3844
27ac2bb
8e52897
0f21f6f
6f831cb
64a5bcc
3a9a0ae
6f606f0
347eb2f
6d92d22
521a606
5fbc434
a8e5796
33a67a8
2a8a8b5
e2bc338
09a0d5f
6c6fa2b
9ade1a4
c09d1a3
7aa2ce1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -378,11 +378,11 @@ def convert_array_with_id_keys_to_dense_array(arr, ids, drop=[]): | |||||||
any_yes=acc.any_yes | allele_YES[called_idx - 1], | ||||||||
any_snp=acc.any_snp | allele_is_snp[called_idx - 1], | ||||||||
any_indel=acc.any_indel | ~allele_is_snp[called_idx - 1], | ||||||||
any_snp_ok=acc.any_snp_ok | (allele_is_snp[called_idx - 1] & allele_OK[called_idx - 1]), | ||||||||
any_indel_ok=acc.any_indel_ok | (~allele_is_snp[called_idx - 1] & allele_OK[called_idx - 1]), | ||||||||
), hl.struct(any_no=False, any_yes=False, any_snp=False, any_indel=False, any_snp_ok=False, any_indel_ok=False))) | ||||||||
all_snps_ok=acc.all_snps_ok & (~allele_is_snp[called_idx - 1] | allele_OK[called_idx - 1]), | ||||||||
all_indels_ok=acc.all_indels_ok & (allele_is_snp[called_idx - 1] | allele_OK[called_idx - 1]), | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am confused as to why this is not:
Suggested change
Maybe I'm just confounded by the logic. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Taking the SNP line as an example, I interpreted this as "nothing wrong SNP-wise if it's not a SNP at all, OR (implying it is a SNP) if the allele is OK". If the allele was bad and it was an INDEL, the following line would catch it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually, now that I look at this again, I think I can/should clean it up a bit... I don't think we care about "any" SNPs or INDELs any more, now that the default for the "ok" accumulators is True. |
||||||||
), hl.struct(any_no=False, any_yes=False, any_snp=False, any_indel=False, all_snps_ok=True, all_indels_ok=True))) | ||||||||
|
||||||||
vd = vd.annotate_entries(FT=~ft.any_no & (ft.any_yes | ((~ft.any_snp | ft.any_snp_ok) & (~ft.any_indel | ft.any_indel_ok)))) | ||||||||
vd = vd.annotate_entries(FT=~ft.any_no & (ft.any_yes | ((~ft.any_snp | ft.all_snps_ok) & (~ft.any_indel | ft.all_indels_ok)))) | ||||||||
|
||||||||
vd = vd.drop('allele_NO', 'allele_YES', 'allele_is_snp', 'allele_OK') | ||||||||
hl.vds.VariantDataset( | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,34 +17,34 @@ | |
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These are all stowaway changes fixing some bonkers whitespace that was driving IntelliJ nuts |
||
|
||
def check_samples_match(vds): | ||
print('checking sample equivalence between reference and variant MTs') | ||
assert vds.reference_data.cols().select().collect() == vds.variant_data.cols().select().collect() | ||
print('checking sample equivalence between reference and variant MTs') | ||
assert vds.reference_data.cols().select().collect() == vds.variant_data.cols().select().collect() | ||
|
||
def check_ref_blocks(vds): | ||
print('checking that:\n * no reference blocks have GQ=0\n * all ref blocks have END after start\n * all ref blocks are max 1000 bases long') | ||
rd = vds.reference_data | ||
rd = rd.annotate_rows(locus_start = rd.locus.position) | ||
print('checking that:\n * no reference blocks have GQ=0\n * all ref blocks have END after start\n * all ref blocks are max 1000 bases long') | ||
rd = vds.reference_data | ||
rd = rd.annotate_rows(locus_start = rd.locus.position) | ||
|
||
LEN = rd.END - rd.locus_start + 1 | ||
LEN = rd.END - rd.locus_start + 1 | ||
|
||
print('checking that: no reference blocks have GQ=0') | ||
assert rd.aggregate_entries(hl.agg.all(hl.all(rd.GQ > 0))) | ||
print('checking that: no reference blocks have GQ=0') | ||
assert rd.aggregate_entries(hl.agg.all(hl.all(rd.GQ > 0))) | ||
|
||
print('checking that: all ref blocks have END after start') | ||
assert rd.aggregate_entries(hl.agg.all(hl.all(LEN >= 0))) | ||
print('checking that: all ref blocks have END after start') | ||
assert rd.aggregate_entries(hl.agg.all(hl.all(LEN >= 0))) | ||
|
||
print('checking that: all ref blocks are max 1000 bases long') | ||
assert rd.aggregate_entries(hl.agg.all(hl.all(LEN <= rd.ref_block_max_length))) | ||
print('checking that: all ref blocks are max 1000 bases long') | ||
assert rd.aggregate_entries(hl.agg.all(hl.all(LEN <= rd.ref_block_max_length))) | ||
|
||
def check_densify_small_region(vds): | ||
print('running densify on 200kb region') | ||
from time import time | ||
t1 = time() | ||
print('running densify on 200kb region') | ||
from time import time | ||
t1 = time() | ||
|
||
filt = hl.vds.filter_intervals(vds, [hl.parse_locus_interval('chr16:29.5M-29.7M', reference_genome='GRCh38')]) | ||
n=hl.vds.to_dense_mt(filt).select_entries('LGT')._force_count_rows() | ||
filt = hl.vds.filter_intervals(vds, [hl.parse_locus_interval('chr16:29.5M-29.7M', reference_genome='GRCh38')]) | ||
n=hl.vds.to_dense_mt(filt).select_entries('LGT')._force_count_rows() | ||
|
||
print(f'took {time() - t1:.1f}s to densify {n} rows after interval query') | ||
print(f'took {time() - t1:.1f}s to densify {n} rows after interval query') | ||
|
||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -178,7 +178,7 @@ workflow GvsQuickstartIntegration { | |
git_branch_or_tag = git_branch_or_tag, | ||
git_hash = GetToolVersions.git_hash, | ||
use_VETS = false, | ||
extract_do_not_filter_override = true, | ||
extract_do_not_filter_override = false, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will revert before merge |
||
dataset_suffix = "vqsr_vcf", | ||
use_default_dockers = use_default_dockers, | ||
gatk_override = effective_gatk_override, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
version 1.0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. don't think this needs to be merged, revert |
||
|
||
import "../GvsUtils.wdl" as Utils | ||
import "../GvsExtractAvroFilesForHail.wdl" as ExtractAvroFilesForHail | ||
import "../GvsCreateVDS.wdl" as CreateVds | ||
import "GvsQuickstartVcfIntegration.wdl" as QuickstartVcfIntegration | ||
import "GvsTieOutVDS.wdl" as TieOutVDS | ||
|
||
workflow MakeHetvarVDS { | ||
input { | ||
String git_branch_or_tag | ||
String? git_hash | ||
String reference_name = "hg38" | ||
Boolean is_wgs = true | ||
String dataset_name | ||
String filter_set_name | ||
File? interval_list | ||
Boolean use_compressed_references = false | ||
Boolean extract_do_not_filter_override = false | ||
String dataset_suffix = "hail" | ||
Boolean use_default_dockers = false | ||
Boolean bgzip_output_vcfs = false | ||
|
||
String? basic_docker | ||
String? cloud_sdk_docker | ||
String? cloud_sdk_slim_docker | ||
String? variants_docker | ||
String? gatk_docker | ||
|
||
File? gatk_override | ||
String? hail_version | ||
String? sample_set_name ## NOTE: currently we only allow the loading of one sample set at a time | ||
|
||
String? workspace_bucket | ||
String? workspace_id | ||
String? submission_id | ||
|
||
Int? maximum_alternate_alleles | ||
String ploidy_table_name = "sample_chromosome_ploidy" | ||
} | ||
|
||
String project_id = "gvs-internal" | ||
|
||
if (!defined(workspace_bucket) || !defined(workspace_id) || !defined(submission_id) || | ||
!defined(git_hash) || !defined(basic_docker) || !defined(cloud_sdk_docker) || !defined(cloud_sdk_slim_docker) || | ||
!defined(variants_docker) || !defined(gatk_docker) || !defined(hail_version)) { | ||
call Utils.GetToolVersions { | ||
input: | ||
git_branch_or_tag = git_branch_or_tag, | ||
} | ||
} | ||
|
||
String effective_basic_docker = select_first([basic_docker, GetToolVersions.basic_docker]) | ||
String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker]) | ||
String effective_cloud_sdk_slim_docker = select_first([cloud_sdk_slim_docker, GetToolVersions.cloud_sdk_slim_docker]) | ||
String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker]) | ||
String effective_gatk_docker = select_first([gatk_docker, GetToolVersions.gatk_docker]) | ||
String effective_git_hash = select_first([git_hash, GetToolVersions.git_hash]) | ||
String effective_hail_version = select_first([hail_version, GetToolVersions.hail_version]) | ||
|
||
String effective_workspace_bucket = select_first([workspace_bucket, GetToolVersions.workspace_bucket]) | ||
String effective_workspace_id = select_first([workspace_id, GetToolVersions.workspace_id]) | ||
String effective_submission_id = select_first([submission_id, GetToolVersions.submission_id]) | ||
|
||
|
||
call ExtractAvroFilesForHail.GvsExtractAvroFilesForHail { | ||
input: | ||
git_branch_or_tag = git_branch_or_tag, | ||
git_hash = git_hash, | ||
project_id = project_id, | ||
dataset_name = dataset_name, | ||
filter_set_name = filter_set_name, | ||
ploidy_table_name = ploidy_table_name, | ||
scatter_width = 10, | ||
call_set_identifier = git_branch_or_tag, | ||
basic_docker = effective_basic_docker, | ||
cloud_sdk_docker = effective_cloud_sdk_docker, | ||
variants_docker = effective_variants_docker, | ||
} | ||
|
||
String vds_path = GvsExtractAvroFilesForHail.avro_path + "/gvs_export.vds" | ||
|
||
call CreateVds.GvsCreateVDS { | ||
input: | ||
git_branch_or_tag = git_branch_or_tag, | ||
hail_version = effective_hail_version, | ||
avro_path = GvsExtractAvroFilesForHail.avro_path, | ||
vds_destination_path = vds_path, | ||
cluster_prefix = "vds-cluster", | ||
gcs_subnetwork_name = "subnetwork", | ||
region = "us-central1", | ||
basic_docker = effective_basic_docker, | ||
variants_docker = effective_variants_docker, | ||
cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker, | ||
cluster_max_age_minutes = 120, | ||
cluster_max_idle_minutes = 60, | ||
leave_cluster_running_at_end = false, | ||
} | ||
|
||
output { | ||
String vds_output_path = GvsExtractAvroFilesForHail.avro_path + "/gvs_export.vds" | ||
String recorded_git_hash = effective_git_hash | ||
Boolean done = true | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All the changes in this file were just for testing purposes and don't need to be merged.