diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py index e2258e309..d4ed9811c 100644 --- a/v03_pipeline/lib/misc/family_entries.py +++ b/v03_pipeline/lib/misc/family_entries.py @@ -98,13 +98,22 @@ def remove_family_guids( family_guids: hl.SetExpression, ) -> hl.Table: # Remove families from the existing project table structure (both the entries arrays and the globals are mutated) - family_indexes_to_keep = hl.array( - hl.enumerate(ht.globals.family_guids) - .filter(lambda item: ~family_guids.contains(item[1])) - .map(lambda item: item[0]), + family_indexes_to_keep = hl.eval( + hl.array( + hl.enumerate(ht.globals.family_guids) + .filter(lambda item: ~family_guids.contains(item[1])) + .map(lambda item: item[0]), + ), ) ht = ht.annotate( - family_entries=family_indexes_to_keep.map(lambda i: ht.family_entries[i]), + # NB: this "should" work without the extra if statement (and does in the tests) + # however, experiments on dataproc showed this statement hanging with an empty + # unevaluated indexes array. + family_entries=hl.array(family_indexes_to_keep).map( + lambda i: ht.family_entries[i], + ) + if len(family_indexes_to_keep) > 0 + else hl.empty_array(ht.family_entries.dtype.element_type), ) ht = ht.filter(hl.any(ht.family_entries.map(hl.is_defined))) return ht.annotate_globals( diff --git a/v03_pipeline/lib/misc/lookup.py b/v03_pipeline/lib/misc/lookup.py index 6a7e3cbbc..1c3884638 100644 --- a/v03_pipeline/lib/misc/lookup.py +++ b/v03_pipeline/lib/misc/lookup.py @@ -128,22 +128,23 @@ def remove_project( existing_project_guids = hl.eval(ht.globals.project_guids) if project_guid not in existing_project_guids: return ht - project_indexes_to_keep = ( + project_indexes_to_keep = hl.eval( hl.enumerate(existing_project_guids) .filter(lambda item: item[1] != project_guid) - .map(lambda item: item[0]) + .map(lambda item: item[0]), ) ht = ht.annotate( project_stats=( - project_indexes_to_keep.map( - lambda i: ht.project_stats[i], - ) + # See "remove_family_guids" func for why this was necessary + hl.array(project_indexes_to_keep).map(lambda i: ht.project_stats[i]) + if len(project_indexes_to_keep) > 0 + else hl.empty_array(ht.project_stats.dtype.element_type) ), ) ht = ht.filter(hl.any(ht.project_stats.map(hl.is_defined))) return ht.annotate_globals( - project_guids=project_indexes_to_keep.map( - lambda i: ht.project_guids[i], + project_guids=ht.project_guids.filter( + lambda p: p != project_guid, ), project_families=hl.dict( ht.project_families.items().filter(lambda item: item[0] != project_guid),