From 8641f001624becf3e434128afe5c0cf626809e25 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 18:00:24 -0400 Subject: [PATCH 01/11] i guess this is necessary --- v03_pipeline/lib/misc/family_entries.py | 11 ++++++++--- v03_pipeline/lib/misc/lookup.py | 7 ++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py index b3f3c2b50..8ea1255ee 100644 --- a/v03_pipeline/lib/misc/family_entries.py +++ b/v03_pipeline/lib/misc/family_entries.py @@ -98,13 +98,18 @@ def remove_family_guids( family_guids: hl.SetExpression, ) -> hl.Table: # Remove families from the existing project table structure (both the entries arrays and the globals are mutated) - family_indexes_to_keep = hl.array( + family_indexes_to_keep = hl.eval(hl.array( hl.enumerate(ht.globals.family_guids) .filter(lambda item: ~family_guids.contains(item[1])) .map(lambda item: item[0]), - ) + )) ht = ht.annotate( - family_entries=family_indexes_to_keep.map(lambda i: ht.family_entries[i]), + # NB: this "should" work without the extra if statement (and does in the tests) + # however, experiments on dataproc showed this statement hanging with an empty + # indexes array. + family_entries=family_indexes_to_keep.map(lambda i: ht.family_entries[i]) + if len(family_indexes_to_keep) > 0 + else hl.empty_array(ht.family_entries.dtype.element_type) ) return ht.annotate_globals( family_guids=ht.family_guids.filter( diff --git a/v03_pipeline/lib/misc/lookup.py b/v03_pipeline/lib/misc/lookup.py index 6a7e3cbbc..2377688d1 100644 --- a/v03_pipeline/lib/misc/lookup.py +++ b/v03_pipeline/lib/misc/lookup.py @@ -135,9 +135,10 @@ def remove_project( ) ht = ht.annotate( project_stats=( - project_indexes_to_keep.map( - lambda i: ht.project_stats[i], - ) + # See "remove_family_guids" func for why this was necessary + project_stats=project_indexes_to_keep.map(lambda i: ht.project_stats[i]) + if len(project_indexes_to_keep) > 0 + else hl.empty_array(ht.project_stats.dtype.element_type) ), ) ht = ht.filter(hl.any(ht.project_stats.map(hl.is_defined))) From 41300a31edfe0d216fd30ea570dc6d128ac800a3 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 18:02:30 -0400 Subject: [PATCH 02/11] bump eval up --- v03_pipeline/lib/misc/lookup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/misc/lookup.py b/v03_pipeline/lib/misc/lookup.py index 2377688d1..21a336b5f 100644 --- a/v03_pipeline/lib/misc/lookup.py +++ b/v03_pipeline/lib/misc/lookup.py @@ -128,7 +128,7 @@ def remove_project( existing_project_guids = hl.eval(ht.globals.project_guids) if project_guid not in existing_project_guids: return ht - project_indexes_to_keep = ( + project_indexes_to_keep = hl.eval( hl.enumerate(existing_project_guids) .filter(lambda item: item[1] != project_guid) .map(lambda item: item[0]) From 34da8a3da6b0f7f237d9d79050d0dfbe0e1b1132 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 18:06:39 -0400 Subject: [PATCH 03/11] lint --- v03_pipeline/lib/misc/family_entries.py | 4 ++-- v03_pipeline/lib/misc/lookup.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py index 8ea1255ee..09afbe4bb 100644 --- a/v03_pipeline/lib/misc/family_entries.py +++ b/v03_pipeline/lib/misc/family_entries.py @@ -105,11 +105,11 @@ def remove_family_guids( )) ht = ht.annotate( # NB: this "should" work without the extra if statement (and does in the tests) - # however, experiments on dataproc showed this statement hanging with an empty + # however, experiments on dataproc showed this statement hanging with an empty # indexes array. family_entries=family_indexes_to_keep.map(lambda i: ht.family_entries[i]) if len(family_indexes_to_keep) > 0 - else hl.empty_array(ht.family_entries.dtype.element_type) + else hl.empty_array(ht.family_entries.dtype.element_type), ) return ht.annotate_globals( family_guids=ht.family_guids.filter( diff --git a/v03_pipeline/lib/misc/lookup.py b/v03_pipeline/lib/misc/lookup.py index 21a336b5f..b22fa0377 100644 --- a/v03_pipeline/lib/misc/lookup.py +++ b/v03_pipeline/lib/misc/lookup.py @@ -131,12 +131,12 @@ def remove_project( project_indexes_to_keep = hl.eval( hl.enumerate(existing_project_guids) .filter(lambda item: item[1] != project_guid) - .map(lambda item: item[0]) + .map(lambda item: item[0]), ) ht = ht.annotate( project_stats=( # See "remove_family_guids" func for why this was necessary - project_stats=project_indexes_to_keep.map(lambda i: ht.project_stats[i]) + project_indexes_to_keep.map(lambda i: ht.project_stats[i]) if len(project_indexes_to_keep) > 0 else hl.empty_array(ht.project_stats.dtype.element_type) ), From be1a8cddec5f7fcfbd2911804c819613caa0da91 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 18:08:53 -0400 Subject: [PATCH 04/11] ruff --- v03_pipeline/lib/misc/family_entries.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py index 09afbe4bb..070f6abd9 100644 --- a/v03_pipeline/lib/misc/family_entries.py +++ b/v03_pipeline/lib/misc/family_entries.py @@ -98,11 +98,13 @@ def remove_family_guids( family_guids: hl.SetExpression, ) -> hl.Table: # Remove families from the existing project table structure (both the entries arrays and the globals are mutated) - family_indexes_to_keep = hl.eval(hl.array( - hl.enumerate(ht.globals.family_guids) - .filter(lambda item: ~family_guids.contains(item[1])) - .map(lambda item: item[0]), - )) + family_indexes_to_keep = hl.eval( + hl.array( + hl.enumerate(ht.globals.family_guids) + .filter(lambda item: ~family_guids.contains(item[1])) + .map(lambda item: item[0]), + ) + ) ht = ht.annotate( # NB: this "should" work without the extra if statement (and does in the tests) # however, experiments on dataproc showed this statement hanging with an empty From 055154e048bda384b5a6eb3a401c62a4098baa61 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 18:13:04 -0400 Subject: [PATCH 05/11] ruff --- v03_pipeline/lib/misc/family_entries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py index 070f6abd9..a84d833af 100644 --- a/v03_pipeline/lib/misc/family_entries.py +++ b/v03_pipeline/lib/misc/family_entries.py @@ -103,7 +103,7 @@ def remove_family_guids( hl.enumerate(ht.globals.family_guids) .filter(lambda item: ~family_guids.contains(item[1])) .map(lambda item: item[0]), - ) + ), ) ht = ht.annotate( # NB: this "should" work without the extra if statement (and does in the tests) From d698404e20cbe44e1e053bd6cd058b4805ce0c10 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 18:14:35 -0400 Subject: [PATCH 06/11] Update family_entries.py --- v03_pipeline/lib/misc/family_entries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py index a84d833af..550a150ab 100644 --- a/v03_pipeline/lib/misc/family_entries.py +++ b/v03_pipeline/lib/misc/family_entries.py @@ -108,7 +108,7 @@ def remove_family_guids( ht = ht.annotate( # NB: this "should" work without the extra if statement (and does in the tests) # however, experiments on dataproc showed this statement hanging with an empty - # indexes array. + # unevaluated indexes array. family_entries=family_indexes_to_keep.map(lambda i: ht.family_entries[i]) if len(family_indexes_to_keep) > 0 else hl.empty_array(ht.family_entries.dtype.element_type), From 416b535f2f73334c1b7cb4a11e5f1383907e04cd Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 19:26:03 -0400 Subject: [PATCH 07/11] wrap in an array --- v03_pipeline/lib/misc/family_entries.py | 2 +- v03_pipeline/lib/misc/lookup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py index a84d833af..c470bfada 100644 --- a/v03_pipeline/lib/misc/family_entries.py +++ b/v03_pipeline/lib/misc/family_entries.py @@ -109,7 +109,7 @@ def remove_family_guids( # NB: this "should" work without the extra if statement (and does in the tests) # however, experiments on dataproc showed this statement hanging with an empty # indexes array. - family_entries=family_indexes_to_keep.map(lambda i: ht.family_entries[i]) + family_entries=hl.array(family_indexes_to_keep).map(lambda i: ht.family_entries[i]) if len(family_indexes_to_keep) > 0 else hl.empty_array(ht.family_entries.dtype.element_type), ) diff --git a/v03_pipeline/lib/misc/lookup.py b/v03_pipeline/lib/misc/lookup.py index b22fa0377..bdf9ae735 100644 --- a/v03_pipeline/lib/misc/lookup.py +++ b/v03_pipeline/lib/misc/lookup.py @@ -136,7 +136,7 @@ def remove_project( ht = ht.annotate( project_stats=( # See "remove_family_guids" func for why this was necessary - project_indexes_to_keep.map(lambda i: ht.project_stats[i]) + hl.array(project_indexes_to_keep).map(lambda i: ht.project_stats[i]) if len(project_indexes_to_keep) > 0 else hl.empty_array(ht.project_stats.dtype.element_type) ), From 187cdf5129e9f47520def5640a02184ba1502eea Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 20:48:52 -0400 Subject: [PATCH 08/11] format --- v03_pipeline/lib/misc/family_entries.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py index 085235215..4c70a7214 100644 --- a/v03_pipeline/lib/misc/family_entries.py +++ b/v03_pipeline/lib/misc/family_entries.py @@ -109,7 +109,9 @@ def remove_family_guids( # NB: this "should" work without the extra if statement (and does in the tests) # however, experiments on dataproc showed this statement hanging with an empty # unevaluated indexes array. - family_entries=hl.array(family_indexes_to_keep).map(lambda i: ht.family_entries[i]) + family_entries=hl.array(family_indexes_to_keep).map( + lambda i: ht.family_entries[i] + ) if len(family_indexes_to_keep) > 0 else hl.empty_array(ht.family_entries.dtype.element_type), ) From ba2216820d1a31184a9ad34c8d0ee9d69fccaa85 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 20:54:14 -0400 Subject: [PATCH 09/11] format --- v03_pipeline/lib/misc/family_entries.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/misc/family_entries.py b/v03_pipeline/lib/misc/family_entries.py index 4c70a7214..4428970ae 100644 --- a/v03_pipeline/lib/misc/family_entries.py +++ b/v03_pipeline/lib/misc/family_entries.py @@ -110,7 +110,7 @@ def remove_family_guids( # however, experiments on dataproc showed this statement hanging with an empty # unevaluated indexes array. family_entries=hl.array(family_indexes_to_keep).map( - lambda i: ht.family_entries[i] + lambda i: ht.family_entries[i], ) if len(family_indexes_to_keep) > 0 else hl.empty_array(ht.family_entries.dtype.element_type), From 9ad3702d1b763cbc729f1ca512e5fa82b96eeb02 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 21:35:57 -0400 Subject: [PATCH 10/11] lookup --- v03_pipeline/lib/misc/lookup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/misc/lookup.py b/v03_pipeline/lib/misc/lookup.py index bdf9ae735..5aa6ad48d 100644 --- a/v03_pipeline/lib/misc/lookup.py +++ b/v03_pipeline/lib/misc/lookup.py @@ -143,8 +143,8 @@ def remove_project( ) ht = ht.filter(hl.any(ht.project_stats.map(hl.is_defined))) return ht.annotate_globals( - project_guids=project_indexes_to_keep.map( - lambda i: ht.project_guids[i], + project_guids = ht.project_guids.filter( + lambda p: p != project_guid, ), project_families=hl.dict( ht.project_families.items().filter(lambda item: item[0] != project_guid), From d23f441629653663ebe88a9e5e19980e92bd677d Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Thu, 13 Jun 2024 21:38:15 -0400 Subject: [PATCH 11/11] format --- v03_pipeline/lib/misc/lookup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/misc/lookup.py b/v03_pipeline/lib/misc/lookup.py index 5aa6ad48d..1c3884638 100644 --- a/v03_pipeline/lib/misc/lookup.py +++ b/v03_pipeline/lib/misc/lookup.py @@ -143,7 +143,7 @@ def remove_project( ) ht = ht.filter(hl.any(ht.project_stats.map(hl.is_defined))) return ht.annotate_globals( - project_guids = ht.project_guids.filter( + project_guids=ht.project_guids.filter( lambda p: p != project_guid, ), project_families=hl.dict(