Skip to content

Commit d68bdc4

Browse files
jklugherz and bpblanken authored
lookup sample_type migration (#867)
* add sampletype to family table file path * all sample types * missed 1 * project tables * explicit in family table delete * use enum * optional parameter but second DeleteProjectFamilyTablesTask test fails * oops * should work * cleaner * v3.1 and handle no project tables * missed one * paths test * add lookup migration for sample_type * old * no array * do not modify updates * comments * use param_kwargs * Update base_migration.py --------- Co-authored-by: Benjamin Blankenmeister <bblanken@broadinstitute.org>
1 parent a77e20c commit d68bdc4

9 files changed

+93
-9
lines changed

v03_pipeline/lib/migration/base_migration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@ class BaseMigration(ABC):
1212

1313
@staticmethod
1414
@abstractmethod
15-
def migrate(ht: hl.Table) -> hl.Table:
15+
def migrate(ht: hl.Table, **kwargs) -> hl.Table:
1616
pass

v03_pipeline/lib/tasks/base/base_migrate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def update_table(self, ht: hl.Table) -> hl.Table:
4444
if (
4545
(self.reference_genome, self.dataset_type)
4646
) in migration.reference_genome_dataset_types:
47-
ht = migration.migrate(ht)
47+
ht = migration.migrate(ht, **self.param_kwargs)
4848
return ht.annotate_globals(
4949
migrations=ht.globals.migrations.append(self.migration_name),
5050
)

v03_pipeline/lib/tasks/migrate_variant_annotations_table_test.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class MockMigration(BaseMigration):
1919
)
2020

2121
@staticmethod
22-
def migrate(ht: hl.Table) -> hl.Table:
22+
def migrate(ht: hl.Table, **_) -> hl.Table:
2323
ht = ht.annotate(
2424
variant_id_id=hl.format('%s_id', ht.variant_id),
2525
)
@@ -34,7 +34,7 @@ class MockMigration2(BaseMigration):
3434
)
3535

3636
@staticmethod
37-
def migrate(ht: hl.Table) -> hl.Table:
37+
def migrate(ht: hl.Table, **_) -> hl.Table:
3838
return ht.annotate_globals(mock_migration2='a second mock migration')
3939

4040

v03_pipeline/migrations/annotations/0001_add_migrations_global.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,5 @@ class AddMigrationsGlobals(BaseMigration):
1818
)
1919

2020
@staticmethod
21-
def migrate(ht: hl.Table) -> hl.Table:
21+
def migrate(ht: hl.Table, **_) -> hl.Table:
2222
return ht.annotate_globals(migrations=hl.empty_array(hl.tstr))

v03_pipeline/migrations/annotations/0002_add_remap_pedigree_hash.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class AddRemapPedigreeHash(BaseMigration):
1818
)
1919

2020
@staticmethod
21-
def migrate(ht: hl.Table) -> hl.Table:
21+
def migrate(ht: hl.Table, **_) -> hl.Table:
2222
return ht.annotate_globals(
2323
updates=ht.globals.updates.map(
2424
lambda u: u.annotate(remap_pedigree_hash=hl.missing(hl.tint32)),

v03_pipeline/migrations/annotations/0003_add_rg38_locus.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ class AddRG38Locus(BaseMigration):
1313
)
1414

1515
@staticmethod
16-
def migrate(ht: hl.Table) -> hl.Table:
16+
def migrate(ht: hl.Table, **_) -> hl.Table:
1717
return ht.annotate(
1818
rg38_locus=snv_indel.rg38_locus(ht, Env.GRCH37_TO_GRCH38_LIFTOVER_REF_PATH),
1919
)

v03_pipeline/migrations/lookup/0001_remove_null_families.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class RemoveNullFamilies(BaseMigration):
1616
)
1717

1818
@staticmethod
19-
def migrate(ht: hl.Table) -> hl.Table:
19+
def migrate(ht: hl.Table, **_) -> hl.Table:
2020
ht = ht.annotate(
2121
project_stats=ht.project_stats.map(
2222
lambda ps: hl.or_missing(hl.any(ps.map(hl.is_defined)), ps),

v03_pipeline/migrations/lookup/0002_add_remap_pedigree_hash.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ class AddRemapPedigreeHash(BaseMigration):
1616
)
1717

1818
@staticmethod
19-
def migrate(ht: hl.Table) -> hl.Table:
19+
def migrate(ht: hl.Table, **_) -> hl.Table:
2020
return ht.annotate_globals(
2121
updates=ht.globals.updates.map(
2222
lambda u: u.annotate(remap_pedigree_hash=hl.missing(hl.tint32)),
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import hail as hl
2+
import hailtop.fs as hfs
3+
4+
from v03_pipeline.lib.logger import get_logger
5+
from v03_pipeline.lib.migration.base_migration import BaseMigration
6+
from v03_pipeline.lib.model import DatasetType, ReferenceGenome, SampleType
7+
from v03_pipeline.lib.paths import project_table_path
8+
9+
# Module-level logger, obtained from the project's get_logger helper using this module's name.
logger = get_logger(__name__)
10+
11+
12+
class AddLookupSampleType(BaseMigration):
    """Lookup-table migration that keys project globals by (project_guid, sample_type).

    The sample type of each project is discovered by probing for the
    project's table on disk under every ``SampleType``.
    """

    # (reference genome, dataset type) combinations this migration applies to.
    reference_genome_dataset_types: frozenset[
        tuple[ReferenceGenome, DatasetType]
    ] = frozenset(
        (
            (ReferenceGenome.GRCh37, DatasetType.SNV_INDEL),
            (ReferenceGenome.GRCh38, DatasetType.SNV_INDEL),
            (ReferenceGenome.GRCh38, DatasetType.MITO),
        ),
    )

    @staticmethod
    def migrate(
        ht: hl.Table,
        reference_genome: ReferenceGenome,
        dataset_type: DatasetType,
        **_,
    ) -> hl.Table:
        """
        Renames project_guids to project_sample_types.
        Adds sample_type to global fields project_sample_types and project_families.
        Assumes that only one project_ht exists for each project across both sample types.

        Old Global fields:
            'project_guids': array<str>
            'project_families': dict<str, array<str>>
            'updates': set<struct {
                callset: str,
                project_guid: str,
                remap_pedigree_hash: int32
            }>
        New Global fields:
            'project_sample_types': array<tuple (
                str,
                str
            )>
            'project_families': dict<tuple (
                str,
                str
            ), array<str>>
            'updates': set<struct {
                callset: str,
                project_guid: str,
                remap_pedigree_hash: int32
            }>

        Args:
            ht: the lookup table to migrate.
            reference_genome: used to build each candidate project table path.
            dataset_type: used to build each candidate project table path.
            **_: any further task parameters; ignored.

        Returns:
            The table with the rewritten ``project_sample_types`` and
            ``project_families`` globals. ``updates`` is not modified.

        Raises:
            FileNotFoundError: if no project table exists for a project under
                any sample type, so its sample type cannot be determined.
            KeyError: if a project guid has no entry in ``project_families``.
        """
        ht = ht.transmute_globals(
            project_sample_types=ht.globals.project_guids,
        )
        collected_globals = ht.globals.collect()[0]
        # Work on copies so we never depend on the collected containers
        # being mutable.
        project_sample_types = list(collected_globals['project_sample_types'])
        project_families = dict(collected_globals['project_families'])

        for i, project_guid in enumerate(project_sample_types):
            for sample_type in SampleType:
                project_ht_path = project_table_path(
                    reference_genome,
                    dataset_type,
                    sample_type,
                    project_guid,
                )
                if not hfs.exists(project_ht_path):
                    continue

                # First sample type with a table on disk wins (see the
                # one-table-per-project assumption in the docstring).
                key = (project_guid, sample_type.value)
                project_sample_types[i] = key
                project_families[key] = project_families.pop(project_guid)
                break
            else:
                # Leaving a bare guid here would produce a heterogeneous
                # array (or a silently wrong schema), so fail loudly instead.
                msg = (
                    f'No project table found for project {project_guid} '
                    'under any sample type; cannot determine its sample type'
                )
                raise FileNotFoundError(msg)

        # Pass explicit types: Hail cannot impute the type of an empty
        # list/dict, which is what a lookup table without projects yields.
        project_key_type = hl.ttuple(hl.tstr, hl.tstr)
        families_value_type = ht.globals.project_families.dtype.value_type
        return ht.annotate_globals(
            project_sample_types=hl.literal(
                project_sample_types,
                hl.tarray(project_key_type),
            ),
            project_families=hl.literal(
                project_families,
                hl.tdict(project_key_type, families_value_type),
            ),
        )

0 commit comments

Comments
 (0)