Skip to content

Commit 254374e

Browse files
committed
feat: only new keys for annotations
1 parent b3b005e commit 254374e

File tree

2 files changed

+60
-14
lines changed

2 files changed

+60
-14
lines changed

v03_pipeline/lib/misc/clickhouse.py

Lines changed: 53 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -81,9 +81,9 @@ def key_field(self):
8181
@property
def join_condition(self):
    """Return the SQL predicate used to join ``src`` rows to ``dst`` rows.

    The source-side expression is wrapped in ``assumeNotNull(...)`` in both
    branches. For ``KEY_LOOKUP`` the join is on ``variantId``; for every
    other table it is on ``key``, with the source key cast via ``toUInt32``.
    """
    if self == ClickHouseTable.KEY_LOOKUP:
        return 'assumeNotNull(src.variantId) = dst.variantId'
    return 'assumeNotNull(toUInt32(src.key)) = dst.key'
8888

8989
@property
@@ -92,11 +92,32 @@ def select_fields(self):
9292

9393
@property
def insert(self) -> Callable:
    """Return the insert routine for this table, with the table pre-bound.

    The returned callable is a ``functools.partial`` of one of the
    module-level insert functions; callers supply the remaining arguments
    (reference genome, dataset type, run id, ...).

    Raises:
        KeyError: if this enum member has no entry in the dispatch mapping.
    """
    # The keyword used to bind the table MUST match the parameter name of
    # the target function. ``direct_insert_new_keys`` and
    # ``direct_insert_all_keys`` name it ``clickhouse_table``; only
    # ``atomic_entries_insert`` names it ``_clickhouse_table``. Because
    # ``direct_insert_all_keys`` also accepts ``**_``, binding the wrong
    # keyword would be silently swallowed and leave ``clickhouse_table``
    # unset, so the call would fail at invocation time.
    return {
        ClickHouseTable.ANNOTATIONS_DISK: functools.partial(
            direct_insert_new_keys,
            clickhouse_table=self,
        ),
        ClickHouseTable.ANNOTATIONS_MEMORY: functools.partial(
            direct_insert_new_keys,
            clickhouse_table=self,
        ),
        ClickHouseTable.CLINVAR: functools.partial(
            direct_insert_all_keys,
            clickhouse_table=self,
        ),
        ClickHouseTable.ENTRIES: functools.partial(
            atomic_entries_insert,
            _clickhouse_table=self,
        ),
        ClickHouseTable.KEY_LOOKUP: functools.partial(
            direct_insert_all_keys,
            clickhouse_table=self,
        ),
        ClickHouseTable.TRANSCRIPTS: functools.partial(
            direct_insert_all_keys,
            clickhouse_table=self,
        ),
    }[self]
100121

101122

102123
class ClickHouseDictionary(StrEnum):
@@ -414,7 +435,7 @@ def exchange_entity(
414435

415436

416437
@retry()
417-
def direct_insert(
438+
def direct_insert_new_keys(
418439
clickhouse_table: ClickHouseTable,
419440
reference_genome: ReferenceGenome,
420441
dataset_type: DatasetType,
@@ -456,6 +477,30 @@ def direct_insert(
456477
drop_staging_db()
457478

458479

480+
@retry()
def direct_insert_all_keys(
    clickhouse_table: ClickHouseTable,
    reference_genome: ReferenceGenome,
    dataset_type: DatasetType,
    run_id: str,
    **_,
) -> None:
    """Insert every row of the run's source table into the destination table.

    Issues a single ``INSERT INTO ... SELECT ... FROM ...`` with no join or
    filter against the destination, so all source rows are selected
    regardless of which keys the destination already holds.

    Args:
        clickhouse_table: Table whose source/destination names and
            ``select_fields`` are used to build the statement.
        reference_genome: Passed to ``TableNameBuilder``.
        dataset_type: Passed to ``TableNameBuilder``.
        run_id: Passed to ``TableNameBuilder``.
        **_: Extra keyword arguments are accepted and ignored.
    """
    # NOTE(review): wrapped in ``@retry()``; the retry policy is defined
    # elsewhere — confirm that re-running this statement is safe.
    names = TableNameBuilder(reference_genome, dataset_type, run_id)
    dst_table = names.dst_table(clickhouse_table)
    src_table = names.src_table(clickhouse_table)
    insert_select_sql = f"""
        INSERT INTO {dst_table}
        SELECT {clickhouse_table.select_fields}
        FROM {src_table}
        """
    logged_query(insert_select_sql)
502+
503+
459504
@retry()
460505
def atomic_entries_insert(
461506
_clickhouse_table: ClickHouseTable,

v03_pipeline/lib/misc/clickhouse_test.py

Lines changed: 7 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,8 @@
1515
create_staging_non_table_entities,
1616
create_staging_tables,
1717
delete_existing_families_from_staging_entries,
18-
direct_insert,
18+
direct_insert_all_keys,
19+
direct_insert_new_keys,
1920
exchange_entity,
2021
get_clickhouse_client,
2122
insert_new_entries,
@@ -234,7 +235,7 @@ def test_table_name_builder(self):
234235
"gcs('https://storage.googleapis.com/mock_bucket/v3.1/GRCh38/SNV_INDEL/runs/manual__2025-05-07T17-20-59.702114+00-00/new_entries.parquet/*.parquet', '', '', 'Parquet')",
235236
)
236237

237-
def test_direct_insert(self):
238+
def test_direct_insert_all_keys(self):
238239
client = get_clickhouse_client()
239240
df = pd.DataFrame({'key': [1, 2, 3, 4], 'transcripts': ['a', 'b', 'c', 'd']})
240241
table = pa.Table.from_pandas(df)
@@ -260,7 +261,7 @@ def test_direct_insert(self):
260261
f'INSERT INTO {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/transcripts` VALUES',
261262
[(1, 'a'), (10, 'b'), (7, 'c')],
262263
)
263-
direct_insert(
264+
direct_insert_all_keys(
264265
ClickHouseTable.TRANSCRIPTS,
265266
ReferenceGenome.GRCh38,
266267
DatasetType.SNV_INDEL,
@@ -275,7 +276,7 @@ def test_direct_insert(self):
275276
)
276277

277278
# ensure multiple calls are idempotent
278-
direct_insert(
279+
direct_insert_all_keys(
279280
ClickHouseTable.TRANSCRIPTS,
280281
ReferenceGenome.GRCh38,
281282
DatasetType.SNV_INDEL,
@@ -289,7 +290,7 @@ def test_direct_insert(self):
289290
[(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (7, 'c'), (10, 'b')],
290291
)
291292

292-
def test_direct_insert_key_lookup(self):
293+
def test_direct_insert_key_lookup_new_keys(self):
293294
client = get_clickhouse_client()
294295
df = pd.DataFrame(
295296
{
@@ -325,7 +326,7 @@ def test_direct_insert_key_lookup(self):
325326
f'INSERT INTO {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/key_lookup` VALUES',
326327
[('1-123-A-C', 1), ('2-234-C-T', 2), ('M-345-C-G', 3)],
327328
)
328-
direct_insert(
329+
direct_insert_new_keys(
329330
ClickHouseTable.KEY_LOOKUP,
330331
ReferenceGenome.GRCh38,
331332
DatasetType.SNV_INDEL,

0 commit comments

Comments
 (0)