 import enum
-import logging
 import functools
-from typing import (
-    Any,
-    Optional,
-    Dict,
-    Tuple,
-    Union,
-    TYPE_CHECKING,
-)
-
+import logging
 from abc import ABC
+from collections import defaultdict
+from typing import Any, Optional, Dict, Tuple, Union, TYPE_CHECKING, Set
 
-from quixstreams.state.exceptions import (
-    StateTransactionError,
-    InvalidChangelogOffset,
-)
+from quixstreams.state.exceptions import StateTransactionError, InvalidChangelogOffset
 from quixstreams.state.metadata import (
     CHANGELOG_CF_MESSAGE_HEADER,
     CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER,
     Undefined,
     DEFAULT_PREFIX,
 )
-from quixstreams.state.serialization import (
-    serialize,
-    deserialize,
-    LoadsFunc,
-    DumpsFunc,
-)
+from quixstreams.state.serialization import serialize, deserialize, LoadsFunc, DumpsFunc
 from quixstreams.utils.json import dumps as json_dumps
-
 from .state import State, TransactionState
 
 if TYPE_CHECKING:
     from quixstreams.state.recovery import ChangelogProducer
     from .partition import StorePartition
 
-__all__ = ("PartitionTransactionStatus", "PartitionTransaction", "CACHE_TYPE")
+__all__ = (
+    "PartitionTransactionStatus",
+    "PartitionTransaction",
+    "PartitionTransactionCache",
+)
 
 logger = logging.getLogger(__name__)
-CACHE_TYPE = Dict[str, Dict[bytes, Dict[bytes, Union[bytes, Undefined]]]]
+
+
+class PartitionTransactionCache:
+    """
+    A cache of the data updated in the current PartitionTransaction.
+    It provides read-your-own-writes semantics before the transaction is committed
+    to the Store.
+
+    Internally, updates and deletes are kept in two separate structures
+    to simplify querying over them.
+    """
+
+    def __init__(self):
+        # A map of updated keys in the format {<cf>: {<prefix>: {<key>: <value>}}}
+        # Note: "updates" are bucketed per prefix to speed up iterating over the
+        # specific set of keys when we merge updates with data from the stores.
+        # Using a prefix like that allows us to perform fewer iterations.
+        self._updated: dict[str, dict[bytes, dict[bytes, bytes]]] = defaultdict(
+            lambda: defaultdict(dict)
+        )
+        # A dict of sets with deleted keys in the format {<cf>: {<key1>, <key2>}}
+        # Deletes are stored without prefixes because we don't need to iterate over
+        # them.
+        self._deleted: dict[str, set[bytes]] = defaultdict(set)
+        self._empty = True
+
+    def get(
+        self,
+        key: bytes,
+        prefix: bytes,
+        cf_name: str = "default",
+    ) -> Union[bytes, Undefined]:
+        """
+        Get a value for the key.
+
+        Returns the key's value if it has been updated during the transaction.
+
+        If the key has already been deleted, returns the "DELETED" sentinel
+        (we don't need to check the actual store).
+        If the key is not present in the cache, returns the "UNDEFINED" sentinel
+        (we need to check the store).
+
+        :param key: key as bytes
+        :param prefix: key prefix as bytes
+        :param cf_name: column family name
+        """
+        # Check if the key has been deleted
+        if key in self._deleted[cf_name]:
+            # The key is deleted and the store doesn't need to be checked
+            return DELETED
+
+        # Check if the key has been updated.
+        # If the key is not present in the cache, we need to check the store and
+        # return UNDEFINED to signify that.
+        return self._updated[cf_name][prefix].get(key, UNDEFINED)
+
+    def set(self, key: bytes, value: bytes, prefix: bytes, cf_name: str = "default"):
+        """
+        Set a value for the key.
+
+        :param key: key as bytes
+        :param value: value as bytes
+        :param prefix: key prefix as bytes
+        :param cf_name: column family name
+        """
+        self._updated[cf_name][prefix][key] = value
+        self._deleted[cf_name].discard(key)
+        self._empty = False
+
+    def delete(self, key: Any, prefix: bytes, cf_name: str = "default"):
+        """
+        Delete a key.
+
+        :param key: key as bytes
+        :param prefix: key prefix as bytes
+        :param cf_name: column family name
+        """
+        self._updated[cf_name][prefix].pop(key, None)
+        self._deleted[cf_name].add(key)
+        self._empty = False
+
+    def is_empty(self) -> bool:
+        """
+        Return True if no changes (updates or deletes) have been made, otherwise
+        return False.
+        """
+        return self._empty
+
+    def get_column_families(self) -> Set[str]:
+        """
+        Get all updated column families.
+        """
+        return set(self._updated.keys()) | set(self._deleted.keys())
+
+    def get_updates(self, cf_name: str = "default") -> Dict[bytes, Dict[bytes, bytes]]:
+        """
+        Get all updated keys (excluding deleted)
+        in the format "{<prefix>: {<key>: <value>}}".
+
+        :param cf_name: column family name
+        """
+        return self._updated.get(cf_name, {})
+
+    def get_deletes(self, cf_name: str = "default") -> Set[bytes]:
+        """
+        Get all deleted keys (excluding updated) as a set.
+        """
+        return self._deleted[cf_name]
 
 
 class PartitionTransactionStatus(enum.Enum):
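
The new cache can be exercised on its own before looking at how the transaction uses it. A minimal usage sketch follows (not part of the diff); it assumes the class is importable as quixstreams.state.base.transaction.PartitionTransactionCache and that the DELETED and UNDEFINED sentinels live in quixstreams.state.metadata, as the surrounding imports suggest.

# Hypothetical usage sketch; the import paths are assumptions, not taken from the PR.
from quixstreams.state.base.transaction import PartitionTransactionCache
from quixstreams.state.metadata import DELETED, UNDEFINED

cache = PartitionTransactionCache()

# A key never touched in this transaction: the caller must fall back to the store.
assert cache.get(b"key", prefix=b"p1") is UNDEFINED

# After set(), the transaction reads its own write without hitting the store.
cache.set(b"key", b"value", prefix=b"p1")
assert cache.get(b"key", prefix=b"p1") == b"value"

# After delete(), the DELETED sentinel short-circuits the store lookup.
cache.delete(b"key", prefix=b"p1")
assert cache.get(b"key", prefix=b"p1") is DELETED

# The cache now carries changes for the "default" column family only.
assert not cache.is_empty()
assert cache.get_column_families() == {"default"}
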
@@ -97,7 +192,7 @@ def __init__(
         self._loads = loads
         self._partition = partition
 
-        self._update_cache: CACHE_TYPE = {}
+        self._update_cache = PartitionTransactionCache()
 
     @property
     def changelog_producer(self) -> Optional["ChangelogProducer"]:
@@ -197,14 +292,13 @@ def get(
         :param key: key
         :param prefix: a key prefix
         :param default: default value to return if the key is not found
+        :param cf_name: column family name
         :return: value or None if the key is not found and `default` is not provided
         """
         key_serialized = self._serialize_key(key, prefix=prefix)
 
-        cached = (
-            self._update_cache.get(cf_name, {})
-            .get(prefix, {})
-            .get(key_serialized, UNDEFINED)
+        cached = self._update_cache.get(
+            key=key_serialized, prefix=prefix, cf_name=cf_name
         )
         if cached is DELETED:
             return default
@@ -225,14 +319,18 @@ def set(self, key: Any, value: Any, prefix: bytes, cf_name: str = "default"):
         :param key: key
         :param prefix: a key prefix
         :param value: value
+        :param cf_name: column family name
         """
 
         try:
             key_serialized = self._serialize_key(key, prefix=prefix)
             value_serialized = self._serialize_value(value)
-            self._update_cache.setdefault(cf_name, {}).setdefault(prefix, {})[
-                key_serialized
-            ] = value_serialized
+            self._update_cache.set(
+                key=key_serialized,
+                value=value_serialized,
+                prefix=prefix,
+                cf_name=cf_name,
+            )
         except Exception:
             self._status = PartitionTransactionStatus.FAILED
             raise
@@ -245,12 +343,13 @@ def delete(self, key: Any, prefix: bytes, cf_name: str = "default"):
         This function always returns `None`, even if the value is not found.
         :param key: key
         :param prefix: a key prefix
+        :param cf_name: column family name
         """
         try:
             key_serialized = self._serialize_key(key, prefix=prefix)
-            self._update_cache.setdefault(cf_name, {}).setdefault(prefix, {})[
-                key_serialized
-            ] = DELETED
+            self._update_cache.delete(
+                key=key_serialized, prefix=prefix, cf_name=cf_name
+            )
         except Exception:
             self._status = PartitionTransactionStatus.FAILED
             raise
@@ -261,21 +360,19 @@ def exists(self, key: Any, prefix: bytes, cf_name: str = "default") -> bool:
         Check if the key exists in state.
         :param key: key
         :param prefix: a key prefix
+        :param cf_name: column family name
         :return: True if key exists, False otherwise
         """
         key_serialized = self._serialize_key(key, prefix=prefix)
-        cached = (
-            self._update_cache.get(cf_name, {})
-            .get(prefix, {})
-            .get(key_serialized, UNDEFINED)
+        cached = self._update_cache.get(
+            key=key_serialized, prefix=prefix, cf_name=cf_name
        )
         if cached is DELETED:
             return False
-
-        if cached is not UNDEFINED:
+        elif cached is not UNDEFINED:
             return True
-
-        return self._partition.exists(key_serialized, cf_name=cf_name)
+        else:
+            return self._partition.exists(key_serialized, cf_name=cf_name)
 
     @validate_transaction_status(PartitionTransactionStatus.STARTED)
     def prepare(self, processed_offset: int):
@@ -310,21 +407,32 @@ def _prepare(self, processed_offset: int):
             f"partition={self._changelog_producer.partition} "
             f"processed_offset={processed_offset}"
         )
-        for cf_name, cf_update_cache in self._update_cache.items():
-            source_tp_offset_header = json_dumps(processed_offset)
+        source_tp_offset_header = json_dumps(processed_offset)
+        column_families = self._update_cache.get_column_families()
+
+        for cf_name in column_families:
             headers = {
                 CHANGELOG_CF_MESSAGE_HEADER: cf_name,
                 CHANGELOG_PROCESSED_OFFSET_MESSAGE_HEADER: source_tp_offset_header,
             }
-            for _, prefix_update_cache in cf_update_cache.items():
+
+            updates = self._update_cache.get_updates(cf_name=cf_name)
+            for prefix_update_cache in updates.values():
                 for key, value in prefix_update_cache.items():
-                    # Produce changes to the changelog topic
                     self._changelog_producer.produce(
                         key=key,
-                        value=value if value is not DELETED else None,
+                        value=value,
                         headers=headers,
                     )
 
+            deletes = self._update_cache.get_deletes(cf_name=cf_name)
+            for key in deletes:
+                self._changelog_producer.produce(
+                    key=key,
+                    value=None,
+                    headers=headers,
+                )
+
     @validate_transaction_status(
         PartitionTransactionStatus.STARTED, PartitionTransactionStatus.PREPARED
     )
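
With updates and deletes kept separately in the cache, _prepare() above no longer encodes deletions as a DELETED sentinel inside a single map: updated keys are produced to the changelog with their serialized values, and deleted keys are produced with value=None, i.e. as tombstones. A hedged sketch of that pattern follows; produce() is a stand-in for ChangelogProducer.produce, the import path is assumed, and the headers dict is simplified (the real code attaches the column-family and processed-offset headers).

# Illustrative only; produce() and the headers are simplified stand-ins.
from typing import Optional

from quixstreams.state.base.transaction import PartitionTransactionCache


def produce(key: bytes, value: Optional[bytes], headers: dict) -> None:
    # Stand-in for ChangelogProducer.produce()
    print(key, value, headers)


cache = PartitionTransactionCache()
cache.set(b"k1", b"v1", prefix=b"p")
cache.delete(b"k2", prefix=b"p")

for cf_name in cache.get_column_families():
    headers = {"cf": cf_name}  # simplified; the real headers use the module's constants
    # Updated keys keep their serialized values...
    for prefix_updates in cache.get_updates(cf_name=cf_name).values():
        for key, value in prefix_updates.items():
            produce(key=key, value=value, headers=headers)
    # ...while deleted keys become tombstones (value=None).
    for key in cache.get_deletes(cf_name=cf_name):
        produce(key=key, value=None, headers=headers)
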
@@ -357,7 +465,7 @@ def flush(
             raise
 
     def _flush(self, processed_offset: Optional[int], changelog_offset: Optional[int]):
-        if not self._update_cache:
+        if self._update_cache.is_empty():
             return
 
         if changelog_offset is not None:
@@ -371,7 +479,7 @@ def _flush(self, processed_offset: Optional[int], changelog_offset: Optional[int]):
             )
 
         self._partition.write(
-            data=self._update_cache,
+            cache=self._update_cache,
             processed_offset=processed_offset,
             changelog_offset=changelog_offset,
         )