18
18
use std:: { collections:: BTreeSet , fmt, sync:: Mutex } ;
19
19
20
20
use growable_bloom_filter:: { GrowableBloom , GrowableBloomBuilder } ;
21
- use ruma:: OwnedEventId ;
21
+ use matrix_sdk_base:: event_cache:: store:: EventCacheStoreLock ;
22
+ use ruma:: { OwnedEventId , OwnedRoomId } ;
22
23
use tracing:: { debug, warn} ;
23
24
24
- use super :: room:: events:: { Event , RoomEvents } ;
25
+ use super :: {
26
+ room:: events:: { Event , RoomEvents } ,
27
+ EventCacheError ,
28
+ } ;
25
29
26
- /// `Deduplicator` is an efficient type to find duplicated events.
30
+ pub enum Deduplicator {
31
+ InMemory ( BloomFilterDeduplicator ) ,
32
+ PersistentStore ( StoreDeduplicator ) ,
33
+ }
34
+
35
+ impl Deduplicator {
36
+ /// Create an empty deduplicator instance that uses an internal Bloom
37
+ /// filter.
38
+ ///
39
+ /// Such a deduplicator is stateful, with no initial known events, and it
40
+ /// will learn over time, by using a Bloom filter, which events are
41
+ /// duplicates or not.
42
+ ///
43
+ /// When the persistent storage is enabled by default, this constructor
44
+ /// (and the associated variant) will be removed.
45
+ pub fn new_memory_based ( ) -> Self {
46
+ Self :: InMemory ( BloomFilterDeduplicator :: new ( ) )
47
+ }
48
+
49
+ /// Create a new store-based deduplicator that will run queries against the
50
+ /// store to find whether any event is duplicated or not.
51
+ ///
52
+ /// This deduplicator is stateless.
53
+ ///
54
+ /// When the persistent storage is enabled by default, this will become the
55
+ /// default, and [`Deduplicator`] will be replaced with
56
+ /// [`StoreDeduplicator`].
57
+ pub fn new_store_based ( room_id : OwnedRoomId , store : EventCacheStoreLock ) -> Self {
58
+ Self :: PersistentStore ( StoreDeduplicator { room_id, store } )
59
+ }
60
+
61
+ /// Find duplicates in the given collection of events, and return both
62
+ /// valid events (those with an event id) as well as the event ids of
63
+ /// duplicate events.
64
+ pub async fn filter_duplicate_events < I > (
65
+ & self ,
66
+ events : I ,
67
+ room_events : & RoomEvents ,
68
+ ) -> Result < ( Vec < Event > , Vec < OwnedEventId > ) , EventCacheError >
69
+ where
70
+ I : Iterator < Item = Event > ,
71
+ {
72
+ match self {
73
+ Deduplicator :: InMemory ( dedup) => Ok ( dedup. filter_duplicate_events ( events, room_events) ) ,
74
+ Deduplicator :: PersistentStore ( dedup) => dedup. filter_duplicate_events ( events) . await ,
75
+ }
76
+ }
77
+ }
78
+
79
+ /// A deduplication mechanism based on the persistent storage associated with the
80
+ /// event cache.
81
+ ///
82
+ /// It will use queries to the persistent storage to figure out whether events are
83
+ /// duplicates or not, making it entirely stateless.
84
+ pub struct StoreDeduplicator {
85
+ /// The room this deduplicator applies to.
86
+ room_id : OwnedRoomId ,
87
+ /// The actual event cache store implementation used to query events.
88
+ store : EventCacheStoreLock ,
89
+ }
90
+
91
+ impl StoreDeduplicator {
92
+ async fn filter_duplicate_events < I > (
93
+ & self ,
94
+ events : I ,
95
+ ) -> Result < ( Vec < Event > , Vec < OwnedEventId > ) , EventCacheError >
96
+ where
97
+ I : Iterator < Item = Event > ,
98
+ {
99
+ let store = self . store . lock ( ) . await ?;
100
+
101
+ // Collect event ids as we "validate" events (i.e. check they have a valid event
102
+ // id.)
103
+ let mut event_ids = Vec :: new ( ) ;
104
+ let events = events
105
+ . filter_map ( |event| {
106
+ if let Some ( event_id) = event. event_id ( ) {
107
+ event_ids. push ( event_id) ;
108
+ Some ( event)
109
+ } else {
110
+ None
111
+ }
112
+ } )
113
+ . collect :: < Vec < _ > > ( ) ;
114
+
115
+ // Let the store do its magic ✨
116
+ let duplicates = store. filter_duplicated_events ( & self . room_id , event_ids) . await ?;
117
+
118
+ Ok ( ( events, duplicates) )
119
+ }
120
+ }
121
+
122
+ /// `BloomFilterDeduplicator` is an efficient type to find duplicated events,
123
+ /// using an in-memory cache.
27
124
///
28
125
/// It uses a [bloom filter] to provide a memory efficient probabilistic answer
29
126
/// to: “has event E been seen already?”. False positives are possible, while
@@ -49,34 +146,18 @@ impl BloomFilterDeduplicator {
49
146
const DESIRED_FALSE_POSITIVE_RATE : f64 = 0.01 ;
50
147
51
148
/// Create a new `BloomFilterDeduplicator` with no prior knowledge of known events.
52
- #[ cfg( test) ]
53
- pub fn new ( ) -> Self {
54
- Self :: with_initial_events ( std:: iter:: empty ( ) )
55
- }
56
-
57
- /// Create a new `Deduplicator` filled with initial events.
58
- ///
59
- /// This won't detect duplicates in the initial events, only learn about
60
- /// those events.
61
- pub fn with_initial_events < ' a > ( events : impl Iterator < Item = & ' a Event > ) -> Self {
62
- let mut bloom_filter = GrowableBloomBuilder :: new ( )
149
+ fn new ( ) -> Self {
150
+ let bloom_filter = GrowableBloomBuilder :: new ( )
63
151
. estimated_insertions ( Self :: APPROXIMATED_MAXIMUM_NUMBER_OF_EVENTS )
64
152
. desired_error_ratio ( Self :: DESIRED_FALSE_POSITIVE_RATE )
65
153
. build ( ) ;
66
- for e in events {
67
- let Some ( event_id) = e. event_id ( ) else {
68
- warn ! ( "initial event in deduplicator had no event id" ) ;
69
- continue ;
70
- } ;
71
- bloom_filter. insert ( event_id) ;
72
- }
73
154
Self { bloom_filter : Mutex :: new ( bloom_filter) }
74
155
}
75
156
76
157
/// Find duplicates in the given collection of events, and return both
77
158
/// valid events (those with an event id) as well as the event ids of
78
159
/// duplicate events.
79
- pub fn filter_duplicate_events < ' a , I > (
160
+ fn filter_duplicate_events < ' a , I > (
80
161
& ' a self ,
81
162
events : I ,
82
163
room_events : & ' a RoomEvents ,
@@ -184,7 +265,7 @@ impl BloomFilterDeduplicator {
184
265
185
266
/// Information about the scanned collection of events.
186
267
#[ derive( Debug ) ]
187
- pub enum Decoration < I > {
268
+ enum Decoration < I > {
188
269
/// This event is not duplicated.
189
270
Unique ( I ) ,
190
271
0 commit comments