Skip to content

Commit c719cd1

Browse files
committed
fix(event cache): properly clear all rooms, including those sleeping in the store backend
1 parent 42133a6 commit c719cd1

File tree

2 files changed

+153
-10
lines changed

2 files changed

+153
-10
lines changed

crates/matrix-sdk/src/event_cache/mod.rs

Lines changed: 64 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ use std::{
3535

3636
use eyeball::{SharedObservable, Subscriber};
3737
use eyeball_im::VectorDiff;
38+
use futures_util::future::{join_all, try_join_all};
3839
use matrix_sdk_base::{
3940
deserialized_responses::{AmbiguityChange, TimelineEvent},
4041
event_cache::store::{EventCacheStoreError, EventCacheStoreLock},
@@ -459,18 +460,73 @@ impl EventCacheInner {
459460

460461
/// Clears all the room's data.
461462
async fn clear_all_rooms(&self) -> Result<()> {
462-
// Note: one must NOT clear the `by_room` map, because if something subscribed
463-
// to a room update, they would never get any new update for that room, since
464-
// re-creating the `RoomEventCache` would create a new unrelated sender.
465-
466-
// Note 2: we don't need to clear the [`Self::events`] map, because events are
467-
// immutable in the Matrix protocol.
463+
// Okay, here's where things get complicated.
464+
//
465+
// On the one hand, `by_room` may include storage for *some* rooms that we know
466+
// about, but not *all* of them. Any room that hasn't been loaded in the
467+
// client, or touched by a sync, will remain unloaded in memory, so it
468+
// will be missing from `self.by_room`. As a result, we need to make
469+
// sure that we're hitting the storage backend to *really* clear all the
470+
// rooms, including those that haven't been loaded yet.
471+
//
472+
// On the other hand, one must NOT clear the `by_room` map, because if someone
473+
// subscribed to a room update, they would never get any new update for
474+
// that room, since re-creating the `RoomEventCache` would create a new,
475+
// unrelated sender.
476+
//
477+
// So we need to *keep* the rooms in `by_room` alive, while clearing them in the
478+
// store backend.
479+
//
480+
// As a result, for a short while, the in-memory linked chunks
481+
// will be desynchronized from the storage. We need to be careful then. During
482+
// that short while, we don't want *anyone* to touch the linked chunk
483+
// (be it in memory or in the storage).
484+
//
485+
// And since that requirement applies to *any* room in `by_room` at the same
486+
// time, we'll have to take the locks for *all* the live rooms, so as to
487+
// properly clear the underlying storage.
488+
//
489+
// At this point, you might be scared about the potential for deadlocking. I am
490+
// as well, but I'm convinced we're fine:
491+
// 1. the lock for `by_room` is usually held only for a short while, and
492+
// independently of the other two kinds.
493+
// 2. the state may acquire the store cross-process lock internally, but only
494+
// while the state's methods are called (so it's always transient). As a
495+
// result, as soon as we've acquired the state locks, the store lock ought to
496+
// be free.
497+
// 3. The store lock is held explicitly only in a small scoped area below.
498+
// 4. Then the store lock will be held internally when calling `reset()`, but at
499+
// this point it's only held for a short while each time, so rooms will take
500+
// turn to acquire it.
468501

469502
let rooms = self.by_room.write().await;
470-
for room in rooms.values() {
471-
room.clear().await?;
503+
504+
// Collect all the rooms' state locks, first: we can clear the storage only when
505+
// nobody will touch it at the same time.
506+
let room_locks = join_all(
507+
rooms.values().map(|room| async move { (room, room.inner.state.write().await) }),
508+
)
509+
.await;
510+
511+
// Clear the storage for all the rooms, using the storage facility.
512+
if let Some(store) = self.store.get() {
513+
let store_guard = store.lock().await?;
514+
store_guard.clear_all_rooms_chunks().await?;
472515
}
473516

517+
// At this point, all the in-memory linked chunks are desynchronized from the
518+
// storage. Resynchronize them manually by calling reset(), and
519+
// propagate updates to observers.
520+
try_join_all(room_locks.into_iter().map(|(room, mut state_guard)| async move {
521+
let updates_as_vector_diffs = state_guard.reset().await?;
522+
let _ = room.inner.sender.send(RoomEventCacheUpdate::UpdateTimelineEvents {
523+
diffs: updates_as_vector_diffs,
524+
origin: EventsOrigin::Cache,
525+
});
526+
Ok::<_, EventCacheError>(())
527+
}))
528+
.await?;
529+
474530
Ok(())
475531
}
476532

crates/matrix-sdk/tests/integration/event_cache.rs

Lines changed: 89 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,30 @@
1-
use std::{ops::Not, time::Duration};
1+
use std::{ops::Not, sync::Arc, time::Duration};
22

33
use assert_matches::assert_matches;
44
use assert_matches2::assert_let;
55
use eyeball_im::VectorDiff;
66
use futures_util::FutureExt;
7+
use imbl::Vector;
78
use matrix_sdk::{
89
assert_let_timeout, assert_next_matches_with_timeout,
910
deserialized_responses::TimelineEvent,
1011
event_cache::{
1112
BackPaginationOutcome, EventCacheError, RoomEventCacheUpdate, RoomPaginationStatus,
1213
},
1314
linked_chunk::{ChunkIdentifier, Position, Update},
15+
store::StoreConfig,
1416
test_utils::{
1517
assert_event_matches_msg,
1618
mocks::{MatrixMockServer, RoomMessagesResponseTemplate},
1719
},
1820
};
19-
use matrix_sdk_base::event_cache::Gap;
21+
use matrix_sdk_base::event_cache::{
22+
store::{EventCacheStore, MemoryStore},
23+
Gap,
24+
};
2025
use matrix_sdk_test::{
2126
async_test, event_factory::EventFactory, GlobalAccountDataTestEvent, JoinedRoomBuilder, ALICE,
27+
BOB,
2228
};
2329
use ruma::{
2430
event_id,
@@ -2540,3 +2546,84 @@ async fn test_dont_remove_only_gap() {
25402546
let outcome = room_event_cache.pagination().run_backwards_once(16).await.unwrap();
25412547
assert!(outcome.reached_start);
25422548
}
2549+
2550+
#[async_test]
2551+
async fn test_clear_all_rooms() {
2552+
let sleeping_room_id = room_id!("!dodo:saucisse.bzh");
2553+
let event_cache_store = Arc::new(MemoryStore::new());
2554+
2555+
let f = EventFactory::new().room(sleeping_room_id);
2556+
let ev0 = f.text_msg("hi").sender(*ALICE).event_id(event_id!("$ev0")).into_event();
2557+
2558+
// Feed the cache with one room with one event, before the client is created.
2559+
// This room will remain sleeping.
2560+
{
2561+
let cid = ChunkIdentifier::new(0);
2562+
event_cache_store
2563+
.handle_linked_chunk_updates(
2564+
sleeping_room_id,
2565+
vec![
2566+
Update::NewItemsChunk { previous: None, new: cid, next: None },
2567+
Update::PushItems { at: Position::new(cid, 0), items: vec![ev0] },
2568+
],
2569+
)
2570+
.await
2571+
.unwrap();
2572+
}
2573+
2574+
let server = MatrixMockServer::new().await;
2575+
let client = server
2576+
.client_builder()
2577+
.store_config(
2578+
StoreConfig::new("hodlor".to_owned()).event_cache_store(event_cache_store.clone()),
2579+
)
2580+
.build()
2581+
.await;
2582+
2583+
client.event_cache().subscribe().unwrap();
2584+
client.event_cache().enable_storage().unwrap();
2585+
2586+
// Another room gets a live event: it's loaded in the event cache now, while
2587+
// sleeping_room_id is not.
2588+
let room_id = room_id!("!galette:saucisse.bzh");
2589+
let room = server
2590+
.sync_room(
2591+
&client,
2592+
JoinedRoomBuilder::new(room_id).add_timeline_event(
2593+
f.text_msg("bonchourhan").sender(*BOB).event_id(event_id!("$ev1")),
2594+
),
2595+
)
2596+
.await;
2597+
2598+
let (room_event_cache, _drop_handles) = room.event_cache().await.unwrap();
2599+
let (initial, mut room_updates) = room_event_cache.subscribe().await;
2600+
2601+
let mut initial = Vector::from(initial);
2602+
// Wait for the ev1 event.
2603+
if initial.is_empty() {
2604+
assert_let_timeout!(
2605+
Ok(RoomEventCacheUpdate::UpdateTimelineEvents { diffs, .. }) = room_updates.recv()
2606+
);
2607+
assert_eq!(diffs.len(), 1);
2608+
assert_matches!(diffs[0], VectorDiff::Append { .. });
2609+
diffs[0].clone().apply(&mut initial);
2610+
}
2611+
// The room state now contains one event.
2612+
assert_eq!(initial.len(), 1);
2613+
assert_event_id!(initial[0], "$ev1");
2614+
2615+
// Now, clear all the rooms.
2616+
client.event_cache().clear_all_rooms().await.unwrap();
2617+
2618+
// We should get an update for the live room.
2619+
assert_let_timeout!(
2620+
Ok(RoomEventCacheUpdate::UpdateTimelineEvents { diffs, .. }) = room_updates.recv()
2621+
);
2622+
assert_eq!(diffs.len(), 1);
2623+
assert_let!(VectorDiff::Clear = &diffs[0]);
2624+
2625+
// The sleeping room should have been cleared too.
2626+
let (maybe_last_chunk, _chunk_id_gen) =
2627+
event_cache_store.load_last_chunk(sleeping_room_id).await.unwrap();
2628+
assert!(maybe_last_chunk.is_none());
2629+
}

0 commit comments

Comments (0)