|
17 | 17 | //! 2. [`CdcStream::into_stream`] returns a [`futures::Stream`] of [`CdcEvent`]s
|
18 | 18 | //! optionally from the [`Lsn`] returned in step 1.
|
19 | 19 | //!
|
20 | | -//! Internally we get a snapshot by setting our transaction isolation level to
21 | | -//! [`TransactionIsolationLevel::Snapshot`], getting the current maximum LSN with
22 | | -//! [`crate::inspect::get_max_lsn`] and then running a `SELECT *`. We've observed that by
23 | | -//! using [`TransactionIsolationLevel::Snapshot`] the LSN remains stable for the entire
24 | | -//! transaction.
| 20 | +//! The snapshot process is responsible for identifying a [`Lsn`] that provides
| 21 | +//! a point-in-time view of the data for the table(s) being copied. Similarly to
| 22 | +//! MySQL, Microsoft SQL Server does not, as far as we know, provide an API to
| 23 | +//! achieve this.
| 24 | +//!
| 25 | +//! SQL Server `SNAPSHOT` isolation provides guarantees that a reader will only
| 26 | +//! see writes committed before the transaction began. More specifically, this
| 27 | +//! snapshot is implemented using versions that are visible based on the
| 28 | +//! transaction sequence number (`XSN`). The `XSN` is set at the first
| 29 | +//! read or write, not at `BEGIN TRANSACTION`; see [here](https://learn.microsoft.com/en-us/sql/relational-databases/sql-server-transaction-locking-and-row-versioning-guide?view=sql-server-ver17).
| 30 | +//! This provides us with a suitable starting point for capturing the table data.
| 31 | +//!
| 32 | +//! Due to the asynchronous nature of CDC, we can assume that the [`Lsn`]
| 33 | +//! returned from any CDC tables or CDC functions will always be stale
| 34 | +//! in relation to the source table that CDC is tracking. The system table
| 35 | +//! [sys.dm_tran_database_transactions](https://learn.microsoft.com/en-us/sql/relational-databases/system-dynamic-management-views/sys-dm-tran-database-transactions-transact-sql?view=sql-server-ver17)
| 36 | +//! will contain a [`Lsn`] for any transaction that performs a write operation.
| 37 | +//! Creating a savepoint using [SAVE TRANSACTION](https://learn.microsoft.com/en-us/sql/t-sql/language-elements/save-transaction-transact-sql?view=sql-server-ver17)
| 38 | +//! is sufficient to generate a [`Lsn`] in this case, and will additionally
| 39 | +//! establish the `XSN` used to determine data visibility.
| 40 | +//!
| 41 | +//! Unfortunately, it isn't sufficient to just create a savepoint. There exists
| 42 | +//! the possibility that a write transaction may have inserted a row into one
| 43 | +//! of the tables in the snapshot, but that write has not committed. In this case,
| 44 | +//! the `INSERT` has already been written to the transaction log at a [`Lsn`]
| 45 | +//! less than the one captured, but the snapshot will *not* include that `INSERT`
| 46 | +//! because the transaction has not committed, and may not commit until after
| 47 | +//! the snapshot is complete. In order to force a clear delineation of updates,
| 48 | +//! the upstream tables in the snapshot must be locked. This lock only needs
| 49 | +//! to exist long enough to establish the [`Lsn`] and `XSN`.
| 50 | +//!
| 51 | +//! SQL Server supports exclusive table locks, but those will only be released
| 52 | +//! once the outermost transaction completes. For this reason, this module
| 53 | +//! uses two connections for the snapshot process. The first connection is used
| 54 | +//! to initiate a transaction and lock the upstream tables. While the first
| 55 | +//! connection maintains the locks, the second connection starts a transaction
| 56 | +//! with [`TransactionIsolationLevel::Snapshot`] isolation and creates a
| 57 | +//! savepoint. Once the savepoint is created and the [`Lsn`] is captured, the first
| 58 | +//! connection rolls back the transaction. The snapshot is created by the second
| 59 | +//! connection within the existing transaction.
25 | 60 | //!
|
26 | 61 | //! After completing the snapshot we use [`crate::inspect::get_changes_asc`] which will return
|
27 | 62 | //! all changes between a `[lower, upper)` bound of [`Lsn`]s.
|
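To make the ordering described above concrete, here is a condensed sketch of the two-connection flow. The helper names (`new_connection`, `transaction`, `lock_table_exclusive`, `set_transaction_isolation`, `create_savepoint`, `get_lsn`, `rollback`) are the ones used in the hunks below; `client` and `tables` stand in for the surrounding state, and error handling is elided, so read this as an outline rather than the exact implementation:

```rust
// Connection 1: fence writers by taking an exclusive lock on each upstream table.
let mut fencing_client = client.new_connection().await?;
let mut fence_txn = fencing_client.transaction().await?;
for (_capture_instance, schema, table) in &tables {
    fence_txn.lock_table_exclusive(&*schema, &*table).await?;
}

// Connection 2: a SNAPSHOT-isolation transaction. Creating a savepoint forces a
// write to the transaction log, pinning both the XSN (data visibility) and an
// LSN while the tables are quiesced.
client
    .set_transaction_isolation(TransactionIsolationLevel::Snapshot)
    .await?;
let mut txn = client.transaction().await?;
txn.create_savepoint(SAVEPOINT_NAME).await?;
let lsn = txn.get_lsn().await?;

// Connection 1: release the locks. Any write that proceeds now lands at an LSN
// greater than `lsn` and will be observed later through CDC.
fence_txn.rollback().await?;

// Connection 2: run the `SELECT *` snapshot inside the still-open transaction,
// then read subsequent changes in `[lsn, upper)` from the CDC functions.
```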
@@ -57,7 +92,7 @@ pub struct CdcStream<'a> {
|
57 | 92 | ///
|
58 | 93 | /// Note: When CDC is first enabled in an instance of SQL Server it can take a moment
|
59 | 94 | /// for it to "completely" startup. Before starting a `TRANSACTION` for our snapshot
|
60 | | - /// we'll wait this duration for SQL Server to report an LSN and thus indicate CDC is
| 95 | + /// we'll wait this duration for SQL Server to report a [`Lsn`] and thus indicate CDC is
61 | 96 | /// ready to go.
|
62 | 97 | max_lsn_wait: Duration,
|
63 | 98 | }
|
@@ -96,7 +131,7 @@ impl<'a> CdcStream<'a> {
|
96 | 131 | self
|
97 | 132 | }
|
98 | 133 |
|
99 | | - /// The max duration we'll wait for SQL Server to return an LSN before taking a
| 134 | + /// The max duration we'll wait for SQL Server to return a [`Lsn`] before taking a
100 | 135 | /// snapshot.
|
101 | 136 | ///
|
102 | 137 | /// When CDC is first enabled in SQL Server it can take a moment before it is fully
|
@@ -143,45 +178,39 @@ impl<'a> CdcStream<'a> {
|
143 | 178 | // the upstream DB is ready for CDC.
|
144 | 179 | self.wait_for_ready().await?;
|
145 | 180 |
|
146 | | - let mut fencing_client = self.client.new_connection().await?;
| 181 | + tracing::info!("Upstream is ready");
| 182 | +
147 | 183 | // The client that will be used for fencing does not need any special isolation level
148 | | - // as it will be just be locking the tables
| 184 | + // as it will just be locking the table(s).
| 185 | + let mut fencing_client = self.client.new_connection().await?;
149 | 186 | let mut fence_txn = fencing_client.transaction().await?;
|
150 | | - // lock all the tables we are planning to snapshot so that we can ensure that
151 | | - // writes that might be in progress are properly ordered before or after this snapshot
152 | | - // in addition to the LSN being properly ordered.
153 | | - // TODO (maz): we should considering a timeout here because we may lock some tables,
154 | | - // and the next table may be locked for some extended period, resulting in a traffic
155 | | - // jam.
| 187 | +
| 188 | + // TODO (maz): we should consider a timeout or a lock + snapshot per-table instead of collectively
156 | 189 | for (_capture_instance, schema, table) in &tables {
|
157 | 190 | tracing::trace!(%schema, %table, "locking table");
|
158 | | - crate::inspect::lock_table(&mut fence_txn, &*schema, &*table).await?;
| 191 | + fence_txn.lock_table_exclusive(&*schema, &*table).await?;
159 | 192 | }
|
| 193 | + tracing::info!(?tables, "Locked tables");
160 | 194 |
|
161 | 195 | self.client
|
162 | 196 | .set_transaction_isolation(TransactionIsolationLevel::Snapshot)
|
163 | 197 | .await?;
|
164 | 198 | let mut txn = self.client.transaction().await?;
|
165 | | - // The result here is not important, what we are doing is establishing a transaction sequence number (XSN)
166 | | - // while using SNAPSHOT isolation that will be concurrent with a quiesced set of tables that we
167 | | - // wish to snapshot. Regardless what you might read in *many* articles on Microsoft's site, the XSN is not
168 | | - // set at BEGIN TRANSACTION, but at the first read/write.
169 | | - // The choice of table is driven by a few factors:
170 | | - // - it's a system table, we know it must exist and it will have a well defined schema
171 | | - // - MZ is a CDC client, so should be be able to read from it
172 | | - let res = txn
173 | | - .simple_query("SELECT TOP 1 object_id FROM cdc.change_tables;")
174 | | - .await?;
175 | | - // TODO (maz): nicer error if, somehow, there are no more change tables
176 | | - assert_eq!(res.len(), 1);
177 | 199 |
|
178 | | - // Creating a savepoint forces a write to the transaction log, which will assign an LSN to this transaction
179 | | - // which is concurrent with the tables that are currently in a consistent state.
| 200 | + // Creating a savepoint forces a write to the transaction log, which will
| 201 | + // assign a LSN and a transaction sequence number (XSN).
| 202 | + // Because the tables are exclusively locked, any write operation has either
| 203 | + // completed or is blocked. The LSN and XSN acquired now represent a
| 204 | + // consistent point-in-time view, such that any committed write will be
| 205 | + // visible to this snapshot and the LSN of such a write will be less than
| 206 | + // or equal to the LSN captured here.
180 | 207 | txn.create_savepoint(SAVEPOINT_NAME).await?;
|
| 208 | + tracing::info!(%SAVEPOINT_NAME, "Created savepoint");
| 209 | + let lsn = txn.get_lsn().await?;
181 | 210 |
|
182 | | - let lsn = crate::inspect::get_lsn(&mut txn).await?;
183 | | -
184 | | - // once we have the snapshot, we can rollback the fencing transaction to allow access to the tables.
| 211 | + // Once the XSN is established and the LSN captured, the tables no longer
| 212 | + // need to be locked. Any writes that happen to the upstream tables
| 213 | + // will have a LSN higher than our captured LSN, and will be read from CDC.
185 | 214 | fence_txn.rollback().await?;
|
186 | 215 |
|
187 | 216 | tracing::info!(?tables, ?lsn, "starting snapshot");
|
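The module-level comment points at `sys.dm_tran_database_transactions` as the source of a transaction's [`Lsn`]. As an illustration of that DMV only, and not a claim about how `get_lsn` is actually implemented, a query along these lines can read the last LSN written by the current transaction once the savepoint exists; the column choice, and the conversion of its `numeric(25,0)` value into the 10-byte form used by this module, are assumptions and the conversion is elided:

```rust
// Hypothetical query against the DMV named in the module docs; reading it
// requires VIEW SERVER STATE. `simple_query` is the same helper used elsewhere
// in this module.
let rows = txn
    .simple_query(
        "SELECT database_transaction_last_lsn \
         FROM sys.dm_tran_database_transactions \
         WHERE transaction_id = CURRENT_TRANSACTION_ID() \
           AND database_id = DB_ID();",
    )
    .await?;
```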
@@ -392,7 +421,7 @@ impl<'a> CdcStream<'a> {
|
392 | 421 | }
|
393 | 422 | }
|
394 | 423 |
|
395 | | - // Ensure all of the capture instances are reporting an LSN.
| 424 | + // Ensure all of the capture instances are reporting a LSN.
396 | 425 | for instance in self.capture_instances.keys() {
|
397 | 426 | let (_client, min_result) = mz_ore::retry::Retry::default()
|
398 | 427 | .max_duration(self.max_lsn_wait)
|
@@ -508,7 +537,7 @@ pub struct Lsn {
|
508 | 537 | impl Lsn {
|
509 | 538 | const SIZE: usize = 10;
|
510 | 539 |
|
511 | | - /// Interpret the provided bytes as an [`Lsn`].
| 540 | + /// Interpret the provided bytes as a [`Lsn`].
512 | 541 | pub fn try_from_bytes(bytes: &[u8]) -> Result<Self, String> {
|
513 | 542 | if bytes.len() != Self::SIZE {
|
514 | 543 | return Err(format!("incorrect length, expected 10 got {}", bytes.len()));
|
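`try_from_bytes` expects exactly `Lsn::SIZE` (10) bytes. For orientation: SQL Server conventionally renders an LSN as three colon-separated parts, and the usual reading of the `binary(10)` form is a 4-byte virtual log file sequence number, a 4-byte log block offset, and a 2-byte record (slot) number, in big-endian order. Below is a small sketch under that assumption; the field names are illustrative and not taken from this module:

```rust
/// Illustrative decode of a 10-byte SQL Server LSN into its three conventional
/// parts, assuming a big-endian <vlf>:<block>:<record> layout.
fn decode_lsn(bytes: &[u8; 10]) -> (u32, u32, u16) {
    let vlf_id = u32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]);
    let block_id = u32::from_be_bytes([bytes[4], bytes[5], bytes[6], bytes[7]]);
    let record_id = u16::from_be_bytes([bytes[8], bytes[9]]);
    (vlf_id, block_id, record_id)
}
```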
@@ -660,7 +689,7 @@ impl timely::order::PartialOrder for Lsn {
|
660 | 689 | }
|
661 | 690 | impl timely::order::TotalOrder for Lsn {}
|
662 | 691 |
|
663 | | -/// Structured format of an [`Lsn`].
| 692 | +/// Structured format of a [`Lsn`].
664 | 693 | ///
|
665 | 694 | /// Note: The derived impl of [`PartialOrd`] and [`Ord`] relies on the field
|
666 | 695 | /// ordering so do not change it.
|
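The warning about field ordering exists because `#[derive(PartialOrd, Ord)]` compares struct fields lexicographically in declaration order. A minimal, standalone illustration (not this module's actual type):

```rust
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
struct Example {
    major: u32, // compared first
    minor: u16, // only consulted when `major` ties
}

fn main() {
    let a = Example { major: 1, minor: 9 };
    let b = Example { major: 2, minor: 0 };
    // `a < b` because `major` is declared first; reordering the fields would
    // change how values sort.
    assert!(a < b);
}
```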